In [1]:
from typing import Any, Callable, Dict, List, Tuple, Union

import pandas as pd
import numpy as np

from fangorn.files_prep import get_data, data_to_pandas
from fangorn.preprocessing import splitting
from fangorn.training import classifiers

In [4]:
def run_ml(model: str, train_dict: Dict[str, Any]) ->  Dict[str, Any]:
    """
    Train and evaluate one baseline classifier and return its log dict.

    Parameters
    ----------
    model : str
        Short model identifier: 'xgb', 'lgbm', 'rf' or 'logit'.
    train_dict : Dict[str, Any]
        Split data. Must provide ``train_dict['train']['X']``,
        ``train_dict['train']['y']``, ``train_dict['test']['X']`` and
        ``train_dict['test']['y']``. The test ``X`` and ``y`` objects must
        expose ``.columns`` (presumably pandas DataFrames -- confirm against
        the ``splitting`` module).

    Returns
    -------
    Dict[str, Any]
        Whatever the selected ``classifiers.*`` trainer returns, unchanged.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported identifiers. Previously this
        surfaced as an UnboundLocalError on the final ``return``.
    """
    # Every supported model goes through the same call; only the trainer
    # looked up on the `classifiers` module differs.
    trainer_names = {
        'xgb': 'xgb_classifier',
        'lgbm': 'lgbm_classifier',
        'rf': 'random_forest_classifier',
        'logit': 'logistic_regression_classifier',
    }
    if model not in trainer_names:
        raise ValueError(
            f"Unsupported model {model!r}; expected one of {sorted(trainer_names)}"
        )

    # Resolve the trainer only after validation, so a bad model name fails
    # fast without touching the training code.
    trainer = getattr(classifiers, trainer_names[model])

    return trainer(
        train_set=[train_dict['train']['X'], train_dict['train']['y']],
        test_set=[train_dict['test']['X'], train_dict['test']['y']],
        # Feature names and the target name are read from the test split,
        # exactly as before.
        features=train_dict['test']['X'].columns,
        target=train_dict['test']['y'].columns[0],
        test_metrics=['acc', 'precision', 'recall', 'f1', 'auc'],
    )

In [3]:
def generate_report(dataset:str, model: str, train_dict: Dict[str, Any], resulted_classifier) -> pd.DataFrame:
    """
    Build a one-row summary frame for a single dataset/model run.

    The row holds the dataset name, the model name suffixed with
    ``_baseline``, the shapes of the train and test feature sets, the fit and
    predict timings formatted to three decimals (as strings), and one column
    per entry of ``resulted_classifier['calc_metrics']``.
    """
    # Fixed columns first, in the order they appear in the report.
    summary = pd.DataFrame({
        'dataset': [f'{dataset}'],
        'model': [f'{model}_baseline'],
        'train_shape': [train_dict['train']['X'].shape],
        'fit_time': ['%.3f' % resulted_classifier['time_elapsed']['fit']],
        'test_shape': [train_dict['test']['X'].shape],
        'predict_time': ['%.3f' % resulted_classifier['time_elapsed']['predict']],
    })

    # Then one column per computed metric, keeping the metrics dict's order.
    for metric_name, metric_value in resulted_classifier['calc_metrics'].items():
        summary[metric_name] = metric_value

    return summary

In [5]:
# Single dataset/model pair for a one-off run before the full benchmark loop.
# These names are notebook-level state read by the following cells.
dataset='christine'
model = 'logit'
# Load the features and target for the chosen dataset.
X_all, y_all = data_to_pandas.read_prepare_data(dataset)
# Split the data; run_ml reads train_dict['train'] and train_dict['test'],
# each with 'X' and 'y' entries (any further keys are not used in this notebook).
train_dict = splitting.simple_train_test_val_split(X_all, y_all)

In [6]:
# Fit and evaluate the model selected above on the prepared split.
resulted_classifier = run_ml(model, train_dict)



In [7]:
# Display the metrics computed for the single run above.
resulted_classifier['calc_metrics']

{'acc': 0.6171586715867159,
 'precision': 0.617022073341776,
 'recall': 0.6171916283414529,
 'f1': 0.6081208687440982,
 'auc': 0.6671382741892045}

In [15]:
# Datasets to benchmark and the baseline models fitted on each of them.
all_datasets = get_data.get_all_data(only='ml_challenge')
all_models = ['xgb', 'lgbm', 'rf']

# Collect one single-row report per (dataset, model) run and concatenate once
# at the end. DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop copies it on every iteration.
reports = []

for dataset in all_datasets:
    print(f"working in {dataset}")
    # Load and split once per dataset; every model reuses the same split.
    X_all, y_all = data_to_pandas.read_prepare_data(dataset)
    train_dict = splitting.simple_train_test_val_split(X_all, y_all)
    for model in all_models:
        print(f"\t working in {model}")
        resulted_classifier = run_ml(model, train_dict)
        # One-row dataframe with the results of this run.
        reports.append(generate_report(dataset, model, train_dict, resulted_classifier))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# datasets were found. ignore_index gives the combined report a clean 0..n-1
# index instead of repeating label 0 for every row.
baseline_df = pd.concat(reports, ignore_index=True) if reports else pd.DataFrame()

<configparser.ConfigParser object at 0x7f234a615710>
All ML_CHALLENGE files ready!
working in christine
	 working in xgb
	 working in lgbm
	 working in rf
working in jasmine
	 working in xgb
	 working in lgbm
	 working in rf
working in philippine
	 working in xgb
	 working in lgbm
	 working in rf
working in madeline
	 working in xgb
	 working in lgbm
	 working in rf
working in sylvine
	 working in xgb
	 working in lgbm
	 working in rf


In [17]:
# Write the collected baseline report to an Excel file (relative path, so it
# lands in the current working directory), leaving out the index column.
baseline_df.to_excel('baseline_classifier.xlsx', index=False)