In [1]:
from typing import Any, Callable, Dict, List, Tuple, Union

import pandas as pd
import numpy as np

from fangorn.files_prep import get_data, data_to_pandas
from fangorn.preprocessing import splitting, feature_selection
from fangorn.training import classifiers


In [2]:
def run_ml(model: str, train_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Train and evaluate one baseline classifier and return its log dict.

    Parameters
    ----------
    model : str
        Short key of the classifier to run. Supported keys are
        'xgb', 'lgbm', 'rf' and 'logit'.
    train_dict : Dict[str, Any]
        Split data. The entries read here are train_dict['train']['X'],
        train_dict['train']['y'], train_dict['test']['X'] and
        train_dict['test']['y']. The test X and y must expose `.columns`:
        the feature names come from the test X columns and the target name
        is the first column of the test y.

    Returns
    -------
    Dict[str, Any]
        Whatever the selected function of the `classifiers` module returns.

    Raises
    ------
    ValueError
        If `model` is not one of the supported keys.
    """
    # Model key -> name of the training function in the `classifiers` module.
    # Names are resolved lazily so an unsupported key is rejected before any
    # training code is touched.
    trainer_names = {
        'xgb': 'xgb_classifier',
        'lgbm': 'lgbm_classifier',
        'rf': 'random_forest_classifier',
        'logit': 'logistic_regression_classifier',
    }
    if model not in trainer_names:
        supported = ', '.join(sorted(trainer_names))
        raise ValueError(f"Unknown model {model!r}; expected one of: {supported}")

    trainer = getattr(classifiers, trainer_names[model])

    train_split = train_dict['train']
    test_split = train_dict['test']

    # Every classifier is called with the exact same keyword arguments.
    return trainer(
        train_set=[train_split['X'], train_split['y']],
        test_set=[test_split['X'], test_split['y']],
        features=test_split['X'].columns,
        target=test_split['y'].columns[0],
        test_metrics=['acc', 'precision', 'recall', 'f1', 'auc'],
    )

def generate_report(dataset:str, model: str, train_dict: Dict[str, Any], resulted_classifier) -> pd.DataFrame:
    """
    Build a one-row report frame for a single (dataset, model) run.

    Parameters
    ----------
    dataset : str
        Dataset identifier; stored in the 'dataset' column.
    model : str
        Model key; stored in the 'model' column with a '_baseline' suffix.
    train_dict : Dict[str, Any]
        Split data. Only train_dict['train']['X'].shape and
        train_dict['test']['X'].shape are read.
    resulted_classifier
        Mapping with a 'time_elapsed' entry (holding 'fit' and 'predict'
        timings) and a 'calc_metrics' entry (metric name -> value).

    Returns
    -------
    pd.DataFrame
        Columns, in order: dataset, model, train_shape_rows,
        train_shape_cols, fit_time, test_shape_rows, test_shape_cols,
        predict_time, followed by one column per entry of 'calc_metrics'.
    """
    train_X = train_dict['train']['X']
    test_X = train_dict['test']['X']
    timings = resulted_classifier['time_elapsed']

    df_report = pd.DataFrame({
        'dataset': [f'{dataset}'],
        'model': [f'{model}_baseline'],
        'train_shape_rows': [train_X.shape[0]],
        'train_shape_cols': [train_X.shape[1]],
        # Timings are kept with three decimal places.
        'fit_time': [float('%.3f' % timings['fit'])],
        'test_shape_rows': [test_X.shape[0]],
        # Column count of the TEST split (previously read from the train split).
        'test_shape_cols': [test_X.shape[1]],
        'predict_time': [float('%.3f' % timings['predict'])],
    })

    # metrics report: one extra column per computed metric
    for key, value in resulted_classifier['calc_metrics'].items():
        df_report[key] = value

    return df_report

In [None]:
def run_holdout_baseline() -> pd.DataFrame:
    """
    Run every baseline model on every 'ml_challenge' dataset with a single
    hold-out split and collect the per-run reports.

    For each dataset returned by `get_data.get_all_data(only='ml_challenge')`
    the data is loaded, split once with
    `splitting.simple_train_test_val_split`, and each of the models
    'xgb', 'lgbm', 'rf' and 'logit' is trained through `run_ml`.

    Returns
    -------
    pd.DataFrame
        One row per (dataset, model) pair as produced by `generate_report`;
        an empty frame when there are no datasets.
    """
    all_datasets = get_data.get_all_data(only='ml_challenge')
    all_models = ['xgb', 'lgbm', 'rf', 'logit']
    # Collect the one-row reports and concatenate once at the end:
    # DataFrame.append copies the whole frame on every call and no longer
    # exists in pandas >= 2.0.
    reports = []

    for dataset in all_datasets:
        print(f"working in {dataset}")
        X_all, y_all = data_to_pandas.read_prepare_data(dataset)
        train_dict = splitting.simple_train_test_val_split(X_all, y_all)
        for model in all_models:
            print(f"\t working in {model}")
            resulted_classifier = run_ml(model, train_dict)
            # creating a dataframe with results
            reports.append(generate_report(dataset, model, train_dict, resulted_classifier))

    # pd.concat raises on an empty list, so keep the "no datasets" case explicit.
    if not reports:
        return pd.DataFrame()
    return pd.concat(reports)

# Run the hold-out baseline and write the resulting frame to
# 'baseline_classifier.xlsx' (the index is not written).
run_holdout_baseline().to_excel('baseline_classifier.xlsx', index=False)

All ML_CHALLENGE files ready!
working in christine
	 working in xgb
	 working in lgbm
	 working in rf
	 working in logit


In [3]:
def run_kfold_baseline(select_features: bool = False) -> pd.DataFrame:
    """
    Run the tree-based baseline models on every 'ml_challenge' dataset using
    stratified k-fold splits and report the fold-averaged results.

    Parameters
    ----------
    select_features : bool, default False
        When True, `feature_selection.extra_trees_feature_selection` is
        applied to the full feature matrix before the folds are created,
        and the old/new feature counts are printed.

    Returns
    -------
    pd.DataFrame
        One row per (dataset, model) pair holding the mean over folds of
        every column produced by `generate_report`; an empty frame when
        there are no datasets.
    """
    all_datasets = get_data.get_all_data(only='ml_challenge')
    all_models = ['xgb', 'lgbm', 'rf']
    # Collect per-(dataset, model) averages and concatenate once at the end:
    # DataFrame.append copies the whole frame on every call and no longer
    # exists in pandas >= 2.0.
    averaged_reports = []

    for dataset in all_datasets:
        print(f"working in {dataset}")
        X_all, y_all = data_to_pandas.read_prepare_data(dataset)
        if select_features:
            print("Simple feature selection")
            X_all_old_shape = X_all.shape[1]
            X_all = feature_selection.extra_trees_feature_selection(X_all, y_all)
            print(f"Feature space (old x new): {X_all_old_shape} x {X_all.shape[1]}")
        all_folds = splitting.stratified_kfold_train_test_split(X_all, y_all)
        for model in all_models:
            print(f"\t working in {model}")
            fold_results = []
            for fold in all_folds:
                # for each fold: train the model and build its report
                tmp_resulted_classifier = run_ml(model, fold)
                fold_results.append(generate_report(dataset, model, fold, tmp_resulted_classifier))
            # average the fold reports and keep the row for the final baseline frame
            df_tmp = pd.concat(fold_results).groupby(['dataset', 'model'], as_index=False).mean()
            averaged_reports.append(df_tmp)

    # pd.concat raises on an empty list, so keep the "no datasets" case explicit.
    if not averaged_reports:
        return pd.DataFrame()
    return pd.concat(averaged_reports)

In [7]:
# Run the k-fold baseline without feature selection and write the averaged
# results to an Excel file (the index is not written).
# A stray quote inside the filename literal made this line a SyntaxError.
run_kfold_baseline(select_features=False).to_excel('baseline_classifier_kfold_.xlsx', index=False)

All ML_CHALLENGE files ready!
working in christine
	 working in xgb
	 working in lgbm
	 working in rf
working in jasmine
	 working in xgb
	 working in lgbm
	 working in rf
working in philippine
	 working in xgb
	 working in lgbm
	 working in rf
working in madeline
	 working in xgb
	 working in lgbm
	 working in rf
working in sylvine
	 working in xgb
	 working in lgbm
	 working in rf


In [5]:
# Run the k-fold baseline with the extra-trees feature selection step enabled
# and write the averaged results to a separate Excel file (index not written).
run_kfold_baseline(select_features=True).to_excel('baseline_classifier_kfold_SIMPLE_ET_FS.xlsx', index=False)

All ML_CHALLENGE files ready!
working in christine
Simple feature selection
Feature space (old x new): 1636 x 462
	 working in xgb
	 working in lgbm
	 working in rf
working in jasmine
Simple feature selection
Feature space (old x new): 144 x 50
	 working in xgb
	 working in lgbm
	 working in rf
working in philippine
Simple feature selection
Feature space (old x new): 308 x 68
	 working in xgb
	 working in lgbm
	 working in rf
working in madeline
Simple feature selection
Feature space (old x new): 259 x 32
	 working in xgb
	 working in lgbm
	 working in rf
working in sylvine
Simple feature selection
Feature space (old x new): 20 x 3
	 working in xgb
	 working in lgbm
	 working in rf
