In [41]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, KBinsDiscretizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, balanced_accuracy_score
import pickle

In [2]:
import os

# Warning policy exported through the environment. The commented-out line is
# the alternative that silences UserWarning; the active line keeps Python's
# default warning filter.
# NOTE(review): PYTHONWARNINGS is normally read at interpreter start-up, so
# setting it here presumably only affects processes launched afterwards
# (e.g. parallel workers) - confirm this is the intent.
# os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"
os.environ["PYTHONWARNINGS"] = "default"

## Data preprocessing

In [3]:
from enoe import load_enoe

# Load the survey table through the project helper.
enoe = load_enoe()

# 'informal' is the prediction target; every remaining column is a feature.
X, y = enoe.drop(columns='informal'), enoe['informal']

In [4]:
# Base rate: share of the most frequent class in y, i.e. the accuracy obtained
# by always predicting the majority class.
y.value_counts().max()/len(y)

# Any model must reach an accuracy significantly better than this base rate (about 62%).

0.6241807519834426

## Hyperparameter tuning

In [5]:
from models_grids import models_LR, models_RF, models_GB, models_SVC, models_KNN

In [15]:
# Merge the per-family model grids into one mapping keyed by model name.
# Each value is later read as {'model': estimator, 'params': param_grid}.
# NOTE(review): assuming these are plain dicts, a key repeated across families
# would be silently overwritten by the right-most operand - confirm keys are unique.
models = models_LR | models_RF | models_GB | models_SVC | models_KNN

In [None]:
%%time

# Nested cross-validation: the outer splitter carves out a held-out part per
# fold, and the inner splitter is handed to GridSearchCV to tune every candidate
# model on the remaining training part only.
cv_inner = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# outer_folds[fold][model_key] -> fitted GridSearchCV object
outer_folds = {}
test_score = []  # not populated in this cell
for fold, (train_index, test_index) in enumerate(cv_outer.split(X, y)):
    print(f"Running outer fold {fold}")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    grids = {}
    for key, spec in models.items():
        print(f"Running GridSearchCV for {key}")

        model, param_grid = spec['model'], spec['params']

        # Only the training part of the outer fold is seen by the search;
        # X_test / y_test are not used inside this loop.
        gs = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=cv_inner,
            n_jobs=50,
            scoring='balanced_accuracy',
            verbose=1,
        )
        gs.fit(X_train, y_train)

        grids[key] = gs
    outer_folds[fold] = grids

In [42]:
# Flatten every fitted grid search into one long table: one row per
# (outer fold, model key, hyperparameter combination).
df_list = []
for fold, grids in outer_folds.items():
    for model, grid in grids.items():
        fold_results = pd.DataFrame(grid.cv_results_)
        df_list.append(fold_results.assign(model=model, fold=fold))

cv_results = pd.concat(df_list, ignore_index=True)

# Model keys look like 'LR_ed_scl'; the text before the first '_' is the model family.
cv_results['model_type'] = cv_results['model'].str.split('_').str[0]

# Persist the table so the results can be reloaded later.
cv_results.to_pickle('cv_results.pkl')

In [39]:
# Compact view of the search results: hide the timing columns and the
# per-split scores, keeping the parameters and the aggregated mean/std/rank.
# The per-split columns are selected by pattern rather than listed by hand, so
# the view stays correct if the number of inner CV splits changes.
cv_results.drop(
    columns=[
        'mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
        *cv_results.filter(regex=r'^split\d+_test_score$').columns,
    ]
)

Unnamed: 0,param_classifier__C,params,mean_test_score,std_test_score,rank_test_score,model,fold,param_preprocessor__discretizer__n_bins,param_classifier__max_features,param_classifier__max_leaf_nodes,param_classifier__learning_rate,param_classifier__gamma,param_classifier__n_neighbors,model_type
0,0.0001,{'classifier__C': 0.0001},0.500000,0.000000,10,LR_ed_scl,0,,,,,,,LR
1,0.000774,{'classifier__C': 0.000774263682681127},0.593621,0.015833,9,LR_ed_scl,0,,,,,,,LR
2,0.005995,{'classifier__C': 0.005994842503189409},0.714600,0.019264,8,LR_ed_scl,0,,,,,,,LR
3,0.046416,{'classifier__C': 0.046415888336127774},0.763145,0.024814,7,LR_ed_scl,0,,,,,,,LR
4,0.359381,{'classifier__C': 0.3593813663804626},0.768480,0.026823,6,LR_ed_scl,0,,,,,,,LR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5895,,{'classifier__n_neighbors': 6},0.732306,0.017844,6,KNN_ed_cnss,9,,,,,,6,KNN
5896,,{'classifier__n_neighbors': 7},0.745510,0.015289,2,KNN_ed_cnss,9,,,,,,7,KNN
5897,,{'classifier__n_neighbors': 8},0.733797,0.020343,4,KNN_ed_cnss,9,,,,,,8,KNN
5898,,{'classifier__n_neighbors': 9},0.748944,0.016279,1,KNN_ed_cnss,9,,,,,,9,KNN


In [73]:
# Score best model by type for each fold
idx = cv_results.groupby(['model_type', 'fold']).mean_test_score.idxmax()
best_models = cv_results.loc[idx][['fold', 'model_type', 'model', 'params', 'mean_test_score', 'std_test_score']].set_index(['fold', 'model_type']).sort_index()
best_models = best_models.assign(accuracy=0.0, balanced_accuracy=0.0)
best_models

Unnamed: 0_level_0,Unnamed: 1_level_0,model,params,mean_test_score,std_test_score,accuracy,balanced_accuracy
fold,model_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,GB,GB_ed_num,{'classifier__learning_rate': 0.04641588833612...,0.788274,0.021098,0.0,0.0
0,KNN,KNN_ed_cnss,{'classifier__n_neighbors': 9},0.757215,0.021432,0.0,0.0
0,LR,LR_ed_cnss,{'classifier__C': 21.54434690031882},0.783478,0.026902,0.0,0.0
0,RF,RF_ed_num,"{'classifier__max_features': 3, 'classifier__m...",0.784395,0.021398,0.0,0.0
0,SVC,SVC_ed_cnss,"{'classifier__C': 0.3593813663804626, 'classif...",0.786603,0.021676,0.0,0.0
1,GB,GB_ed_num,{'classifier__learning_rate': 0.04641588833612...,0.78354,0.011128,0.0,0.0
1,KNN,KNN_ed_cnss,{'classifier__n_neighbors': 9},0.744357,0.009049,0.0,0.0
1,LR,LR_ed_cnss,{'classifier__C': 166.81005372000558},0.77451,0.008853,0.0,0.0
1,RF,RF_ed_num,"{'classifier__max_features': 3, 'classifier__m...",0.781745,0.009364,0.0,0.0
1,SVC,SVC_ed_cnss,"{'classifier__C': 21.54434690031882, 'classifi...",0.783774,0.012044,0.0,0.0


In [58]:
# Scratch check: walk every (fold, model_type) row of best_models and look up
# the fitted search object stored under that fold and model key.
# Nothing is accumulated here; after the loop only the last iteration's
# `fold`, `model_type`, `row` and `model` remain bound, and the following cell
# reads `fold`.
for (fold, model_type), row in best_models.iterrows():
    model = outer_folds[fold][row.model]

In [59]:
# Winning model key per model type for a single outer fold.
# NOTE(review): `fold` is not assigned in this cell; it is whatever value an
# earlier loop left behind, so the result depends on execution order.
best_models.loc[fold].model

model_type
GB       GB_ed_num
KNN    KNN_ed_cnss
LR      LR_ed_cnss
RF      RF_ed_cnss
SVC    SVC_ed_cnss
Name: model, dtype: object

In [63]:
# Smoke check that a stored search object can predict; the trailing ';'
# suppresses the cell output.
# NOTE(review): `X_test` is only assigned inside the fold loops in the visible
# cells, so this uses whichever fold's test split was bound last - confirm it
# matches outer fold 9.
outer_folds[9]['KNN_ed_cnss'].predict(X_test);

In [74]:
# Score each fold's winning configurations on that fold's held-out data.
# The splits are regenerated from cv_outer; this assumes cv_outer, X and y are
# unchanged since the tuning cell so the folds line up with outer_folds.
for fold, (train_index, test_index) in enumerate(cv_outer.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # best_models.loc[fold].model is indexed by model_type, which is the prefix
    # of the model key before the first '_'.
    for model_type, model in best_models.loc[fold].model.items():
        y_pred = outer_folds[fold][model].predict(X_test)
        scores = accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred)
        best_models.loc[(fold, model_type), ['accuracy', 'balanced_accuracy']] = scores

In [81]:
# Collapse the per-fold scores into mean and standard deviation per model type.
# NOTE(review): this rebinds `best_models` from the per-fold table to the
# summary table, so the scoring cell cannot be re-run afterwards without first
# rebuilding the per-fold table.
best_models = (
    best_models
    .groupby('model_type')[['balanced_accuracy', 'accuracy']]
    .agg(['mean', 'std'])
)
best_models

Unnamed: 0_level_0,balanced_accuracy,balanced_accuracy,accuracy,accuracy
Unnamed: 0_level_1,mean,std,mean,std
model_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
GB,0.781845,0.019323,0.805278,0.018363
KNN,0.745368,0.016933,0.773887,0.0159
LR,0.777862,0.017539,0.80407,0.016469
RF,0.777117,0.019864,0.801658,0.019093
SVC,0.780146,0.019625,0.804072,0.019313


In [82]:
# Write the current `best_models` table to disk. When the cells are run in
# order this is the per-model-type mean/std summary, not the per-fold table.
best_models.to_csv('best_models.csv')

## Load OD survey

In [7]:
# Load the cleaned origin-destination (OD) survey export.
# low_memory=False makes pandas read the file in one pass instead of chunks,
# which presumably avoids mixed-dtype columns - TODO confirm this was the motivation.
od = pd.read_csv('datos_limpios_tiempos.csv', low_memory=False)

In [9]:
# Build the working OD table: keep rows whose 'Motivo' is 'trabajo' and
# collapse them to one row per 'H-P' value (presumably a household-person
# id - confirm against the survey codebook).
# NOTE(review): GroupBy.first() returns the first non-null value of each column
# within a group, so a row may combine values from several original rows when
# the first one has gaps - confirm this is acceptable.
od2 = od.loc[od['Motivo'] == 'trabajo'].groupby('H-P').first()

od2

Unnamed: 0_level_0,ID-HOGAR,H-P-V,Latitud,Longitud,FechaHoraEnc,NumVisita,TipoEnc,RealizoEnc,Encuestador,Supervisor,...,Modo Agrupado,motivos,genero,estudios,disc,origen,Tiempo,tiempo_s,tiempo_m,tiempo_h
H-P,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000002-14/1,000002-14,000002-14/1-1,25.798995,-100.04896,2019-11-07T00:00:00Z,Visita 1,HOGAR,Sí,Karla Guadalupe Torreones Perez,Erika Gutierrez Lozano,...,TPUB,1,F,Bajo,No,NA+NA,,2100,35,0.583333
000002-2/1,000002-2,000002-2/1-3,25.796132,-100.04596,2019-11-07T00:00:00Z,Visita 1,VIAJE,Sí,Karla Guadalupe Torreones Perez,Erika Gutierrez Lozano,...,Bicicleta,1,H,Bajo,No,NA+NA,,1800,30,0.500000
000002-2/3,000002-2,000002-2/3-1,25.796132,-100.04596,2019-11-07T00:00:00Z,Visita 1,HABITANTE,,Karla Guadalupe Torreones Perez,Erika Gutierrez Lozano,...,caminando,1,H,Bajo,No,NA+NA,,900,15,0.250000
000011-12/1,000011-12,000011-12/1-1,25.795688,-100.03989,2019-11-07T00:00:00Z,Visita 1,HOGAR,Sí,Yesica Karina Maldonado Martinez,Erika Gutierrez Lozano,...,caminando,1,H,Bajo,No,NA+NA,,600,10,0.166667
000011-8/1,000011-8,000011-8/1-1,25.799000,-100.04100,2019-11-07T00:00:00Z,Visita 1,HOGAR,Sí,Yesica Karina Maldonado Martinez,Erika Gutierrez Lozano,...,TPUB,1,H,Bajo,No,NA+NA,,1800,30,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m9766-4/1,m9766-4,m9766-4/1-1,25.662725,-100.27108,2019-10-29T00:00:00Z,Visita 1,HOGAR,Sí,Juliana Araujo Torres,María San Juana Galván Alvarez,...,automovil,1,H,Bajo,No,NA+NA,,600,10,0.166667
m9766-4/3,m9766-4,m9766-4/3-3,25.662725,-100.27108,2019-10-29T00:00:00Z,Visita 1,VIAJE,,Juliana Araujo Torres,María San Juana Galván Alvarez,...,automovil,1,F,Alto,No,NA+NA,,600,10,0.166667
m9766-6/3,m9766-6,m9766-6/3-1,25.662570,-100.27124,2019-10-29T00:00:00Z,Visita 1,HABITANTE,,Juliana Araujo Torres,María San Juana Galván Alvarez,...,automovil,1,F,Bajo,No,NA+NA,,600,10,0.166667
m9766-6/4,m9766-6,m9766-6/4-1,25.662570,-100.27124,2019-10-29T00:00:00Z,Visita 1,HABITANTE,,Juliana Araujo Torres,María San Juana Galván Alvarez,...,automovil,1,H,Bajo,No,NA+NA,,1800,30,0.500000
