#### Creación de la muestra de entrenamiento y testing con sklearn para Up Sampling usando las mejores variables, Random Forest (Bosques Aleatorios) e hiperparámetros con Grid Search.

In [None]:
# Library imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

import missingno as msng
import warnings
# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')
%matplotlib inline
# Default (width, height) for matplotlib figures.
plt.rcParams['figure.figsize'] = (15, 10)

# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('dataset_SCL_2.csv')

#### Up Sampling

In [None]:
# Class balance of the atraso_15 column before any resampling.
df['atraso_15'].value_counts()

0    55592
1    12614
Name: atraso_15, dtype: int64

In [None]:
# Up-sample the atraso_15 == 1 rows by drawing from them with replacement.
from sklearn.utils import resample

data_no_retraso = df[df['atraso_15'] == 0]
data_atraso = df[df['atraso_15'] == 1]

# NOTE(review): resampling is done before the train/test split further down, so
# copies of the same row can land in both the training and the test set, which
# can inflate test metrics. Consider splitting first and up-sampling only the
# training portion.
data_atraso_upsampled = resample(data_atraso, 
                                 replace = True,     # sample with replacement
                                 n_samples = 30000,    # fixed target size; this is NOT the majority-class row count
                                 random_state = 42) # reproducible results

# Stack the untouched atraso_15 == 0 rows on top of the resampled atraso_15 == 1 rows.
data_upsampled = pd.concat([data_no_retraso, data_atraso_upsampled])

In [None]:
# Display the combined frame (atraso_15 == 0 rows followed by the resampled atraso_15 == 1 rows).
data_upsampled

Unnamed: 0.1,Unnamed: 0,OPERA,MES,TIPOVUELO,SIGLADES,DIANOM,temporada_alta,periodo_dia,atraso_15,C_J_destinos,C_J_aerolineas
0,42405,Grupo LATAM,8,N,Antofagasta,Domingo,0,noche,0,2,1
1,65490,Grupo LATAM,12,N,Puerto Montt,Miercoles,1,mañana,0,2,1
3,8036,Sky Airline,2,N,Iquique,Viernes,1,noche,0,2,1
4,41039,Grupo LATAM,8,N,Antofagasta,Viernes,0,tarde,0,2,1
6,67061,Grupo LATAM,12,I,Sao Paulo,Miercoles,1,tarde,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...
20923,24107,Sky Airline,5,I,Buenos Aires,Martes,0,mañana,1,3,1
11905,16352,Grupo LATAM,3,I,Lima,Martes,0,tarde,1,3,1
53279,4834,Grupo LATAM,1,I,Buenos Aires,Jueves,1,noche,1,3,1
67078,11065,Grupo LATAM,2,I,Florianapolis,Martes,1,tarde,1,3,1


In [None]:
# Class balance after up-sampling.
data_upsampled['atraso_15'].value_counts()

0    55592
1    30000
Name: atraso_15, dtype: int64

In [None]:
# Random Forest is used below, so the categorical columns are one-hot encoded first.
# Each column is expanded on its own (prefixed with its own name) and the pieces
# are joined side by side in the order listed.
features_upsampled = pd.concat(
    [
        pd.get_dummies(data_upsampled[column], prefix=column)
        for column in (
            'OPERA',
            'MES',
            'TIPOVUELO',
            'SIGLADES',
            'DIANOM',
            'periodo_dia',
            'C_J_destinos',
            'C_J_aerolineas',
        )
    ],
    axis=1,
)
# Target vector: the atraso_15 column of the up-sampled frame.
label_upsampled = data_upsampled['atraso_15']

In [None]:
# Keep the 18 dummy columns chosen as the best variables
# (how they were ranked is not shown in this notebook section).
dummies = features_upsampled.loc[
    :,
    [
        'MES_7',
        'C_J_destinos_3',
        'C_J_aerolineas_2',
        'C_J_aerolineas_1',
        'periodo_dia_mañana',
        'C_J_destinos_1',
        'MES_4',
        'OPERA_Latin American Wings',
        'MES_12',
        'MES_3',
        'OPERA_Grupo LATAM',
        'OPERA_Aerolineas Argentinas',
        'DIANOM_Martes',
        'DIANOM_Jueves',
        'SIGLADES_Antofagasta',
        'OPERA_Avianca',
        'periodo_dia_tarde',
        'OPERA_Aeromexico',
    ],
]

In [None]:
# Sanity check: (rows, columns) of the selected feature matrix.
dummies.shape

(85592, 18)

#### Utilizando Random Forest

In [None]:
# Hold out 33% of the up-sampled rows for testing; the fixed seed keeps the split repeatable.
(
    x_upsampled_train,
    x_upsampled_test,
    y_upsampled_train,
    y_upsampled_test,
) = train_test_split(
    dummies,
    label_upsampled,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Baseline Random Forest: 500 trees, at least 10 samples per leaf, fixed seed.
modelrf_best_up = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=10,
    random_state=2016,
)

# Train on the training portion of the split.
modelrf_best_up.fit(x_upsampled_train, y_upsampled_train)

RandomForestClassifier(min_samples_leaf=10, n_estimators=500, random_state=2016)

In [None]:
# Predict class labels for the held-out test features with the baseline forest.
y_best_upsampled_predrf = modelrf_best_up.predict(x_upsampled_test)

#### Métricas Random Forest

In [None]:
# Confusion matrix for the baseline forest (true labels first, predictions second).
confusion_matrix(y_upsampled_test, y_best_upsampled_predrf)

array([[16715,  1634],
       [ 7429,  2468]])

In [None]:
# Per-class precision, recall and F1 of the baseline forest on the test split.
print(classification_report(y_upsampled_test, y_best_upsampled_predrf))

              precision    recall  f1-score   support

           0       0.69      0.91      0.79     18349
           1       0.60      0.25      0.35      9897

    accuracy                           0.68     28246
   macro avg       0.65      0.58      0.57     28246
weighted avg       0.66      0.68      0.63     28246



#### Tuning

In [None]:
# Grid search over two Random Forest hyperparameters: max_features and n_estimators.
from sklearn.model_selection import GridSearchCV

# max_features may range from 1 up to the number of columns in the training
# matrix; deriving the bound keeps the grid valid if the feature list changes.
max_features_range = np.arange(1, x_upsampled_train.shape[1] + 1, 1)
n_estimators_range = np.arange(10,310,10)
param_grid = dict(max_features=max_features_range, n_estimators=n_estimators_range)

# Fixed seed: without it every fit grows different trees, so the ranking of
# candidates (and the reported best parameters) changes from run to run.
rf = RandomForestClassifier(random_state=42)

# n_jobs=-1 evaluates candidates in parallel on all available cores; it does
# not change the scores.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score (accuracy); with unbalanced classes a metric such
# as F1 or recall for class 1 may be a better target. Confirm intent.
grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

In [None]:
# Run the search. With the 18 x 30 grid and cv=5 defined above this is
# 540 candidates x 5 folds = 2,700 forest fits, so expect a long runtime.
grid.fit(x_upsampled_train, y_upsampled_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18]),
                         'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260,
       270, 280, 290, 300])})

In [None]:
# Report the best hyperparameter combination and its mean cross-validated score.
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'max_features': 5, 'n_estimators': 180} with a score of 0.69


Tabulamos los resultados del grid search (parámetros y accuracy promedio) en un DataFrame

In [None]:
# One row per candidate: its hyperparameters plus the mean cross-validated test score.
grid_results = pd.DataFrame(grid.cv_results_["params"]).assign(
    Accuracy=grid.cv_results_["mean_test_score"]
)
grid_results.head(10)

Unnamed: 0,max_features,n_estimators,Accuracy
0,1,10,0.686813
1,1,20,0.686465
2,1,30,0.686866
3,1,40,0.686953
4,1,50,0.687005
5,1,60,0.687389
6,1,70,0.687214
7,1,80,0.687005
8,1,90,0.687319
9,1,100,0.687057


Preparar los datos para graficarlos

In [None]:
# Index the scores by (max_features, n_estimators); mean() would average any repeated pairs.
grid_contour = grid_results.groupby(['max_features','n_estimators']).mean()
grid_contour

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy
max_features,n_estimators,Unnamed: 2_level_1
1,10,0.686813
1,20,0.686465
1,30,0.686866
1,40,0.686953
1,50,0.687005
...,...,...
18,260,0.687354
18,270,0.687197
18,280,0.686709
18,290,0.686848


In [None]:
# Reshape the scores into a max_features (rows) x n_estimators (columns) grid.
grid_reset = grid_contour.reset_index()
grid_reset.columns = ['max_features', 'n_estimators', 'Accuracy']
# Keyword arguments are required here: pandas 2.0 made DataFrame.pivot's
# parameters keyword-only, so the positional form raises TypeError there.
# `values` is left unset on purpose so the columns stay a two-level index
# ('Accuracy', n_estimators), which the later `columns.levels[1]` lookup expects.
grid_pivot = grid_reset.pivot(index='max_features', columns='n_estimators')
grid_pivot

Unnamed: 0_level_0,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy
n_estimators,10,20,30,40,50,60,70,80,90,100,...,210,220,230,240,250,260,270,280,290,300
max_features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,0.686813,0.686465,0.686866,0.686953,0.687005,0.687389,0.687214,0.687005,0.687319,0.687057,...,0.687197,0.687389,0.687127,0.687092,0.687511,0.687581,0.687232,0.68718,0.68711,0.686918
2,0.686203,0.686412,0.687023,0.687092,0.687336,0.686761,0.687057,0.687232,0.686831,0.687284,...,0.686761,0.68697,0.687023,0.687145,0.68718,0.687459,0.687197,0.687127,0.686866,0.687284
3,0.686656,0.687249,0.686918,0.687075,0.68711,0.686639,0.686342,0.687057,0.686883,0.686988,...,0.68697,0.6869,0.68711,0.686621,0.687371,0.686918,0.686953,0.68704,0.687336,0.687389
4,0.686901,0.68711,0.687092,0.687459,0.6869,0.687389,0.687267,0.687354,0.687336,0.686848,...,0.686883,0.687162,0.687127,0.68704,0.68704,0.687162,0.687511,0.686813,0.686866,0.687023
5,0.686063,0.686639,0.686744,0.686883,0.68711,0.687162,0.686499,0.687459,0.68718,0.686761,...,0.687633,0.687197,0.687459,0.687633,0.687075,0.686761,0.687197,0.686813,0.687075,0.686918
6,0.686011,0.687284,0.687354,0.686761,0.686726,0.687023,0.687145,0.68718,0.686744,0.687354,...,0.68718,0.687249,0.686761,0.686866,0.68704,0.687354,0.686883,0.687371,0.68697,0.687075
7,0.686482,0.686273,0.686709,0.686953,0.687075,0.687127,0.687057,0.68697,0.687127,0.686726,...,0.687249,0.687197,0.687127,0.686674,0.686988,0.686866,0.686935,0.686918,0.687005,0.687232
8,0.686499,0.686813,0.686796,0.686848,0.686953,0.687145,0.686848,0.68704,0.686778,0.687371,...,0.687493,0.687633,0.687075,0.686848,0.68718,0.686778,0.687249,0.687162,0.687127,0.686953
9,0.686726,0.6869,0.687371,0.686796,0.686901,0.687354,0.687145,0.686866,0.687075,0.68704,...,0.687092,0.686726,0.686918,0.687371,0.68697,0.687023,0.687389,0.686848,0.687092,0.68704
10,0.68643,0.687197,0.687214,0.687302,0.68643,0.686935,0.687302,0.686988,0.687075,0.687092,...,0.68718,0.686866,0.687267,0.687267,0.686988,0.686587,0.687197,0.68711,0.686691,0.68711


In [None]:
# Plot inputs taken from the pivoted grid:
#   x = second level of the column index (the n_estimators values),
#   y = row index (the max_features values),
#   z = 2-D array of scores, one row per y value and one column per x value.
x = grid_pivot.columns.levels[1].values
y = grid_pivot.index.values
z = grid_pivot.values

#### 2D Contour Plot

In [None]:
import plotly.graph_objects as go

# Axis titles for the figure; `layout` is also reused by the surface-plot cell.
layout = go.Layout(
    xaxis=dict(title=dict(text='n_estimators')),
    yaxis=dict(title=dict(text='max_features')),
)

# Contour map of the score over the (n_estimators, max_features) grid.
fig = go.Figure(data=[go.Contour(x=x, y=y, z=z)], layout=layout)

fig.update_layout(
    title='Hyperparameter tuning',
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=65, r=50, b=65, t=90),
)

fig.show()

#### 3D Surface Plot

In [None]:
import plotly.graph_objects as go

# 3-D surface of the same score grid. `layout` is the object built in the
# contour-plot cell; the 3-D axis titles are set through `scene` below.
fig = go.Figure(data=[go.Surface(x=x, y=y, z=z)], layout=layout)
fig.update_layout(
    title='Hyperparameter tuning',
    scene=dict(
        xaxis_title='n_estimators',
        yaxis_title='max_features',
        zaxis_title='Accuracy',
    ),
    autosize=False,
    width=800,
    height=800,
    margin=dict(l=65, r=50, b=65, t=90),
)
fig.show()

#### Utilizando Random Forest Best Parameters
### OJO toma casi 2 horas el Grid Search.

In [None]:
# Second split with the same inputs, test size and seed as the earlier one,
# stored under separate names for the tuned model.
(
    x_upsampled_train2,
    x_upsampled_test2,
    y_upsampled_train2,
    y_upsampled_test2,
) = train_test_split(
    dummies,
    label_upsampled,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Random Forest with the values the grid search reported as best
# (n_estimators=180, max_features=5), hard-coded here rather than read from
# grid.best_params_.
modelrf_best_up_grid = RandomForestClassifier(
    n_estimators=180,
    max_features=5,
    random_state=123,
)

# Train on the training portion of the second split.
modelrf_best_up_grid.fit(x_upsampled_train2, y_upsampled_train2)

RandomForestClassifier(max_features=5, n_estimators=180, random_state=123)

In [None]:
# Predict class labels for the held-out test features with the tuned forest.
y_best_upsampled_grid_predrf = modelrf_best_up_grid.predict(x_upsampled_test2)

#### Métricas Random Forest

In [None]:
# Confusion matrix for the tuned forest (true labels first, predictions second).
confusion_matrix(y_upsampled_test2, y_best_upsampled_grid_predrf)

array([[16651,  1698],
       [ 7366,  2531]])

In [None]:
# Per-class precision, recall and F1 of the tuned forest on the test split.
print(classification_report(y_upsampled_test2, y_best_upsampled_grid_predrf))

              precision    recall  f1-score   support

           0       0.69      0.91      0.79     18349
           1       0.60      0.26      0.36      9897

    accuracy                           0.68     28246
   macro avg       0.65      0.58      0.57     28246
weighted avg       0.66      0.68      0.64     28246



### Este modelo tiene un Accuracy del 68%, una Sensitivity (recall) para la clase 1 del 26% y una Precision para la clase 1 del 60%. Toma casi 2 horas la espera del tuneo.