#### Creación de la muestra de entrenamiento y testing con sklearn para Up Sampling usando las mejores variables y CV Grid Search en XGBoost. Se usa la misma técnica usada por Juan en la cual tuvo el mejor desempeño. Se obtendrá el XGB y luego se le aplicará un Grid Search (tuneo)

In [None]:
# Importación de librerías

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

import missingno as msng
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 10)

df = pd.read_csv('dataset_SCL_2.csv')

#### Up Sampling

In [None]:
df['atraso_15'].value_counts()

0    55592
1    12614
Name: atraso_15, dtype: int64

In [None]:
#Resample
from sklearn.utils import resample

data_no_retraso = df[df['atraso_15'] == 0]
data_atraso = df[df['atraso_15'] == 1]

data_atraso_upsampled = resample(data_atraso, 
                                 replace = True,     # sample with replacement
                                 n_samples = 30000,    # to match majority class
                                 random_state = 42) # reproducible results

data_upsampled = pd.concat([data_no_retraso, data_atraso_upsampled])

In [None]:
data_upsampled

Unnamed: 0.1,Unnamed: 0,OPERA,MES,TIPOVUELO,SIGLADES,DIANOM,temporada_alta,periodo_dia,atraso_15,C_J_destinos,C_J_aerolineas
0,42405,Grupo LATAM,8,N,Antofagasta,Domingo,0,noche,0,2,1
1,65490,Grupo LATAM,12,N,Puerto Montt,Miercoles,1,mañana,0,2,1
3,8036,Sky Airline,2,N,Iquique,Viernes,1,noche,0,2,1
4,41039,Grupo LATAM,8,N,Antofagasta,Viernes,0,tarde,0,2,1
6,67061,Grupo LATAM,12,I,Sao Paulo,Miercoles,1,tarde,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...
20923,24107,Sky Airline,5,I,Buenos Aires,Martes,0,mañana,1,3,1
11905,16352,Grupo LATAM,3,I,Lima,Martes,0,tarde,1,3,1
53279,4834,Grupo LATAM,1,I,Buenos Aires,Jueves,1,noche,1,3,1
67078,11065,Grupo LATAM,2,I,Florianapolis,Martes,1,tarde,1,3,1


In [None]:
data_upsampled['atraso_15'].value_counts()

0    55592
1    30000
Name: atraso_15, dtype: int64

In [None]:
# El algoritmo que se utilizará será XGBoost, el cual requiere dicotomizar las variables categóricas

features_upsampled = pd.concat([pd.get_dummies(data_upsampled['OPERA'], prefix = 'OPERA'), pd.get_dummies(data_upsampled['MES'], prefix = 'MES'), pd.get_dummies(data_upsampled['TIPOVUELO'], prefix = 'TIPOVUELO'), 
                      pd.get_dummies(data_upsampled['SIGLADES'], prefix = 'SIGLADES'), pd.get_dummies(data_upsampled['DIANOM'], prefix = 'DIANOM'), pd.get_dummies(data_upsampled['periodo_dia'], prefix = 'periodo_dia'),
                      pd.get_dummies(data_upsampled['C_J_destinos'], prefix = 'C_J_destinos'), pd.get_dummies(data_upsampled['C_J_aerolineas'], prefix = 'C_J_aerolineas')], axis = 1)
label_upsampled = data_upsampled['atraso_15']

In [None]:
# Se eligen las mejores variables

dummies = features_upsampled[['MES_7', 'C_J_destinos_3', 'C_J_aerolineas_2', 'C_J_aerolineas_1', 'periodo_dia_mañana', 'C_J_destinos_1', 'MES_4', 'OPERA_Latin American Wings',
                                'MES_12', 'MES_3', 'OPERA_Grupo LATAM', 'OPERA_Aerolineas Argentinas', 'DIANOM_Martes', 'DIANOM_Jueves', 'SIGLADES_Antofagasta',
                                 'OPERA_Avianca', 'periodo_dia_tarde', 'OPERA_Aeromexico']]

#### Utilizando XGBoost

In [None]:
x_upsampled_train, x_upsampled_test, y_upsampled_train, y_upsampled_test = train_test_split(dummies, label_upsampled, test_size = 0.33, random_state = 42)

In [None]:
modelxgb_best_up = xgb.XGBClassifier(random_state=1, learning_rate=0.01, subsample = 1, max_depth = 10)
modelxgb_best_up.fit(x_upsampled_train, y_upsampled_train)

XGBClassifier(learning_rate=0.01, max_depth=10, random_state=1)

In [None]:
y_best_upsampled_predxgb = modelxgb_best_up.predict(x_upsampled_test)

#### Métricas XGBoost

In [None]:
confusion_matrix(y_upsampled_test, y_best_upsampled_predxgb)

array([[16678,  1671],
       [ 7394,  2503]])

In [None]:
print(classification_report(y_upsampled_test, y_best_upsampled_predxgb))

              precision    recall  f1-score   support

           0       0.69      0.91      0.79     18349
           1       0.60      0.25      0.36      9897

    accuracy                           0.68     28246
   macro avg       0.65      0.58      0.57     28246
weighted avg       0.66      0.68      0.64     28246



### Grid Search CV sobre Modelo XG Boost Up Sampling y con los mejores features

In [None]:
parameters = {
                'learning_rate': [0.01,0.05, 0.1],
                'n_estimators': [50, 100, 150],
                'subsample': [0.5, 0.9]
}

modelxgb_best_up_GridCV = GridSearchCV(modelxgb_best_up, param_grid = parameters,
    cv = 2, n_jobs=-1, verbose=1).fit(x_upsampled_train, y_upsampled_train)

Fitting 2 folds for each of 18 candidates, totalling 36 fits


In [None]:
y_predxgb_best_up_GridCV = modelxgb_best_up_GridCV.predict(x_upsampled_test)

#### Métricas XGBoost

In [None]:
confusion_matrix(y_upsampled_test, y_predxgb_best_up_GridCV)

array([[16691,  1658],
       [ 7388,  2509]])

In [None]:
print(classification_report(y_upsampled_test, y_predxgb_best_up_GridCV))

              precision    recall  f1-score   support

           0       0.69      0.91      0.79     18349
           1       0.60      0.25      0.36      9897

    accuracy                           0.68     28246
   macro avg       0.65      0.58      0.57     28246
weighted avg       0.66      0.68      0.64     28246



### Este modelo tiene un Accuracy del 68%, una Sensitivity del 60% y una Precision para la clase 1 del 25%. No se han logrado mejoras, salvo una mejor Accuracy.