#### Creación de la muestra de entrenamiento y testing con sklearn para Equal Size Sampling, uso de las mejores variables y CV Grid Search

In [None]:
# Importación de librerías

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV

import missingno as msng
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Default figure size (width, height) applied to every matplotlib figure created afterwards.
plt.rcParams['figure.figsize'] = (15, 10)

# Load the flights dataset from a CSV expected next to the notebook.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks like an
# index that was written into the CSV — confirm; none of the encoded features use it.
df = pd.read_csv('dataset_SCL_2.csv')

In [None]:
# Preview the first rows of the raw dataset to check the columns that were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,OPERA,MES,TIPOVUELO,SIGLADES,DIANOM,temporada_alta,periodo_dia,atraso_15,C_J_destinos,C_J_aerolineas
0,42405,Grupo LATAM,8,N,Antofagasta,Domingo,0,noche,0,2,1
1,65490,Grupo LATAM,12,N,Puerto Montt,Miercoles,1,mañana,0,2,1
2,37211,Grupo LATAM,7,I,Rosario,Sabado,1,mañana,1,3,1
3,8036,Sky Airline,2,N,Iquique,Viernes,1,noche,0,2,1
4,41039,Grupo LATAM,8,N,Antofagasta,Viernes,0,tarde,0,2,1


In [None]:
# Pull the dependent variable out into its own Series; it is re-attached to the
# encoded features in a later cell.

atraso_15 = df.loc[:, 'atraso_15']
atraso_15

0        0
1        0
2        1
3        0
4        0
        ..
68201    0
68202    0
68203    0
68204    0
68205    0
Name: atraso_15, Length: 68206, dtype: int64

In [None]:
# The algorithm used below is XGBoost, which requires the categorical variables to be
# dichotomized. Each listed column is one-hot encoded on its own (prefix = column name)
# and the resulting frames are concatenated side by side, in the order listed.

dummies = pd.concat(
    [
        pd.get_dummies(df[columna], prefix = columna)
        for columna in (
            'OPERA', 'MES', 'TIPOVUELO', 'SIGLADES',
            'DIANOM', 'periodo_dia', 'C_J_destinos', 'C_J_aerolineas',
        )
    ],
    axis = 1,
)

In [None]:
# Preview the one-hot encoded columns.
dummies.head()

Unnamed: 0,OPERA_Aerolineas Argentinas,OPERA_Aeromexico,OPERA_Air Canada,OPERA_Air France,OPERA_Alitalia,OPERA_American Airlines,OPERA_Austral,OPERA_Avianca,OPERA_British Airways,OPERA_Copa Air,...,periodo_dia_noche,periodo_dia_tarde,C_J_destinos_1,C_J_destinos_2,C_J_destinos_3,C_J_aerolineas_1,C_J_aerolineas_2,C_J_aerolineas_3,C_J_aerolineas_4,C_J_aerolineas_5
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,0,0,0,0


In [None]:
# Keep only the features selected as the best ones, in this exact order.
# NOTE(review): the selection criterion is not shown in this notebook — confirm where it comes from.

dummies = dummies.loc[:, [
    'MES_7', 'C_J_destinos_3', 'C_J_aerolineas_2', 'C_J_aerolineas_1',
    'periodo_dia_mañana', 'C_J_destinos_1', 'MES_4', 'OPERA_Latin American Wings',
    'MES_12', 'MES_3', 'OPERA_Grupo LATAM', 'OPERA_Aerolineas Argentinas',
    'DIANOM_Martes', 'DIANOM_Jueves', 'SIGLADES_Antofagasta',
    'OPERA_Avianca', 'periodo_dia_tarde', 'OPERA_Aeromexico',
]]

In [None]:
# Attach the dependent variable to the selected features (matching rows on the index)
# and shuffle the rows of the result with a fixed seed.
# Note: from here on `df` no longer holds the raw dataset but the modelling frame.

df = shuffle(
    dummies.merge(atraso_15, left_index=True, right_index=True, how='inner'),
    random_state = 2000,
)
df

Unnamed: 0,MES_7,C_J_destinos_3,C_J_aerolineas_2,C_J_aerolineas_1,periodo_dia_mañana,C_J_destinos_1,MES_4,OPERA_Latin American Wings,MES_12,MES_3,OPERA_Grupo LATAM,OPERA_Aerolineas Argentinas,DIANOM_Martes,DIANOM_Jueves,SIGLADES_Antofagasta,OPERA_Avianca,periodo_dia_tarde,OPERA_Aeromexico,atraso_15
40560,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
46876,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
20541,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
40783,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
28964,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58780,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
37148,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
34358,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
62670,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing and train on the rest.
# The seed is passed straight to the splitter (as the notebook already does for `shuffle`)
# instead of seeding NumPy's global generator, so re-running this cell does not depend on,
# or alter, global random state.
train, test = train_test_split(df, test_size = 0.33, random_state = 100)

# Report the size of each partition.
print("Ejemplos usados para entrenar: ", len(train))
print("Ejemplos usados para test: ", len(test))

Ejemplos usados para entrenar:  45698
Ejemplos usados para test:  22508


In [None]:
# Preview the first rows of the training partition.
train.head()

Unnamed: 0,MES_7,C_J_destinos_3,C_J_aerolineas_2,C_J_aerolineas_1,periodo_dia_mañana,C_J_destinos_1,MES_4,OPERA_Latin American Wings,MES_12,MES_3,OPERA_Grupo LATAM,OPERA_Aerolineas Argentinas,DIANOM_Martes,DIANOM_Jueves,SIGLADES_Antofagasta,OPERA_Avianca,periodo_dia_tarde,OPERA_Aeromexico,atraso_15
24287,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
20123,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
31190,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
46055,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41883,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0


In [None]:
# Preview the first rows of the test partition.
test.head() 

Unnamed: 0,MES_7,C_J_destinos_3,C_J_aerolineas_2,C_J_aerolineas_1,periodo_dia_mañana,C_J_destinos_1,MES_4,OPERA_Latin American Wings,MES_12,MES_3,OPERA_Grupo LATAM,OPERA_Aerolineas Argentinas,DIANOM_Martes,DIANOM_Jueves,SIGLADES_Antofagasta,OPERA_Avianca,periodo_dia_tarde,OPERA_Aeromexico,atraso_15
31631,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
59209,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
48585,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0
49939,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
21673,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Equal size sampling. 
De la muestra de entrenamiento se toman todas las observaciones de la clase 1 y luego se elige, de manera aleatoria, la misma cantidad de observaciones de la clase 0.

In [None]:
# Count how many rows of each class (0 and 1) the training partition contains.

subset_train = train['atraso_15'].value_counts() 
print(subset_train)

0    37227
1     8471
Name: atraso_15, dtype: int64


In [None]:
# Number of class-0 rows to draw so that both classes end up the same size in training.
# This is the count of class-1 rows, looked up by its label (1) rather than by position,
# so it does not depend on the order in which `value_counts()` lists the classes.

muestra = subset_train.loc[1]
print(muestra)

8471


In [None]:
# Boolean mask flagging every class-0 row of the training partition.

condition = train['atraso_15'].eq(0)
# Index labels of the flagged rows; their count is displayed as a sanity check.
true_index = condition[condition].index
len(true_index)

37227

In [None]:
# Randomly draw `muestra` rows from the class-0 rows of the training partition.
# The seed is given to `sample` itself (as the notebook already does for `shuffle`)
# instead of seeding NumPy's global generator, so the draw is repeatable on its own.

subset_2_train = train[condition].sample(n = muestra, random_state = 5891)
subset_2_train.shape

(8471, 19)

In [None]:
# Select every class-1 row of the training partition; all of them are kept.

resto = train['atraso_15'] == 1
resto_filtrado = train[resto]
# Show a short preview with the notebook's rich display instead of printing the whole
# frame as wrapped plain text.
resto_filtrado.head()

       MES_7  C_J_destinos_3  C_J_aerolineas_2  C_J_aerolineas_1  \
43552      0               1                 0                 1   
11073      0               0                 0                 1   
42110      0               0                 0                 1   
35966      0               1                 0                 1   
62949      0               0                 1                 0   
...      ...             ...               ...               ...   
66642      0               0                 0                 1   
47272      0               0                 0                 1   
35278      0               0                 0                 1   
66504      0               1                 0                 1   
12821      0               1                 0                 1   

       periodo_dia_mañana  C_J_destinos_1  MES_4  OPERA_Latin American Wings  \
43552                   0               0      0                           0   
11073                  

In [None]:
# Stack the sampled class-0 rows on top of the class-1 rows: this is the balanced
# training set used for modelling.

train_equal = pd.concat((subset_2_train, resto_filtrado), axis = 0)
train_equal

Unnamed: 0,MES_7,C_J_destinos_3,C_J_aerolineas_2,C_J_aerolineas_1,periodo_dia_mañana,C_J_destinos_1,MES_4,OPERA_Latin American Wings,MES_12,MES_3,OPERA_Grupo LATAM,OPERA_Aerolineas Argentinas,DIANOM_Martes,DIANOM_Jueves,SIGLADES_Antofagasta,OPERA_Avianca,periodo_dia_tarde,OPERA_Aeromexico,atraso_15
48809,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
42237,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
64917,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
50094,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
58707,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66642,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
47272,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1
35278,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
66504,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1


In [None]:
# Class counts of the dependent variable in the new training set, which is now balanced.

train_equal['atraso_15'].value_counts()

0    8471
1    8471
Name: atraso_15, dtype: int64

In [None]:
# Class counts of the dependent variable in the test partition, which is left unbalanced.

test['atraso_15'].value_counts()

0    18365
1     4143
Name: atraso_15, dtype: int64

In [None]:
# Librerías necesarias para modelamiento

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Shuffle the rows of the balanced training set and of the test set, each with a fixed seed.
# Both names are reassigned in place, so re-running this cell shuffles the already shuffled frames.

train_equal = shuffle(train_equal, random_state = 321)
test = shuffle(test, random_state = 321)

In [None]:
# The variable to model is kept on its own as a Series, for both train and test.
# Selecting a single column already yields a Series, so no extra wrapper is needed.

y_train = train_equal.loc[:, 'atraso_15']
y_test = test.loc[:, 'atraso_15']

In [None]:
# Feature matrices: the same frames with the dependent variable column removed.

x_train = train_equal.drop("atraso_15", axis = "columns")
x_test = test.drop("atraso_15", axis = "columns")

#### Utilizando XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [None]:
# Baseline XGBoost classifier on the selected features: fixed seed, learning rate 0.01,
# every other hyper-parameter left at the library default. Trained on the balanced set.

modelxgb_best = xgb.XGBClassifier(learning_rate=0.01, random_state=1).fit(x_train, y_train)

In [None]:
# Predict the class of every row of the (unbalanced) test partition with the baseline model.
y_predxgb_best = modelxgb_best.predict(x_test)

#### Métricas XGBoost

In [None]:
# Confusion matrix of the baseline model on the test partition (true labels first, predictions second).
confusion_matrix(y_test, y_predxgb_best)

array([[11176,  7189],
       [ 1683,  2460]])

In [None]:
# Per-class precision, recall and f1 of the baseline model on the test partition.
print(classification_report(y_test, y_predxgb_best))

              precision    recall  f1-score   support

           0       0.87      0.61      0.72     18365
           1       0.25      0.59      0.36      4143

    accuracy                           0.61     22508
   macro avg       0.56      0.60      0.54     22508
weighted avg       0.76      0.61      0.65     22508



In [None]:
# Hyper-parameter grid: 3 learning rates x 3 ensemble sizes x 2 subsample ratios = 18 candidates.
parameters = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 100, 150],
    subsample=[0.5, 0.9],
)

# Exhaustive search over the grid with 2-fold cross-validation on the balanced training set,
# using all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's default
# score — confirm that this is the metric the selection should optimise.
modelxgb_best_GridCV = GridSearchCV(
    estimator=modelxgb_best,
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
modelxgb_best_GridCV = modelxgb_best_GridCV.fit(x_train, y_train)

Fitting 2 folds for each of 18 candidates, totalling 36 fits


In [None]:
# Predict the test partition with the fitted grid-search object.
y_predxgb_best_GridCV = modelxgb_best_GridCV.predict(x_test)

#### Métricas XGBoost para GRID CV

In [None]:
# Confusion matrix of the grid-searched model on the test partition (true labels first, predictions second).
confusion_matrix(y_test, y_predxgb_best_GridCV)

array([[11243,  7122],
       [ 1520,  2623]])

In [None]:
# Per-class precision, recall and f1 of the grid-searched model on the test partition.
print(classification_report(y_test, y_predxgb_best_GridCV))

              precision    recall  f1-score   support

           0       0.88      0.61      0.72     18365
           1       0.27      0.63      0.38      4143

    accuracy                           0.62     22508
   macro avg       0.58      0.62      0.55     22508
weighted avg       0.77      0.62      0.66     22508



### Este modelo alcanza, sobre la muestra de test, un Accuracy del 62%, una Sensitivity (recall de la clase 1) del 63% y una Precision para la clase 1 del 27%. Se ha logrado mejorar esta última métrica desde un 19% hasta un 27%.