In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [4]:
df = pd.read_csv('../data/dataset_SCL.csv')

  df = pd.read_csv('../data/dataset_SCL.csv')


In [5]:
df.sample(3)

Unnamed: 0,Fecha-I,Vlo-I,Ori-I,Des-I,Emp-I,Fecha-O,Vlo-O,Ori-O,Des-O,Emp-O,DIA,MES,AÑO,DIANOM,TIPOVUELO,OPERA,SIGLAORI,SIGLADES
54013,2017-10-07 15:40:00,410,SCEL,SUMU,LAN,2017-10-07 15:46:00,410,SCEL,SUMU,LAN,7,10,2017,Sabado,I,Grupo LATAM,Santiago,Montevideo
28427,2017-06-30 06:45:00,8161,SCEL,SBGR,TAM,2017-06-30 06:58:00,8161,SCEL,SBGR,TAM,30,6,2017,Viernes,I,Grupo LATAM,Santiago,Sao Paulo
7562,2017-02-13 09:50:00,104,SCEL,SCSE,SKU,2017-02-13 09:52:00,104,SCEL,SCSE,SKU,13,2,2017,Lunes,N,Sky Airline,Santiago,La Serena


# Generación de variable objetivo:

Se generará una variable objetivo para pruebas y ver como afectan algunas variables el atraso o no en los vuelos. Esto será para el análisis exploratorio y el entrenamiento. 

La variable se construirá así:
$$ x = fecha_{operacion} - fecha_{programada}$$
Tomando en cuenta $x$ en minutos, se construirá la siguiente variable del tipo binaria:
$$ y = \begin{cases}
1 & \text{ if } x>= \lambda [min] \\
0 & \text{ if } x< \lambda [min]
\end{cases} $$

Donde $\lambda$ corresponde a un $threshold$ definido por el usuario. En este caso se utilizará 15 [min] tomando en cuenta el trabajo realizado por Juan.

Adicionalmente, se tomarán en cuenta las variables creadas en el notebook `to-expose.ipynb` y/o se crearán algunas variables adicionales.

In [6]:
def get_target(data, threshold=15):
    fecha_o = datetime.strptime(data['Fecha-O'], '%Y-%m-%d %H:%M:%S')
    fecha_i = datetime.strptime(data['Fecha-I'], '%Y-%m-%d %H:%M:%S')
    dif_min = ((fecha_o - fecha_i).total_seconds())/60
    return 1 if dif_min >= threshold else 0

In [7]:
df['target'] = df.apply(get_target, axis=1)

In [8]:
df.sample(3)

Unnamed: 0,Fecha-I,Vlo-I,Ori-I,Des-I,Emp-I,Fecha-O,Vlo-O,Ori-O,Des-O,Emp-O,DIA,MES,AÑO,DIANOM,TIPOVUELO,OPERA,SIGLAORI,SIGLADES,target
9215,2017-02-19 12:55:00,209,SCEL,SCIE,LAN,2017-02-19 12:54:00,209,SCEL,SCIE,LXP,19,2,2017,Domingo,N,Grupo LATAM,Santiago,Concepcion,0
59750,2017-11-19 19:31:00,322,SCEL,SCFA,LAN,2017-11-19 19:34:00,322,SCEL,SCFA,LXP,19,11,2017,Domingo,N,Grupo LATAM,Santiago,Antofagasta,0
23141,2017-05-17 08:45:00,8125,SCEL,SBGL,TAM,2017-05-17 09:10:00,8125,SCEL,SBGL,TAM,17,5,2017,Miercoles,I,Grupo LATAM,Santiago,Rio de Janeiro,1


In [9]:
def calcular_tasa(df, columna):
    
    dic_atrasos = {}
    for _, row in df.iterrows():
        if row['target'] == 1:
            if row[columna] not in dic_atrasos:
                dic_atrasos[row[columna]] = 1
            else:
                dic_atrasos[row[columna]] += 1
    
    total_values = df[columna].value_counts()
    
    dic_tasas = {}
    for name, total in total_values.iteritems():
        if name in dic_atrasos:
            dic_tasas[name] = round(total / dic_atrasos[name], 2)
        else:
            dic_tasas[name] = 0
            
    return pd.DataFrame.from_dict(data = dic_tasas, orient = 'index', columns = ['Tasa (%)'])

In [10]:
tasas_destinos = calcular_tasa(df, 'SIGLADES')
tasas_destinos

  for name, total in total_values.iteritems():


Unnamed: 0,Tasa (%)
Buenos Aires,3.63
Antofagasta,6.36
Lima,4.04
Calama,7.57
Puerto Montt,5.45
...,...
Quito,1.00
Washington,0.00
"Pisco, Peru",0.00
Puerto Stanley,1.00


In [11]:
df['Fecha-I'] = pd.to_datetime(df['Fecha-I'])
df['DIA-I'] = df['Fecha-I'].dt.day
df['MES-I'] = df['Fecha-I'].dt.month
df['HORA-I'] = df['Fecha-I'].dt.hour

In [12]:
df.sample(3)

Unnamed: 0,Fecha-I,Vlo-I,Ori-I,Des-I,Emp-I,Fecha-O,Vlo-O,Ori-O,Des-O,Emp-O,...,AÑO,DIANOM,TIPOVUELO,OPERA,SIGLAORI,SIGLADES,target,DIA-I,MES-I,HORA-I
62349,2017-12-24 12:05:00,118,SCEL,MPTO,CMP,2017-12-24 12:00:00,118,SCEL,MPTO,CMP,...,2017,Domingo,I,Copa Air,Santiago,Ciudad de Panama,0,24,12,12
52916,2017-10-18 18:02:00,217,SCEL,SCIE,LAN,2017-10-18 18:08:00,217,SCEL,SCIE,LXP,...,2017,Miercoles,N,Grupo LATAM,Santiago,Concepcion,0,18,10,18
16205,2017-03-08 01:40:00,572,SCEL,SKBO,LAN,2017-03-08 01:34:00,572,SCEL,SKBO,LAN,...,2017,Miercoles,I,Grupo LATAM,Santiago,Bogota,0,8,3,1


# Transformación de algunas columnas:

Las columnas correspondientes a fechas/marcas temporales se codificarán con una transformada Seno-Coseno para representar de mejor forma el ciclo en estas. Es decir, que las 23.59 están mas cerca de las 00.00 que de las 23.50. Esto puede ayudar a entender mejor algunas componentes temporales y mejorar predicciones.

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from challenge_neueralworks.src.models.delay.preprocesing import CyclicalEncoder

In [20]:
date_cols = ['DIA-I', 'MES-I', 'HORA-I']
categorical_cols = ['OPERA', 'TIPOVUELO']

In [21]:
featurisation = ColumnTransformer(transformers=[
    ('cyclic', CyclicalEncoder(date_cols), date_cols),
    ('cat', OneHotEncoder(), categorical_cols)
])


# Modelo regresión logística:

Se entrenará este modelo por la "simpleza". La idea es ir rápido para tener una versión 0 de esto funcionando.

In [22]:
# columnas a usar
columns = ['DIA-I', 'MES-I', 'HORA-I', 'OPERA', 'MES', 'TIPOVUELO', 'SIGLADES', 'DIANOM', 'target']

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [24]:
log_reg = LogisticRegression()

In [25]:
pipeline = Pipeline(
    [
        ('features', featurisation),
        ('learner', log_reg)
    ]
)

In [26]:
# train-test split
train_data, test_data = train_test_split(
    shuffle(df[columns]),
    random_state=0,
    test_size = 0.33
)

# model training
model = pipeline.fit(
    train_data[filter(lambda x: x != 'target', columns)],
    train_data['target']
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
train_data[filter(lambda x: x != 'target', columns)]

Unnamed: 0,DIA-I,MES-I,HORA-I,OPERA,MES,TIPOVUELO,SIGLADES,DIANOM
63878,23,12,19,Sky Airline,12,N,Antofagasta,Sabado
60518,9,11,23,Grupo LATAM,11,I,Miami,Jueves
64975,7,12,7,Grupo LATAM,12,N,Iquique,Jueves
17206,6,4,22,American Airlines,4,I,Miami,Jueves
13302,22,3,18,Sky Airline,3,N,Calama,Miercoles
...,...,...,...,...,...,...,...,...
66084,29,12,12,Grupo LATAM,12,N,Calama,Viernes
62917,3,12,8,Grupo LATAM,12,I,Rio de Janeiro,Domingo
54634,13,10,23,Grupo LATAM,10,I,Los Angeles,Viernes
25363,3,5,16,Grupo LATAM,5,N,Puerto Montt,Miercoles


In [41]:
train_data.iloc[0].to_dict()

{'DIA-I': 23,
 'MES-I': 12,
 'HORA-I': 19,
 'OPERA': 'Sky Airline',
 'MES': 12,
 'TIPOVUELO': 'N',
 'SIGLADES': 'Antofagasta',
 'DIANOM': 'Sabado',
 'target': 0}

In [33]:
y_pred = model.predict(test_data)

print(
    "Classification Report: \n",
    classification_report(test_data['target'], y_pred)
 )

print(
    "Confusion Matrix: \n",
    confusion_matrix(test_data['target'], y_pred)
)

Classification Report: 
               precision    recall  f1-score   support

           0       0.81      0.99      0.89     18079
           1       0.56      0.03      0.07      4429

    accuracy                           0.80     22508
   macro avg       0.68      0.51      0.48     22508
weighted avg       0.76      0.80      0.73     22508

Confusion Matrix: 
 [[17958   121]
 [ 4275   154]]


# Probemos con upsampling y otro modelo:

In [34]:
#Resample
from sklearn.utils import resample
from sklearn.svm import SVC

data_no_atraso = df[df['target'] == 0]
data_atraso = df[df['target'] == 1]

data_atraso_upsampled = resample(
    data_atraso, 
    replace = True,     # sample with replacement
    n_samples = 30000,    # to match majority class
    random_state = 42
) # reproducible results

data_upsampled = pd.concat([data_no_atraso, data_atraso_upsampled])

In [35]:
svc = SVC(gamma='auto')
pipeline_svc = Pipeline(
    [
        ('features', featurisation),
        ('learner', svc)
    ]
)

In [36]:
# train-test split
upsampled_train_data, upsampled_test_data = train_test_split(
    shuffle(data_upsampled[columns]),
    random_state=0,
    test_size = 0.33
)

# model training
model_svc = pipeline_svc.fit(
    upsampled_train_data[filter(lambda x: x != 'target', columns)],
    upsampled_train_data['target']
)

In [37]:
y_pred = model_svc.predict(upsampled_test_data)

print(
    "Classification Report: \n",
    classification_report(upsampled_test_data['target'], y_pred)
 )

print(
    "Confusion Matrix: \n",
    confusion_matrix(upsampled_test_data['target'], y_pred)
)

Classification Report: 
               precision    recall  f1-score   support

           0       0.66      0.97      0.79     17922
           1       0.65      0.11      0.19      9993

    accuracy                           0.66     27915
   macro avg       0.66      0.54      0.49     27915
weighted avg       0.66      0.66      0.57     27915

Confusion Matrix: 
 [[17337   585]
 [ 8899  1094]]


# Dump del modelo a .joblib

Se cargará la regresión lineal, dado que obtiene mejor accuracy en general. Aun dado que SVC predice mejor para los atrasos, da muchos falsos positivos. 

In [38]:
import dill as pickle

In [39]:
model.steps

[('features',
  ColumnTransformer(transformers=[('cyclic',
                                   CyclicalEncoder(column_names=['DIA-I', 'MES-I',
                                                                 'HORA-I']),
                                   ['DIA-I', 'MES-I', 'HORA-I']),
                                  ('cat', OneHotEncoder(),
                                   ['OPERA', 'TIPOVUELO'])])),
 ('learner', LogisticRegression())]

In [40]:
pickle.dump(model, open('../models/delay/model_v0.pkl', 'wb'), byref=False)