In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
df = pd.read_csv('../data/dataset_SCL.csv')

  df = pd.read_csv('../data/dataset_SCL.csv')


In [3]:
df.sample(3)

Unnamed: 0,Fecha-I,Vlo-I,Ori-I,Des-I,Emp-I,Fecha-O,Vlo-O,Ori-O,Des-O,Emp-O,DIA,MES,AÑO,DIANOM,TIPOVUELO,OPERA,SIGLAORI,SIGLADES
32631,2017-07-07 15:30:00,1285,SCEL,SABE,ARG,2017-07-07 16:46:00,1285.0,SCEL,SABE,ARG,7,7,2017,Viernes,I,Aerolineas Argentinas,Santiago,Buenos Aires
67075,2017-12-16 19:55:00,704,SCEL,LEMD,LAN,2017-12-16 20:01:00,704.0,SCEL,LEMD,LAN,16,12,2017,Sabado,I,Grupo LATAM,Santiago,Madrid
46313,2017-09-09 18:25:00,531,SCEL,SACO,SKU,2017-09-09 18:51:00,531.0,SCEL,SACO,SKU,9,9,2017,Sabado,I,Sky Airline,Santiago,Cordoba


# Generación de variable objetivo:

Se generará una variable objetivo para pruebas y ver como afectan algunas variables el atraso o no en los vuelos. Esto será para el análisis exploratorio y el entrenamiento. 

La variable se construirá así:
$$ x = fecha_{operacion} - fecha_{programada}$$
Tomando en cuenta $x$ en minutos, se construirá la siguiente variable del tipo binaria:
$$ y = \begin{cases}
1 & \text{ if } x>= \lambda [min] \\
0 & \text{ if } x< \lambda [min]
\end{cases} $$

Donde $\lambda$ corresponde a un $threshold$ definido por el usuario. En este caso se utilizará 15 [min] tomando en cuenta el trabajo realizado por Juan.

Adicionalmente, se tomarán en cuenta las variables creadas en el notebook `to-expose.ipynb` y/o se crearán algunas variables adicionales.

In [4]:
def get_target(data, threshold=15):
    fecha_o = datetime.strptime(data['Fecha-O'], '%Y-%m-%d %H:%M:%S')
    fecha_i = datetime.strptime(data['Fecha-I'], '%Y-%m-%d %H:%M:%S')
    dif_min = ((fecha_o - fecha_i).total_seconds())/60
    return 1 if dif_min >= threshold else 0

In [5]:
df['target'] = df.apply(get_target, axis=1)

In [6]:
df.sample(3)

Unnamed: 0,Fecha-I,Vlo-I,Ori-I,Des-I,Emp-I,Fecha-O,Vlo-O,Ori-O,Des-O,Emp-O,DIA,MES,AÑO,DIANOM,TIPOVUELO,OPERA,SIGLAORI,SIGLADES,target
607,2017-01-16 01:42:00,276,SCEL,MPTO,CMP,2017-01-16 01:42:00,276,SCEL,MPTO,CMP,16,1,2017,Lunes,I,Copa Air,Santiago,Ciudad de Panama,0
35442,2017-07-03 11:58:00,170,SCEL,SCDA,LAN,2017-07-03 11:58:00,170,SCEL,SCDA,LAN,3,7,2017,Lunes,N,Grupo LATAM,Santiago,Iquique,0
41312,2017-08-02 12:00:00,170,SCEL,SCDA,LAN,2017-08-02 12:05:00,170,SCEL,SCDA,LAN,2,8,2017,Miercoles,N,Grupo LATAM,Santiago,Iquique,0


In [7]:
def calcular_tasa(df, columna):
    
    dic_atrasos = {}
    for _, row in df.iterrows():
        if row['target'] == 1:
            if row[columna] not in dic_atrasos:
                dic_atrasos[row[columna]] = 1
            else:
                dic_atrasos[row[columna]] += 1
    
    total_values = df[columna].value_counts()
    
    dic_tasas = {}
    for name, total in total_values.iteritems():
        if name in dic_atrasos:
            dic_tasas[name] = round(total / dic_atrasos[name], 2)
        else:
            dic_tasas[name] = 0
            
    return pd.DataFrame.from_dict(data = dic_tasas, orient = 'index', columns = ['Tasa (%)'])

In [8]:
tasas_destinos = calcular_tasa(df, 'SIGLADES')
tasas_destinos

  for name, total in total_values.iteritems():


Unnamed: 0,Tasa (%)
Buenos Aires,3.63
Antofagasta,6.36
Lima,4.04
Calama,7.57
Puerto Montt,5.45
...,...
Quito,1.00
Washington,0.00
"Pisco, Peru",0.00
Puerto Stanley,1.00


In [9]:
df['Fecha-I'] = pd.to_datetime(df['Fecha-I'])
df['DIA-I'] = df['Fecha-I'].dt.day
df['MES-I'] = df['Fecha-I'].dt.month
df['HORA-I'] = df['Fecha-I'].dt.hour

In [10]:
df.sample(3)

Unnamed: 0,Fecha-I,Vlo-I,Ori-I,Des-I,Emp-I,Fecha-O,Vlo-O,Ori-O,Des-O,Emp-O,...,AÑO,DIANOM,TIPOVUELO,OPERA,SIGLAORI,SIGLADES,target,DIA-I,MES-I,HORA-I
49544,2017-09-28 06:30:00,1031,SCEL,SCIE,LAW,2017-09-28 06:29:00,1031,SCEL,SCIE,JMR,...,2017,Jueves,N,Latin American Wings,Santiago,Concepcion,0,28,9,6
45276,2017-09-04 09:10:00,1,SCEL,SCTE,SKU,2017-09-04 09:16:00,1,SCEL,SCTE,SKU,...,2017,Lunes,N,Sky Airline,Santiago,Puerto Montt,0,4,9,9
42702,2017-08-17 15:55:00,410,SCEL,SUMU,LAN,2017-08-17 15:57:00,410,SCEL,SUMU,LAN,...,2017,Jueves,I,Grupo LATAM,Santiago,Montevideo,0,17,8,15


# Transformación de algunas columnas:

Las columnas correspondientes a fechas/marcas temporales se codificarán con una transformada Seno-Coseno para representar de mejor forma el ciclo en estas. Es decir, que las 23.59 están mas cerca de las 00.00 que de las 23.50. Esto puede ayudar a entender mejor algunas componentes temporales y mejorar predicciones.

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

class CyclicalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, column_names):
        self.column_names = column_names
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        for col in self.column_names:
            radians = 2*np.pi*X[col]/X[col].max()
            X[col+"_sin"] = np.sin(radians)
            X[col+"_cos"] = np.cos(radians)
            X.drop(columns=[col], inplace=True)
        return X

In [12]:
date_cols = ['DIA-I', 'MES-I', 'HORA-I']
categorical_cols = ['OPERA', 'TIPOVUELO']

In [13]:
featurisation = ColumnTransformer(transformers=[
    ('cyclic', CyclicalEncoder(date_cols), date_cols),
    ('cat', OneHotEncoder(), categorical_cols)
])


# Modelo regresión logística:

Se entrenará este modelo por la "simpleza". La idea es ir rápido para tener una versión 0 de esto funcionando.

In [14]:
# columnas a usar
columns = ['DIA-I', 'MES-I', 'HORA-I', 'OPERA', 'MES', 'TIPOVUELO', 'SIGLADES', 'DIANOM', 'target']

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [16]:
log_reg = LogisticRegression()

In [17]:
pipeline = Pipeline(
    [
        ('features', featurisation),
        ('learner', log_reg)
    ]
)

In [18]:
# train-test split
train_data, test_data = train_test_split(
    shuffle(df[columns]),
    random_state=0,
    test_size = 0.33
)

# model training
model = pipeline.fit(
    train_data[filter(lambda x: x != 'target', columns)],
    train_data['target']
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
y_pred = model.predict(test_data)

print(
    "Classification Report: \n",
    classification_report(test_data['target'], y_pred)
 )

print(
    "Confusion Matrix: \n",
    confusion_matrix(test_data['target'], y_pred)
)

Classification Report: 
               precision    recall  f1-score   support

           0       0.81      0.99      0.89     18048
           1       0.58      0.04      0.07      4460

    accuracy                           0.80     22508
   macro avg       0.69      0.51      0.48     22508
weighted avg       0.76      0.80      0.73     22508

Confusion Matrix: 
 [[17933   115]
 [ 4303   157]]


# Probemos con upsampling y otro modelo:

In [20]:
#Resample
from sklearn.utils import resample
from sklearn.svm import SVC

data_no_atraso = df[df['target'] == 0]
data_atraso = df[df['target'] == 1]

data_atraso_upsampled = resample(
    data_atraso, 
    replace = True,     # sample with replacement
    n_samples = 30000,    # to match majority class
    random_state = 42
) # reproducible results

data_upsampled = pd.concat([data_no_atraso, data_atraso_upsampled])

In [21]:
svc = SVC(gamma='auto')
pipeline_svc = Pipeline(
    [
        ('features', featurisation),
        ('learner', svc)
    ]
)

In [22]:
# train-test split
upsampled_train_data, upsampled_test_data = train_test_split(
    shuffle(data_upsampled[columns]),
    random_state=0,
    test_size = 0.33
)

# model training
model_svc = pipeline_svc.fit(
    upsampled_train_data[filter(lambda x: x != 'target', columns)],
    upsampled_train_data['target']
)

In [23]:
y_pred = model_svc.predict(upsampled_test_data)

print(
    "Classification Report: \n",
    classification_report(upsampled_test_data['target'], y_pred)
 )

print(
    "Confusion Matrix: \n",
    confusion_matrix(upsampled_test_data['target'], y_pred)
)

Classification Report: 
               precision    recall  f1-score   support

           0       0.67      0.97      0.79     18116
           1       0.65      0.11      0.19      9799

    accuracy                           0.67     27915
   macro avg       0.66      0.54      0.49     27915
weighted avg       0.66      0.67      0.58     27915

Confusion Matrix: 
 [[17530   586]
 [ 8691  1108]]


# Dump del modelo a .joblib

Se cargará la regresión lineal, dado que obtiene mejor accuracy en general. Aun dado que SVC predice mejor para los atrasos, da muchos falsos positivos. 

In [26]:
import joblib

In [28]:
joblib.dump(model, '../models/delay/model_v0.joblib')

['../models/delay/model_v0.joblib']