In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import Markdown as md
from datetime import date, datetime
%matplotlib inline
#sns.set(rc={"figure.figsize":(20, 20)})

from pandas import DataFrame
from typing import List
%matplotlib inline
import numpy as np

In [90]:
# COLUMNS
FECHAI = 'Fecha-I'
FECHAO = 'Fecha-O'
VLOO = 'Vlo-O'
ORIO = 'Ori-O'
DESO = 'Des-O'
EMPO = 'Emp-O'
VLOI = 'Vlo-I'
DIA = 'DIA'
MES = 'MES'
ANIO = 'AÑO'
DIANOM = 'DIANOM'
TIPOVUELO = 'TIPOVUELO'
OPERA = 'OPERA'
SIGLADES = 'SIGLADES'
COHORT = 'cohort'
# 
TEMPORADA_ALTA = 'temporada_alta'
DIF_MIN = 'dif_min'
ATRASO_15 = 'atraso_15'
PERIODO_DIA = 'periodo_dia'
MORNING = 'mañana'
AFTERNOON = 'tarde'
NIGHT = 'noche'


DATE = 'Date'
TIME = 'time'
HOUR = 'hour'
MINUTE = 'minute'


COLUM_FILTER = [FECHAI,VLOI, DIA, MES, ANIO, DIANOM, TIPOVUELO, OPERA, SIGLADES, TEMPORADA_ALTA, ATRASO_15, PERIODO_DIA]
COLUM_SYNTHETIC_FEATURES = [TEMPORADA_ALTA, DIF_MIN, ATRASO_15, PERIODO_DIA, 'AnteriorDelay', 'AnteriorEarly']




STR_WINTER_START = '2017-12-15'
STR_WINTER_END = '2017-12-31'
STR_WINTER_BIS_START = '2017-01-01'
STR_WINTER_BIS_END = '2017-03-03'
STR_JULY_START = '2017-07-15'
STR_JULY_END = '2017-07-31'
STR_SEPT_START = '2017-09-11'
STR_SEPT_END = '2017-09-30'


STR_MANANA_START= '5:00'
STR_MANANA_END = '11:59'
STR_TARDE_START = '12:00'
STR_TARDE_END = '18:59'

PARAM_GRID = {
    'n_estimators': [10, 20, 50, 10, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 8, 16, 32, 64, 128, 256, 512],
    'criterion': ['gini', 'entropy'],
    'n_jobs': [2]
}


columns = [TEMPORADA_ALTA, DIF_MIN, ATRASO_15, PERIODO_DIA]
def convert_str_to_date(str_date: str) -> datetime:
    """convert string to date format '%Y-%m-%d'

    Args:
        str_date (str): string ex: '2017-01-01'

    Returns:
        datetime: string with datetime type
    """
    return datetime.strptime(str_date, '%Y-%m-%d').date()

def convert_str_to_time(str_date: str) -> datetime:
    """convert string to time format '%H:%M'

    Args:
        str_date (str): string '23:59'

    Returns:
        datetime: string with datetime type
    """
    return datetime.strptime(str_date, '%H:%M').time()

def add_temporada_alta_flag(df: DataFrame, dates: List) -> DataFrame:
    """Add temporada alta flag according to array of dates

    Args:
        df (DataFrame): _description_
        dates (List): _description_

    Returns:
        DataFrame: _description_
    """
    df[TEMPORADA_ALTA] = 0
    for date in dates:
        df.loc[(df[DATE] >= date[0]) & (df[DATE]<=date[1]), TEMPORADA_ALTA] = 1
    return df.copy()
def add_periodo_dia_flag(df: DataFrame, times: List) -> DataFrame:
    """Add temporada alta flag according to array of dates

    Args:
        df (DataFrame): _description_
        dates (List): _description_

    Returns:
        DataFrame: _description_
    """
    # default
    df[PERIODO_DIA] = NIGHT
    df.loc[(df[TIME] >= times[0][0]) & (df[TIME]<=times[0][1]), PERIODO_DIA] = MORNING
    df.loc[(df[TIME] >= times[1][0]) & (df[TIME]<=times[1][1]), PERIODO_DIA] = AFTERNOON
    return df.copy()

In [91]:
def generate_temporada_alta_set()-> list:
    """
    This function generate arrays to generate temporada_alta elements, source from constant
    dates ahre the following, 
        15-Dic y 3-Mar, o 15-Jul y 31-Jul, o 11-Sep y 30-Sep,
    output: array of element, with the following 
    """
    str_winter_start = convert_str_to_date(STR_WINTER_START)
    str_winter_end = convert_str_to_date(STR_WINTER_END)
    str_winter_bis_start = convert_str_to_date(STR_WINTER_BIS_START)
    str_winter_bis_end = convert_str_to_date(STR_WINTER_BIS_END)
    str_July_start = convert_str_to_date(STR_JULY_START)
    str_July_end = convert_str_to_date(STR_JULY_END)
    str_Sept_start = convert_str_to_date(STR_SEPT_START)
    str_Sept_end = convert_str_to_date(STR_SEPT_END)

    dates = [[str_winter_start, str_winter_end],
             [str_winter_bis_start, str_winter_bis_end],
             [str_July_start, str_July_end],
             [str_Sept_start, str_Sept_end]]
    return dates

def generate_day_section():
    """
    split a day iin three part, morning, afternoon and night
    """
    str_mañana_start = convert_str_to_time(STR_MANANA_START)
    str_mañana_end = convert_str_to_time(STR_MANANA_END)
    str_tarde_start = convert_str_to_time(STR_TARDE_START)
    str_tarde_end = convert_str_to_time(STR_TARDE_END)
    times = [[str_mañana_start, str_mañana_end],
             [str_tarde_start, str_tarde_end],
            ]
    return times

def split_date(df:DataFrame, column_name:str)-> DataFrame:
    """_summary_

    Args:
        df (DataFrame): _description_
        column_name (str): _description_

    Returns:
        DataFrame: _description_
    """
    df[column_name] = pd.to_datetime(df[column_name])
    df[DATE] = df[column_name].dt.date
    df[TIME] = df[column_name].dt.time
    return df.copy()

def add_temporada_alta_flag(df: DataFrame, dates: List) -> DataFrame:
    """Add temporada alta flag according to array of dates

    Args:
        df (DataFrame): _description_
        dates (List): _description_

    Returns:
        DataFrame: _description_
    """
    df[TEMPORADA_ALTA] = 0
    for date in dates:
        df.loc[(df[DATE] >= date[0]) & (df[DATE]<=date[1]), TEMPORADA_ALTA] = 1
    return df.copy()

def add_periodo_dia_flag(df: DataFrame, times: List) -> DataFrame:
    """Add temporada alta flag according to array of dates

    Args:
        df (DataFrame): _description_
        dates (List): _description_

    Returns:
        DataFrame: _description_
    """
    # default
    df[PERIODO_DIA] = NIGHT
    df.loc[(df[TIME] >= times[0][0]) & (df[TIME]<=times[0][1]), PERIODO_DIA] = MORNING
    df.loc[(df[TIME] >= times[1][0]) & (df[TIME]<=times[1][1]), PERIODO_DIA] = AFTERNOON
    return df.copy()

def add_dif_min_and_atraso_15(df:DataFrame) ->DataFrame:
    df[DIF_MIN] = df[FECHAO]-df[FECHAI]
    df[DIF_MIN] = df[DIF_MIN].dt.total_seconds()//60
    df[ATRASO_15] = 0
    df.loc[df[DIF_MIN]>15, ATRASO_15] = 1
    return df



In [92]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score

In [93]:
def delete_operational_columns(df:DataFrame)-> DataFrame:
    del df[FECHAO]
    del df[VLOO]
    del df[ORIO]
    del df[DESO]
    del df[EMPO]
    return df

def filter_column(df:DataFrame, columns:str =  COLUM_FILTER)->DataFrame:
    return df[columns]

def previous_flight (df):
    df["Early"] = df[DIF_MIN].shift(-1)
    df['AnteriorEarly'] = 0
    df.loc[df["Early"]<0, 'AnteriorEarly'] = 1
    df["Delay"] = df[ATRASO_15].shift(-1)
    df['AnteriorDelay'] = 0
    df.loc[df["Delay"]>0, 'AnteriorDelay'] = 1
    return df

def one_hot_encoder(df:DataFrame)-> DataFrame:
    
    df = pd.get_dummies(df, columns = [OPERA])
    df = pd.get_dummies(df, columns = [SIGLADES])
    df = pd.get_dummies(df, columns = [TEMPORADA_ALTA]) 
    df = pd.get_dummies(df, columns = [PERIODO_DIA]) 
    df = pd.get_dummies(df, columns = [VLOI])
    df = pd.get_dummies(df, columns = [DIANOM])
    return df 


def to_bool_tipo_vuelo(df:DataFrame)-> DataFrame:
    df[f"{TIPOVUELO}_"] = df[TIPOVUELO]
    df.loc[df[f"{TIPOVUELO}_"]== 'I', TIPOVUELO] = 1
    df.loc[df[f"{TIPOVUELO}_"]== 'N', TIPOVUELO] = 0
    del df[f"{TIPOVUELO}_"]
    return df

def fit_grid_model(
    rfc: RandomForestClassifier,
    param_grid,
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
):  # pragma: no cover
    """GridSeach

    Args:
        rfc (RandomForestClassifier): ML Model
        param_grid (dict): JSON parameters
        X_train (pd.DataFrame): train df
        y_train (pd.DataFrame): goal df
        model_type (str): model name

    Returns:
        [type]: model trained
    """
    model = GridSearchCV(
        estimator=rfc,
        param_grid=param_grid,
        cv=3,
    )
    model.fit(X_train, y_train)
    return model

In [94]:
dates = generate_temporada_alta_set()
times = generate_day_section()


In [23]:
df = pd.read_csv("../sources/raw/dataset_SCL.csv", dtype={'Vlo-O':str, 'Vlo-I':str})
df[FECHAO] = pd.to_datetime(df[FECHAO])
df = (df.pipe(split_date, FECHAI)
        .pipe(add_temporada_alta_flag, dates)
        .pipe(add_periodo_dia_flag, times)
        #.pipe(apply_cohort)
        .pipe(add_dif_min_and_atraso_15)
        .pipe(delete_operational_columns)
        .pipe(filter_column)
        .pipe(to_bool_tipo_vuelo)
        #.pipe(one_hot_encoder) # generate one_hot_encoder
     )

In [24]:

df1 = df.copy()
df = one_hot_encoder(df)

no sirven
* Ori-I y Pri-o un sólo lugar de origen
* Des-I y SIGLADES son lo mismo
- OPERA == Emp-I

modificadores
- Vlo-I != Vlo-O, es diferente hubo atrasosi el número de vuelo cambio o el origen cambio, significa un retraso? Des-I
- Des-I != Des-O, es diferente, atraso
# no se usaran para evitar sesgos
- Fecha-O


In [26]:
df

Unnamed: 0,Fecha-I,DIA,MES,AÑO,TIPOVUELO,atraso_15,OPERA_Aerolineas Argentinas,OPERA_Aeromexico,OPERA_Air Canada,OPERA_Air France,...,Vlo-I_993,Vlo-I_9955,Vlo-I_9956,DIANOM_Domingo,DIANOM_Jueves,DIANOM_Lunes,DIANOM_Martes,DIANOM_Miercoles,DIANOM_Sabado,DIANOM_Viernes
0,2017-01-01 23:30:00,1,1,2017,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2017-01-02 23:30:00,2,1,2017,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2017-01-03 23:30:00,3,1,2017,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2017-01-04 23:30:00,4,1,2017,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2017-01-05 23:30:00,5,1,2017,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68201,2017-12-22 14:55:00,22,12,2017,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
68202,2017-12-25 14:55:00,25,12,2017,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
68203,2017-12-27 14:55:00,27,12,2017,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
68204,2017-12-29 14:55:00,29,12,2017,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [27]:
columns = df.columns
columns = columns.tolist()
columns.remove(FECHAI)
df = df[columns]
columns.remove(ATRASO_15)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
        df[columns],
        df[ATRASO_15],
        test_size=.8,
        random_state=42,
    )

In [29]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
 

In [33]:
# Select 4 new classifiers
clfs = {"Logistic Regression": LogisticRegression(solver = 'saga',  random_state=0,  max_iter=99999990 ),
        "Support Vector rbf": SVC(kernel='rbf'),
        "Support Vector sigmoid": SVC(kernel='sigmoid'),
        "KNN":KNeighborsClassifier(n_neighbors=15),
        "Decision Tree": DecisionTreeClassifier()
       }

In [34]:
from sklearn.inspection import permutation_importance

def feature_importance(clf, X, y, top_limit=None):

    # Retrieve the Bunch object after 50 repeats
    # n_repeats is the number of times that each feature was permuted to compute the final score
    bunch = permutation_importance(clf, X, y,
                                 n_repeats=50, random_state=42)

    # Average feature importance
    imp_means = bunch.importances_mean

    # List that contains the index of each feature in descending order of importance
    ordered_imp_means_args = np.argsort(imp_means)[::-1]

    # If no limit print all features
    if top_limit is None:
        top_limit = len(ordered_imp_means_args)

    # Print relevant information
    for i, _ in zip(ordered_imp_means_args, range(top_limit)):
        name = data.feature_names[i]
        imp_score = imp_means[i]
        imp_std = bunch.importances_std[i]
        print(f"Feature {name} with index {i} has an average importance score of {imp_score:.3f} +/- {imp_std:.3f}\n")
        # Compute feature importance on the test set given a classifier

def fit_compute_importance(clf, k = 4):
    kf = KFold(n_splits=k, random_state=None)
    result = cross_val_score(clf , X_train, y_train, cv = kf)

    print(f"cross-validation Avg accuracy: {result.mean()}")
    
    clf.fit(X_train, y_train)
    print(f"Mean accuracy score on the test set: {clf.score(X_test, y_test)*100:.2f}%\n")
    # print(f"📏 Mean accuracy score on the test set: {clf.score(X_test, y_test)*100:.2f}%\n")
    # print("🔝 Top 4 features when using the test set:\n")
    #feature_importance(clf, X_test, y_test, top_limit=4)

In [35]:
# Print results
for name, clf in clfs.items():
    print("====="*20)
    print(f"{name} classifier\n")
    fit_compute_importance(clf)

Logistic Regression classifier

cross-validation Avg accuracy: 0.8107913331974954
Mean accuracy score on the test set: 81.61%

Support Vector rbf classifier

cross-validation Avg accuracy: 0.8107913331974954
Mean accuracy score on the test set: 81.61%

Support Vector sigmoid classifier

cross-validation Avg accuracy: 0.8107913331974954
Mean accuracy score on the test set: 81.61%

KNN classifier

cross-validation Avg accuracy: 0.8111579880858117
Mean accuracy score on the test set: 81.50%

Decision Tree classifier

cross-validation Avg accuracy: 0.7508979057749166
Mean accuracy score on the test set: 74.90%



In [36]:
rfc_st = RandomForestClassifier(random_state=42)
model = fit_grid_model(rfc_st, PARAM_GRID, X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8192797580866856

# adding new feature Early Before

In [95]:
df = pd.read_csv("../sources/raw/dataset_SCL.csv", dtype={'Vlo-O':str, 'Vlo-I':str})
df[FECHAO] = pd.to_datetime(df[FECHAO])
df = (df.pipe(split_date, FECHAI)
        .pipe(add_temporada_alta_flag, dates)
        .pipe(add_periodo_dia_flag, times)
        #.pipe(apply_cohort)
        .pipe(add_dif_min_and_atraso_15)
        .pipe(delete_operational_columns)
        .pipe(previous_flight)
        .pipe(filter_column)
        .pipe(to_bool_tipo_vuelo)
        .pipe(one_hot_encoder) # generate one_hot_encoder
     )

In [96]:
columns = df.columns
columns = columns.tolist()
columns.remove(FECHAI)
df = df[columns]
columns.remove(ATRASO_15)

In [97]:
X_train, X_test, y_train, y_test = train_test_split(
        df[columns],
        df[ATRASO_15],
        test_size=.8,
        random_state=42,
    )

In [98]:
# Select 4 new classifiers
clfs = {"Logistic Regression": LogisticRegression(solver = 'saga',  random_state=0,  max_iter=99999990 ),
        "Support Vector rbf": SVC(kernel='rbf'),
        "Support Vector sigmoid": SVC(kernel='sigmoid'),
        "KNN":KNeighborsClassifier(n_neighbors=15),
        "Decision Tree": DecisionTreeClassifier()
       }

In [None]:
# Print results
for name, clf in clfs.items():
    print("====="*20)
    print(f"{name} classifier\n")
    fit_compute_importance(clf)

Logistic Regression classifier



In [None]:
df["Early"] = df[DIF_MIN].shift(-1)
df.loc[df["Early"]>=0, 'AnteriorEarly'] = 0
df.loc[df["Early"]<0, 'AnteriorEarly'] = 1

# adding new feature 
parece que está mal el after

In [81]:
df = pd.read_csv("../sources/raw/dataset_SCL.csv", dtype={'Vlo-O':str, 'Vlo-I':str})
df[FECHAO] = pd.to_datetime(df[FECHAO])
df = (df.pipe(split_date, FECHAI)
        .pipe(add_temporada_alta_flag, dates)
        .pipe(add_periodo_dia_flag, times)
        #.pipe(apply_cohort)
        .pipe(add_dif_min_and_atraso_15)
        .pipe(delete_operational_columns)
        .pipe(filter_column)
        .pipe(to_bool_tipo_vuelo)
        .pipe(one_hot_encoder) # generate one_hot_encoder
     )

In [82]:
df.sort_values(FECHAI, inplace = True)
df.reset_index(inplace=True, drop=True)
df['deltaF'] = df[FECHAI].diff()
df['deltaF_s']  = df.deltaF.dt.seconds
df.loc[(df['deltaF_s'].isna()) & (df[ATRASO_15]==1), 'deltaF_s'] = 15*60
df.loc[(df['deltaF_s'].isna()), 'deltaF_s'] = 0
df['deltaF_s']  = df.deltaF_s.astype(int)
df['Anterior'] = 0
df.loc[df.deltaF_s>15*60, 'Anterior']= 1
del df['deltaF']
del df['deltaF_s']

In [83]:
columns = df.columns
columns = columns.tolist()
columns.remove(FECHAI)
df = df[columns]
columns.remove(ATRASO_15)

In [84]:
X_train, X_test, y_train, y_test = train_test_split(
        df[columns],
        df[ATRASO_15],
        test_size=.8,
        random_state=42,
    )

In [85]:
# Select 4 new classifiers
clfs = {"Logistic Regression": LogisticRegression(solver = 'saga',  random_state=0,  max_iter=99999990 ),
        "Support Vector rbf": SVC(kernel='rbf'),
        "Support Vector sigmoid": SVC(kernel='sigmoid'),
        "KNN":KNeighborsClassifier(n_neighbors=15),
        "Decision Tree": DecisionTreeClassifier()
       }

In [86]:
# Print results
for name, clf in clfs.items():
    print("====="*20)
    print(f"{name} classifier\n")
    fit_compute_importance(clf)

Logistic Regression classifier

cross-validation Avg accuracy: 0.8181219592297131
Mean accuracy score on the test set: 81.43%

Support Vector rbf classifier

cross-validation Avg accuracy: 0.8181219592297131
Mean accuracy score on the test set: 81.43%

Support Vector sigmoid classifier

cross-validation Avg accuracy: 0.8181219592297131
Mean accuracy score on the test set: 81.43%

KNN classifier

cross-validation Avg accuracy: 0.8197345830420986
Mean accuracy score on the test set: 81.42%

Decision Tree classifier

cross-validation Avg accuracy: 0.7469394343468733
Mean accuracy score on the test set: 74.78%



In [77]:
# Print results
for name, clf in clfs.items():
    print("====="*20)
    print(f"{name} classifier\n")
    fit_compute_importance(clf)

Logistic Regression classifier

cross-validation Avg accuracy: 0.8181219592297131
Mean accuracy score on the test set: 81.43%

Support Vector rbf classifier

cross-validation Avg accuracy: 0.8181219592297131
Mean accuracy score on the test set: 81.43%

Support Vector sigmoid classifier

cross-validation Avg accuracy: 0.8181219592297131
Mean accuracy score on the test set: 81.43%

KNN classifier

cross-validation Avg accuracy: 0.8159226747000174
Mean accuracy score on the test set: 81.28%

Decision Tree classifier

cross-validation Avg accuracy: 0.7457668221924754
Mean accuracy score on the test set: 74.26%



In [78]:
rfc_st = RandomForestClassifier(random_state=42)
model = fit_grid_model(rfc_st, PARAM_GRID, X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8170438926051499