# Experiment 01: Airline dataset

In this experiment we use [the airline dataset](http://kt.ijs.si/elena_ikonomovska/data.html) to predict arrival delay. The dataset consists of a large amount of records, containing flight arrival and departure details for all the commercial flights within the USA, from October 1987 to April 2008. Its size is around 116 million records and 5.76 GB of memory.



In [7]:
import os,sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,
                             recall_score, mean_squared_error, mean_absolute_error, r2_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from scipy.stats import gmean

from libs.timer import Timer
from libs.loaders import load_airline
from libs.conversion import convert_cols_categorical_to_numeric, convert_related_cols_categorical_to_numeric
from libs.conversion import _get_nominal_integer_dict, _convert_to_integer
os.environ['MOUNT_POINT'] = '/fileshare'
print("System version: {}".format(sys.version))

System version: 3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 12:22:00) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


### Data loading and management

In [14]:
%%time
df_plane = load_airline()
print(df_plane.shape)

(115069017, 14)
CPU times: user 1min 34s, sys: 12.2 s, total: 1min 46s
Wall time: 2min 2s


In [15]:
df_plane.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,AA,190,247,SFO,ORD,1846,0,27
1,1987,10,1,4,5,114,EA,57,74,LAX,SFO,337,0,5
2,1987,10,1,4,5,35,HP,351,167,ICT,LAS,987,0,17
3,1987,10,1,4,5,40,DL,251,35,MCO,PBI,142,0,-2
4,1987,10,1,4,8,517,UA,500,208,LAS,ORD,1515,0,17


In [16]:
%%time
df_plane_numeric = convert_related_cols_categorical_to_numeric(df_plane, col_list=['Origin','Dest'])


CPU times: user 1min 40s, sys: 11.2 s, total: 1min 51s
Wall time: 1min 51s


In [17]:
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,AA,190,247,0,33,1846,0,27
1,1987,10,1,4,5,114,EA,57,74,1,0,337,0,5
2,1987,10,1,4,5,35,HP,351,167,2,4,987,0,17
3,1987,10,1,4,5,40,DL,251,35,3,41,142,0,-2
4,1987,10,1,4,8,517,UA,500,208,4,33,1515,0,17


In [18]:
%%time
df_plane_numeric = convert_cols_categorical_to_numeric(df_plane_numeric, col_list='UniqueCarrier')


CPU times: user 52.5 s, sys: 8.4 s, total: 1min
Wall time: 1min 1s


In [19]:
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,0,190,247,0,33,1846,0,27
1,1987,10,1,4,5,114,1,57,74,1,0,337,0,5
2,1987,10,1,4,5,35,2,351,167,2,4,987,0,17
3,1987,10,1,4,5,40,3,251,35,3,41,142,0,-2
4,1987,10,1,4,8,517,4,500,208,4,33,1515,0,17


In [20]:
%%time
filter_func = lambda x: 1 if x > 0 else 0
df_plane_numeric['ArrDelayBinary'] = df_plane_numeric['ArrDelay'].map(filter_func)

CPU times: user 35.5 s, sys: 2.44 s, total: 38 s
Wall time: 38.1 s


In [21]:
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay,ArrDelayBinary
0,1987,10,1,4,1,556,0,190,247,0,33,1846,0,27,1
1,1987,10,1,4,5,114,1,57,74,1,0,337,0,5,1
2,1987,10,1,4,5,35,2,351,167,2,4,987,0,17,1
3,1987,10,1,4,5,40,3,251,35,3,41,142,0,-2,0
4,1987,10,1,4,8,517,4,500,208,4,33,1515,0,17,1


In [22]:
def split_train_val_test_df(df, val_size=0.2, test_size=0.2):
    train, validate, test = np.split(df.sample(frac=1), 
                                     [int((1-val_size-test_size)*len(df)), int((1-test_size)*len(df))])
    return train, validate, test

In [23]:
%%time
train, validate, test = split_train_val_test_df(df_plane_numeric)
print(train.shape)
print(validate.shape)
print(test.shape)

(69041410, 15)
(23013803, 15)
(23013804, 15)
CPU times: user 1min 12s, sys: 50.8 s, total: 2min 2s
Wall time: 2min 3s


In [24]:
%%time
X_train = train[train.columns.difference(['ArrDelay', 'ArrDelayBinary'])]
X_val = validate[validate.columns.difference(['ArrDelay', 'ArrDelayBinary'])]
X_test = test[test.columns.difference(['ArrDelay', 'ArrDelayBinary'])]
y_train = train['ArrDelayBinary']
y_val = validate['ArrDelayBinary']
y_test = test['ArrDelayBinary']

CPU times: user 1.9 s, sys: 6.66 s, total: 8.56 s
Wall time: 8.53 s


### Training 

In [25]:
pipeline_steps = [('scale', StandardScaler())]
continuous_pipeline = Pipeline(steps=pipeline_steps)
featurisers = [('continuous', continuous_pipeline)]

In [26]:
xgb_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),
                                 ('clf', XGBClassifier(max_depth=8,
                                                      n_estimators=100,
                                                      min_child_weight=30,
                                                      learning_rate=0.1,
                                                      subsample=0.80,
                                                      colsample_bytree=0.80,
                                                      seed=77))])
    
# xgb_clf_pipeline = XGBClassifier(max_depth=8,
#                                n_estimators=100,
#                                min_child_weight=30,
#                                learning_rate=0.1,
#                                subsample=0.80,
#                                colsample_bytree=0.80,
#                                seed=77)

In [29]:
%%time
xgb_clf_pipeline.fit(X_train, y_train, clf__eval_set=[(X_val, y_val)], clf__verbose=True, clf__eval_metric='rmse')

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'] ['ActualElapsedTime', 'CRSArrTime', 'CRSDepTime', 'DayofMonth', 'DayofWeek', 'Dest', 'Distance', 'Diverted', 'FlightNum', 'Month', 'Origin', 'UniqueCarrier', 'Year']
expected f11, f4, f10, f6, f2, f7, f8, f9, f0, f3, f12, f1, f5 in input data
training data did not have the following fields: CRSDepTime, Dest, Origin, Distance, ActualElapsedTime, DayofWeek, FlightNum, UniqueCarrier, CRSArrTime, DayofMonth, Diverted, Year, Month

Training LightGBM classifier

In [None]:
clf = LGBMClassifier(num_leaves=255,
                    n_estimators=100,
                    min_child_weight=30,
                    learning_rate=0.1,
                    nthread=20,
                    subsample=0.80,
                    colsample_bytree=0.80,
                    seed=77)

In [None]:
%%time
lgbm_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='rmse')

### Evaluation


In [None]:
%%time
y_prob_xgb = xgb_clf_pipeline.predict_proba(X_test)

In [None]:
%%time
y_prob_lgbm = lgbm_clf_pipeline.predict_proba(X_test)

### Metrics

In [None]:
#https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py
def classification_metrics_binary(y_true, y_pred):
    m_acc = accuracy_score(y_true, y_pred)
    m_f1 = f1_score(y_true, y_pred)
    m_precision = precision_score(y_true, y_pred)
    m_recall = recall_score(y_true, y_pred)
    m_conf = confusion_matrix(y_true, y_pred)
    report = {'Accuracy':m_acc, 'Precision':m_precision, 'Recall':m_recall, 'F1':m_f1, 'Confusion Matrix':m_conf}
    return report

In [None]:
#https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py
def classification_metrics_binary_prob(y_true, y_prob):
    m_auc = roc_auc_score(y_true, y_prob)
    m_logloss = log_loss(y_true, y_prob)
    report = {'AUC':m_auc, 'Log loss':m_logloss}
    return report

In [None]:
def binarize_prediction(y, threshold=0.5):
    threshold_func = lambda x: 0 if x<=threshold else 1
    y_pred = y.map(threshold_func)
    return y_pred


In [None]:
y_pred_xgb = binarize_prediction(y_prob_xgb)
y_pred_lgbm = binarize_prediction(y_prob_lgbm)


In [None]:
report1_xgb = classification_metrics_binary(y_test, y_pred_xgb)
print(report1_xgb)
report2_xgb = classification_metrics_binary_prob(y_test, y_prob_xgb)
print(report2_xgb)

In [None]:
report1_xgb = classification_metrics_binary(y_test, y_pred_lgbm)
print(report1_xgb)
report2_xgb = classification_metrics_binary_prob(y_test, y_prob_lgbm)
print(report2_xgb)