# XGBoost model with Airline dataset. Experiment 01

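XGBoost regressor with L2 + Airline dataset. The regressor is fit on the binary target `ArrDelayBinary` (1 when `ArrDelay > 0`, else 0), and its predictions are thresholded at 0.5 to compute classification metrics.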

In [1]:
import os,sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm.sklearn import LGBMRegressor
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,
                             recall_score, mean_squared_error, mean_absolute_error, r2_score)
from scipy.stats import gmean

from libs.timer import Timer
from libs.loaders import load_fraud, load_iot, load_airline
from libs.conversion import convert_cols_categorical_to_numeric, convert_related_cols_categorical_to_numeric
from libs.conversion import _get_nominal_integer_dict, _convert_to_integer

# Record the interpreter version alongside the results of this run.
print("System version: {}".format(sys.version))

System version: 3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 12:22:00) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]




### Load data

In [2]:
%%time
# Load the airline dataset with the project loader and report its
# (rows, columns) shape; %%time records how long the load takes.
df_plane = load_airline()
print(df_plane.shape)


MOUNT_POINT not found in environment. Defaulting to /fileshare
(115069017, 14)
CPU times: user 1min 32s, sys: 15.5 s, total: 1min 48s
Wall time: 4min 18s


In [3]:
# Preview the first rows of the raw frame (categorical columns are still strings).
df_plane.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,AA,190,247,SFO,ORD,1846,0,27
1,1987,10,1,4,5,114,EA,57,74,LAX,SFO,337,0,5
2,1987,10,1,4,5,35,HP,351,167,ICT,LAS,987,0,17
3,1987,10,1,4,5,40,DL,251,35,MCO,PBI,142,0,-2
4,1987,10,1,4,8,517,UA,500,208,LAS,ORD,1515,0,17


In [4]:
%%time
# Encode the airport columns as integers. Origin and Dest are converted
# together; judging by the preview that follows, the same airport code gets
# the same integer in both columns — TODO confirm against libs.conversion.
df_plane_numeric = convert_related_cols_categorical_to_numeric(df_plane, col_list=['Origin','Dest'])


CPU times: user 1min 41s, sys: 10.1 s, total: 1min 51s
Wall time: 1min 51s


In [5]:
# Check that Origin and Dest are now integer codes.
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,AA,190,247,0,33,1846,0,27
1,1987,10,1,4,5,114,EA,57,74,1,0,337,0,5
2,1987,10,1,4,5,35,HP,351,167,2,4,987,0,17
3,1987,10,1,4,5,40,DL,251,35,3,41,142,0,-2
4,1987,10,1,4,8,517,UA,500,208,4,33,1515,0,17


In [6]:
%%time
# Encode the carrier column as integers.
# NOTE(review): col_list is passed as a bare string here, whereas the
# Origin/Dest conversion passes a list — confirm the helper accepts both.
df_plane_numeric = convert_cols_categorical_to_numeric(df_plane_numeric, col_list='UniqueCarrier')


CPU times: user 53.3 s, sys: 7.52 s, total: 1min
Wall time: 1min


In [7]:
# NOTE(review): this inspects the raw df_plane, so the categorical columns
# still show as object. If the goal is to verify the conversion, this was
# presumably meant to be df_plane_numeric.dtypes — confirm.
df_plane.dtypes

Year                  int64
Month                 int64
DayofMonth            int64
DayofWeek             int64
CRSDepTime            int64
CRSArrTime            int64
UniqueCarrier        object
FlightNum             int64
ActualElapsedTime     int64
Origin               object
Dest                 object
Distance              int64
Diverted              int64
ArrDelay              int64
dtype: object

In [8]:
# Check that UniqueCarrier is now an integer code as well.
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,0,190,247,0,33,1846,0,27
1,1987,10,1,4,5,114,1,57,74,1,0,337,0,5
2,1987,10,1,4,5,35,2,351,167,2,4,987,0,17
3,1987,10,1,4,5,40,3,251,35,3,41,142,0,-2
4,1987,10,1,4,8,517,4,500,208,4,33,1515,0,17


In [9]:
%%time
filter_func = lambda x: 1 if x > 0 else 0
df_plane_numeric['ArrDelayBinary'] = df_plane_numeric['ArrDelay'].map(filter_func)

CPU times: user 35.7 s, sys: 2.76 s, total: 38.4 s
Wall time: 38.4 s


In [10]:
# Check the new ArrDelayBinary column against ArrDelay.
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay,ArrDelayBinary
0,1987,10,1,4,1,556,0,190,247,0,33,1846,0,27,1
1,1987,10,1,4,5,114,1,57,74,1,0,337,0,5,1
2,1987,10,1,4,5,35,2,351,167,2,4,987,0,17,1
3,1987,10,1,4,5,40,3,251,35,3,41,142,0,-2,0
4,1987,10,1,4,8,517,4,500,208,4,33,1515,0,17,1


In [11]:
%%time
#gets a random 80% of the entire set
X_train = df_plane_numeric.sample(frac=0.8, random_state=1)
print(X_train.shape)
#gets the left out portion of the dataset
X_test = df_plane_numeric.loc[~df_plane_numeric.index.isin(X_train.index)]
print(X_test.shape)

(92055214, 15)
(23013803, 15)
CPU times: user 1min 28s, sys: 35.6 s, total: 2min 4s
Wall time: 2min 4s


### Training

In [12]:
# Variables
# Intended number of folds for K-fold cross-validation.
KFOLD_SPLITS = 5

In [13]:
def train_xgboost(df, verbose=True, n_splits=KFOLD_SPLITS):
    """Train one XGBoost regressor per cross-validation fold.

    Every column of ``df`` except ``ArrDelay`` and ``ArrDelayBinary`` is used
    as a feature, and ``ArrDelayBinary`` is the regression target. Each fold's
    model is fit on the training part of the split and monitored (RMSE, with
    early stopping) on the held-out part.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; must contain ``ArrDelayBinary``. The remaining columns
        are presumably all numeric already — the function does no encoding.
    verbose : bool
        When true, print the data shape, the fold count and XGBoost's
        per-round evaluation log.
    n_splits : int
        Number of K-fold splits; defaults to the notebook-level
        ``KFOLD_SPLITS`` (5).

    Returns
    -------
    list
        The fitted ``xgb.XGBRegressor`` instances, one per fold.
    """
    if verbose:
        print("Training data of size {}".format(df.shape))

    # Index.difference returns the remaining column names sorted, so the
    # feature order does not depend on the column order of ``df``.
    feature_cols = df.columns.difference(['ArrDelay', 'ArrDelayBinary'])
    # ``.values`` replaces the deprecated ``.as_matrix()``.
    x = df[feature_cols].values
    y = df['ArrDelayBinary'].values

    # Plain (non-stratified) shuffled K-fold with a fixed seed.
    kfold = KFold(n_splits=n_splits, random_state=77, shuffle=True)
    clfs = []

    if verbose:
        print("Computing boosted tree using {} kfold cross validation".format(n_splits))
    for train_index, val_index in kfold.split(x, y):
        trn_x, val_x = x[train_index, :], x[val_index, :]
        trn_y, val_y = y[train_index], y[val_index]

        clf = xgb.XGBRegressor(max_depth=6,
                               n_estimators=300,
                               min_child_weight=30,
                               learning_rate=0.01,
                               nthread=20,
                               subsample=0.80,
                               colsample_bytree=0.80,
                               seed=77)
        # Stop adding trees once validation RMSE has not improved for 100 rounds.
        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=verbose,
                eval_metric='rmse', early_stopping_rounds=100)
        clfs.append(clf)

    return clfs

In [14]:
%%time
# Seeded 0.1% sample of the training frame, for quick trial runs of the
# training function.
X_train_small = X_train.sample(frac=0.001, random_state=1)
print(X_train_small.shape)

(92055, 15)
CPU times: user 3.88 s, sys: 276 ms, total: 4.16 s
Wall time: 4.17 s


In [15]:
%%time
# Full-data training run. For a quick trial, train on the small sample instead:
#   clfs = train_xgboost(X_train_small)
clfs = train_xgboost(X_train)

Training data of size (92055214, 15)
Computing LightGBM boosted tree using 5 kfold cross validation
[0]	validation_0-rmse:0.4995
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.499002
[2]	validation_0-rmse:0.498793
[3]	validation_0-rmse:0.498589
[4]	validation_0-rmse:0.498388
[5]	validation_0-rmse:0.498199
[6]	validation_0-rmse:0.498072
[7]	validation_0-rmse:0.497594
[8]	validation_0-rmse:0.497464
[9]	validation_0-rmse:0.497276
[10]	validation_0-rmse:0.497169
[11]	validation_0-rmse:0.49705
[12]	validation_0-rmse:0.496869
[13]	validation_0-rmse:0.496409
[14]	validation_0-rmse:0.495945
[15]	validation_0-rmse:0.495553
[16]	validation_0-rmse:0.49518
[17]	validation_0-rmse:0.494736
[18]	validation_0-rmse:0.494303
[19]	validation_0-rmse:0.494185
[20]	validation_0-rmse:0.493797
[21]	validation_0-rmse:0.493426
[22]	validation_0-rmse:0.493009
[23]	validation_0-rmse:0.492672
[24]	validation_0-rmse:0.49228
[25]	validation_0-rmse:0.492142
[26]	validation_0

### Evaluation

In [16]:
def compute_prediction(clfs, df, verbose=True):
    """Score ``df`` with every fold model and attach the combined prediction.

    Each model's output is clipped to [0.0001, 0.9999] and the per-row
    geometric mean across models is stored in a ``prediction`` column.

    The input frame is NOT modified: a new frame is returned. Writing the
    column onto ``df`` in place triggers pandas' SettingWithCopyWarning when
    ``df`` is a selection of another frame, and makes a second call pick up
    ``prediction`` as an extra feature. The price is one copy of ``df``.

    Parameters
    ----------
    clfs : list
        Fitted models exposing ``predict`` (as returned by ``train_xgboost``).
    df : pandas.DataFrame
        Rows to score. ``ArrDelay``, ``ArrDelayBinary`` and any existing
        ``prediction`` column are excluded from the features.
    verbose : bool
        When true, print the feature-matrix shape and the prediction time.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a ``prediction`` column added (or replaced).

    Raises
    ------
    ValueError
        If ``clfs`` is empty.
    """
    if len(clfs) == 0:
        raise ValueError("compute_prediction needs at least one trained model")

    # 'prediction' is excluded so that re-scoring an already-scored frame uses
    # the same feature set the models were trained on.
    feature_cols = df.columns.difference(['ArrDelay', 'ArrDelayBinary', 'prediction'])
    # ``.values`` replaces the deprecated ``.as_matrix()``.
    x = df[feature_cols].values
    if verbose:
        print(x.shape)

    with Timer() as t:
        # A regressor's output is not bounded to (0, 1); clipping keeps every
        # value strictly positive, which the geometric mean requires.
        preds = [np.clip(clf.predict(x), 0.0001, 0.9999) for clf in clfs]
        pred = gmean(np.array(preds), axis=0)
    if verbose:
        print("Prediction took %.03f sec.\n" % t.interval)

    return df.assign(prediction=pred)

In [17]:
# Seeded 0.1% sample of the hold-out frame, for quick trial runs of the
# prediction step.
X_test_small = X_test.sample(frac=0.001, random_state=1)
print(X_test_small.shape)

(23014, 15)


In [18]:
%%time
# Score the full hold-out set. For a quick trial, score the small sample instead:
#   X_test_pred = compute_prediction(clfs, X_test_small)
X_test_pred = compute_prediction(clfs, X_test)
X_test_pred.head(20)

(23013803, 13)
Prediction took 2707.423 sec.

CPU times: user 44min 53s, sys: 14.5 s, total: 45min 8s
Wall time: 2min 39s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Metrics

In [19]:
#https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py
def classification_metrics_binary(y_true, y_pred):
    """Summarise hard-label binary predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels (already thresholded, not probabilities).

    Returns
    -------
    dict
        Keys 'Accuracy', 'Precision', 'Recall', 'F1' and 'Confusion Matrix',
        each holding the result of the matching scikit-learn metric.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        'Confusion Matrix': confusion_matrix(y_true, y_pred),
    }

In [20]:
#https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py
def classification_metrics_binary_prob(y_true, y_prob):
    """Summarise probabilistic binary predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_prob : array-like
        Predicted scores/probabilities for the positive class.

    Returns
    -------
    dict
        Keys 'AUC' (``roc_auc_score``) and 'Log loss' (``log_loss``).
    """
    return {
        'AUC': roc_auc_score(y_true, y_prob),
        'Log loss': log_loss(y_true, y_prob),
    }

In [21]:
# Decision threshold applied to the predicted score.
THRES = 0.5


def threshold_func(x):
    """Map a score to a hard label: 0 when ``x <= THRES``, otherwise 1."""
    if x <= THRES:
        return 0
    return 1


In [22]:
# Arrays fed to the metric functions. ``.values`` replaces the deprecated
# ``.as_matrix()``.
y_true = X_test_pred['ArrDelayBinary'].values
y_prob = X_test_pred['prediction'].values
# Hard labels: 0 when the score is <= THRES, otherwise 1. This is the same
# rule as threshold_func, applied to the whole array at once instead of one
# Python call per row.
y_pred = np.where(y_prob <= THRES, 0, 1)

In [23]:
# Metrics on the thresholded (hard-label) predictions.
report1 = classification_metrics_binary(y_true, y_pred)
report1

{'Accuracy': 0.68739360461198007,
 'Confusion Matrix': array([[9490397, 2602902],
        [4591360, 6329144]]),
 'F1': 0.63761521819615108,
 'Precision': 0.70858837941497388,
 'Recall': 0.57956519222922309}

In [24]:
# Metrics on the raw (clipped) prediction scores.
report2 = classification_metrics_binary_prob(y_true, y_prob)
report2

{'AUC': 0.75291480971209213, 'Log loss': 0.60798314423672484}