# LightGBM model with Airline dataset. Experiment 02

Trying to show concept drift

In [39]:
import os,sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,
                             recall_score, mean_squared_error, mean_absolute_error, r2_score)
from scipy.stats import gmean

from libs.timer import Timer
from libs.loaders import load_fraud, load_iot, load_airline
from libs.conversion import convert_cols_categorical_to_numeric, convert_related_cols_categorical_to_numeric
from libs.conversion import _get_nominal_integer_dict, _convert_to_integer
print("System version: {}".format(sys.version))
from collections import OrderedDict

System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


### Load data

In [5]:
%%time
df_plane = load_airline()
print(df_plane.shape)


(115069017, 14)
CPU times: user 1min 39s, sys: 25.4 s, total: 2min 4s
Wall time: 4min 27s


In [6]:
df_plane.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,AA,190,247,SFO,ORD,1846,0,27
1,1987,10,1,4,5,114,EA,57,74,LAX,SFO,337,0,5
2,1987,10,1,4,5,35,HP,351,167,ICT,LAS,987,0,17
3,1987,10,1,4,5,40,DL,251,35,MCO,PBI,142,0,-2
4,1987,10,1,4,8,517,UA,500,208,LAS,ORD,1515,0,17


In [7]:
%%time
df_plane_numeric = convert_related_cols_categorical_to_numeric(df_plane, col_list=['Origin','Dest'])


CPU times: user 1min 55s, sys: 12.1 s, total: 2min 7s
Wall time: 2min 11s


In [8]:
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,AA,190,247,0,33,1846,0,27
1,1987,10,1,4,5,114,EA,57,74,1,0,337,0,5
2,1987,10,1,4,5,35,HP,351,167,2,4,987,0,17
3,1987,10,1,4,5,40,DL,251,35,3,41,142,0,-2
4,1987,10,1,4,8,517,UA,500,208,4,33,1515,0,17


In [9]:
%%time
df_plane_numeric = convert_cols_categorical_to_numeric(df_plane_numeric, col_list='UniqueCarrier')


CPU times: user 1min 3s, sys: 17.2 s, total: 1min 20s
Wall time: 1min 22s


In [10]:
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,0,190,247,0,33,1846,0,27
1,1987,10,1,4,5,114,1,57,74,1,0,337,0,5
2,1987,10,1,4,5,35,2,351,167,2,4,987,0,17
3,1987,10,1,4,5,40,3,251,35,3,41,142,0,-2
4,1987,10,1,4,8,517,4,500,208,4,33,1515,0,17


In [11]:
%%time
filter_func = lambda x: 1 if x > 0 else 0
df_plane_numeric['ArrDelayBinary'] = df_plane_numeric['ArrDelay'].map(filter_func)

CPU times: user 42.6 s, sys: 10.1 s, total: 52.7 s
Wall time: 53.6 s


In [12]:
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay,ArrDelayBinary
0,1987,10,1,4,1,556,0,190,247,0,33,1846,0,27,1
1,1987,10,1,4,5,114,1,57,74,1,0,337,0,5,1
2,1987,10,1,4,5,35,2,351,167,2,4,987,0,17,1
3,1987,10,1,4,5,40,3,251,35,3,41,142,0,-2,0
4,1987,10,1,4,8,517,4,500,208,4,33,1515,0,17,1


In [14]:
def get_data_list_yearly(df):
    data_yearly = [df[df['Year'] == year] for year in range(1987, 2008)]  
    return data_yearly

In [15]:
%%time
data_yearly_list = get_data_list_yearly(df_plane_numeric)

CPU times: user 13.1 s, sys: 9.78 s, total: 22.9 s
Wall time: 22.7 s


In [17]:
for subset in data_yearly_list:
    print(subset.shape)


(1287333, 15)
(5126498, 15)
(290827, 15)
(5110527, 15)
(4995005, 15)
(5020651, 15)
(4993587, 15)
(5078411, 15)
(5219140, 15)
(5209326, 15)
(5301999, 15)
(5227051, 15)
(5360018, 15)
(5481303, 15)
(5723673, 15)
(5197860, 15)
(6375689, 15)
(6987729, 15)
(6992838, 15)
(7003802, 15)
(7275288, 15)


In [18]:
total_subsets = len(data_yearly_list)
num_ini = 5

In [19]:
def generate_feables(df):
    X = df[df.columns.difference(['ArrDelay', 'ArrDelayBinary'])]
    y = df['ArrDelayBinary']
    return X,y

In [33]:
def generate_subset(data_yearly_list, num):
    subset = data_yearly_list[0]
    for i in range(1,num):
        subset = pd.concat([subset, data_yearly_list[i]])
    return subset

In [21]:
%%time
subset_base = generate_subset(data_yearly_list, num_ini)
print(subset_base.shape)

(16810190, 15)


In [23]:
clf = LGBMClassifier(num_leaves=255,
                    n_estimators=100,
                    min_child_weight=30,
                    learning_rate=0.1,
                    subsample=0.80,
                    colsample_bytree=0.80,
                    seed=42)

In [24]:
X_train, y_train = generate_feables(subset_base)

In [25]:
%%time
clf.fit(X_train, y_train)

CPU times: user 30min 47s, sys: 6min 41s, total: 37min 29s
Wall time: 1min 40s


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.8, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=30,
        min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,
        objective='binary', reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
        seed=42, sigmoid=1.0, silent=True, skip_drop=0.5, subsample=0.8,
        subsample_for_bin=50000, subsample_freq=1, uniform_drop=False,
        xgboost_dart_mode=False)

In [40]:
def predict_accuracy_future_years(clf, data_yearly_list, num_ini):
    total_subsets = len(data_yearly_list)
    accuracy_dict = OrderedDict()
    for y in range(num_ini, total_subsets):
        print("Predicting year {}".format(y))
        X_test, y_test = generate_feables(data_yearly_list[y])
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        year = data_yearly_list[y]['Year'][0]
        accuracy_dict[year] = acc
    return accuracy_dict

In [28]:
%%time
accuracy_dict = predict_accuracy_future_years(clf, data_yearly_list, num_ini)
print(accuracy_dict)

CPU times: user 3h 55min 31s, sys: 4h 56min 21s, total: 8h 51min 52s
Wall time: 23min 23s


In [29]:
accuracy_dict

{5: 0.75691698148307862,
 6: 0.75626698803885861,
 7: 0.74442970448827395,
 8: 0.73156497047406277,
 9: 0.72295149122938362,
 10: 0.72072118459471601,
 11: 0.70637133634242333,
 12: 0.70094036997636944,
 13: 0.68935178369084871,
 14: 0.67476234229313936,
 15: 0.68108683188850794,
 16: 0.68797458596239558,
 17: 0.68237305711197438,
 18: 0.67569633387760453,
 19: 0.66551624389153208,
 20: 0.65337922017657579}

In [31]:
new_init = 15

In [34]:
%%time
subset_retrain = generate_subset(data_yearly_list, new_init)
print(subset_retrain.shape)

(69425349, 15)
CPU times: user 10.6 s, sys: 25.1 s, total: 35.8 s
Wall time: 35.8 s


In [35]:
X_train, y_train = generate_feables(subset_retrain)

In [36]:
clf_retrain = LGBMClassifier(num_leaves=255,
                    n_estimators=100,
                    min_child_weight=30,
                    learning_rate=0.1,
                    subsample=0.80,
                    colsample_bytree=0.80,
                    seed=42)

In [37]:
%%time
clf_retrain.fit(X_train, y_train)

CPU times: user 2h 5min 57s, sys: 28min 39s, total: 2h 34min 36s
Wall time: 7min 7s


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.8, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=30,
        min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,
        objective='binary', reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
        seed=42, sigmoid=1.0, silent=True, skip_drop=0.5, subsample=0.8,
        subsample_for_bin=50000, subsample_freq=1, uniform_drop=False,
        xgboost_dart_mode=False)

In [38]:
%%time
accuracy_retrain = predict_accuracy_future_years(clf_retrain, data_yearly_list, new_init)
print(accuracy_retrain)

{16: 0.74897897309608419, 17: 0.72488715575546792, 18: 0.71915794417087886, 19: 0.70686721297946453, 20: 0.69523763182983267, 15: 0.75091210613598669}
CPU times: user 1h 46min 32s, sys: 2h 6min 58s, total: 3h 53min 31s
Wall time: 10min 13s
