# LightGBM model with Airline dataset. Experiment 01

LightGBM regressor with L2 + Airline dataset

In [34]:
import os,sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm.sklearn import LGBMRegressor
import xgboost as xgb
from sklearn.model_selection import KFold
from scipy.stats import gmean

from libs.timer import Timer
from libs.loaders import load_fraud, load_iot, load_airline
from libs.conversion import convert_cols_categorical_to_numeric, convert_related_cols_categorical_to_numeric
from libs.conversion import _get_nominal_integer_dict, _convert_to_integer

print("System version: {}".format(sys.version))

System version: 3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 12:22:00) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


### Load data

In [2]:
%%time
df_plane = load_airline()
print(df_plane.shape)


MOUNT_POINT not found in environment. Defaulting to /fileshare
(115069017, 14)
CPU times: user 1min 40s, sys: 14.7 s, total: 1min 55s
Wall time: 3min 35s


In [3]:
df_plane.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,AA,190,247,SFO,ORD,1846,0,27
1,1987,10,1,4,5,114,EA,57,74,LAX,SFO,337,0,5
2,1987,10,1,4,5,35,HP,351,167,ICT,LAS,987,0,17
3,1987,10,1,4,5,40,DL,251,35,MCO,PBI,142,0,-2
4,1987,10,1,4,8,517,UA,500,208,LAS,ORD,1515,0,17


In [4]:
%%time
df_plane_numeric = convert_related_cols_categorical_to_numeric(df_plane, col_list=['Origin','Dest'])


CPU times: user 2min 8s, sys: 7.82 s, total: 2min 16s
Wall time: 2min 13s


In [5]:
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,AA,190,247,0,33,1846,0,27
1,1987,10,1,4,5,114,EA,57,74,1,0,337,0,5
2,1987,10,1,4,5,35,HP,351,167,2,4,987,0,17
3,1987,10,1,4,5,40,DL,251,35,3,41,142,0,-2
4,1987,10,1,4,8,517,UA,500,208,4,33,1515,0,17


In [6]:
%%time
df_plane_numeric = convert_cols_categorical_to_numeric(df_plane_numeric, col_list='UniqueCarrier')


CPU times: user 1min 8s, sys: 7.12 s, total: 1min 15s
Wall time: 1min 13s


In [7]:
df_plane_numeric.head()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay
0,1987,10,1,4,1,556,0,190,247,0,33,1846,0,27
1,1987,10,1,4,5,114,1,57,74,1,0,337,0,5
2,1987,10,1,4,5,35,2,351,167,2,4,987,0,17
3,1987,10,1,4,5,40,3,251,35,3,41,142,0,-2
4,1987,10,1,4,8,517,4,500,208,4,33,1515,0,17


In [8]:
#gets a random 80% of the entire set
X_train = df_plane_numeric.sample(frac=0.8, random_state=1)
print(X_train.shape)
#gets the left out portion of the dataset
X_test = df_plane_numeric.loc[~df_plane_numeric.index.isin(X_train.index)]
print(X_test.shape)

(92055214, 14)
(23013803, 14)


### Training

In [9]:
# Variables
KFOLD_SPLITS = 5

In [18]:
def train_lightgbm(df, verbose=True):
    if verbose: print("Training data of size {}".format(df.shape))
    x = df.ix[:, df.columns != 'ArrDelay'].as_matrix()
    y = df['ArrDelay'].as_matrix()
    
    n_splits = 5
    skf = KFold(n_splits=n_splits, random_state=77, shuffle=True)
    clfs = []
    
    if verbose: print("Computing LightGBM boosted tree using {} kfold cross validation".format(n_splits))
    for train_index, test_index in skf.split(x, y):
        trn_x, val_x = x[train_index,:], x[test_index,:]
        trn_y, val_y = y[train_index], y[test_index]

        clf = LGBMRegressor(max_depth=6,
                            num_leaves=21,
                            n_estimators=300,
                            min_child_weight=30,
                            learning_rate=0.01,
                            nthread=20,
                            boosting_type='gbdt',
                            subsample=0.80,
                            colsample_bytree=0.80,
                            seed=77)
        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=verbose, eval_metric='l2', early_stopping_rounds=300)
        clfs.append(clf)
        
    return clfs

In [16]:
X_train_small = X_train.sample(frac=0.001, random_state=1)
print(X_train_small.shape)

(92055, 14)


In [32]:
%%time
clfs = train_lightgbm(X_train)

Training data of size (92055214, 14)
Computing LightGBM boosted tree using 5 kfold cross validation




[1]	valid_0's l2: 31.7265
Train until valid scores didn't improve in 300 rounds.
[2]	valid_0's l2: 31.7051
[3]	valid_0's l2: 31.6665
[4]	valid_0's l2: 31.6461
[5]	valid_0's l2: 31.6083
[6]	valid_0's l2: 31.5714
[7]	valid_0's l2: 31.5361
[8]	valid_0's l2: 31.5145
[9]	valid_0's l2: 31.4956
[10]	valid_0's l2: 31.4596
[11]	valid_0's l2: 31.4416
[12]	valid_0's l2: 31.4067
[13]	valid_0's l2: 31.3721
[14]	valid_0's l2: 31.3352
[15]	valid_0's l2: 31.3025
[16]	valid_0's l2: 31.271
[17]	valid_0's l2: 31.2414
[18]	valid_0's l2: 31.2112
[19]	valid_0's l2: 31.1836
[20]	valid_0's l2: 31.1529
[21]	valid_0's l2: 31.1375
[22]	valid_0's l2: 31.1086
[23]	valid_0's l2: 31.08
[24]	valid_0's l2: 31.0637
[25]	valid_0's l2: 31.0358
[26]	valid_0's l2: 31.0078
[27]	valid_0's l2: 30.9936
[28]	valid_0's l2: 30.9683
[29]	valid_0's l2: 30.9548
[30]	valid_0's l2: 30.9311
[31]	valid_0's l2: 30.9002
[32]	valid_0's l2: 30.875
[33]	valid_0's l2: 30.85
[34]	valid_0's l2: 30.8357
[35]	valid_0's l2: 30.8139
[36]	valid_0's 

### Evaluation

In [29]:
def compute_prediction(clfs, df, verbose=True):    
    x = df.ix[:, df.columns != 'ArrDelay'].as_matrix() 
            
    preds = []
    with Timer() as t:
        for clf in clfs:
            preds.append(clf.predict(x))
        pred = gmean(np.array(preds), axis=0)
    if verbose: print("Prediction took %.03f sec.\n" % t.interval)        
    df['prediction'] = pred
    return df

In [27]:
X_test_small = X_test.sample(frac=0.001, random_state=1)
print(X_test_small.shape)

(23014, 14)


In [33]:
X_test_pred = compute_prediction(clfs, X_test)
X_test_pred.head(50)

  log_a = np.log(a)


Prediction took 2858.137 sec.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Year,Month,DayofMonth,DayofWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,Origin,Dest,Distance,Diverted,ArrDelay,prediction
15,1987,10,1,4,15,47,4,3,32,5,21,75,0,-8,2.250709
22,1987,10,1,4,25,227,0,2199,98,0,8,679,0,-24,2.026823
26,1987,10,1,4,30,120,2,721,56,4,22,256,0,24,2.078405
29,1987,10,1,4,30,526,6,420,181,4,97,1372,0,5,1.738886
31,1987,10,1,4,30,602,8,810,205,16,31,1605,0,-7,1.578745
32,1987,10,1,4,30,750,2,326,256,4,60,2106,0,4,3.170307
33,1987,10,1,4,31,601,0,110,212,7,10,1616,0,0,1.468442
41,1987,10,1,4,38,8,1,436,34,18,42,111,0,4,2.224849
42,1987,10,1,4,39,106,1,291,34,19,141,97,0,7,2.149388
46,1987,10,1,4,45,12,3,886,42,18,42,111,0,20,2.2689
