In [1]:
import imp
import numpy as np
import pandas as pd
import xgboost as xgb
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.base import BaseEstimator
import xgboost as xgb
from lightgbm import LGBMRegressor
plt.rcParams['figure.figsize'] = [20, 20]

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
problem = imp.load_source('', 'problem.py')
X_df, y_array = problem.get_train_data()
X_df.head()

Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd
0,2012-06-19,ORD,DFW,12.875,9.812647
1,2012-09-10,LAS,DEN,14.285714,9.466734
2,2012-10-05,DEN,LAX,10.863636,9.035883
3,2011-10-09,ATL,ORD,11.48,7.990202
4,2012-02-21,DEN,SFO,11.45,9.517159


In [3]:
ext_data = pd.read_csv('submissions/starting_kit/external_data.csv')
ext_data.head()

Unnamed: 0,Departure,Arrival,Distance,DateOfDeparture,Max TemperatureC,Mean TemperatureC,Min TemperatureC,Dew PointC,MeanDew PointC,Min DewpointC,Max Humidity,Mean Humidity,Min Humidity,Max Sea Level PressurehPa,Mean Sea Level PressurehPa,Min Sea Level PressurehPa,Max VisibilityKm,Mean VisibilityKm,Min VisibilitykM,Max Wind SpeedKm/h,Mean Wind SpeedKm/h,Max Gust SpeedKm/h,Precipitationmm,CloudCover,Events,WindDirDegrees,Number_hab,Revenue,Oil_price,dep_encod,ar_encod
0,LAS,ORD,2434,2011-09-01,34,28,22,23,22,19,84,70,56,1015,1013,1011,16,15,11,26,9,37.0,0.0,1,,197,2712920,52273,88.93,10.79,11.26
1,LAS,ORD,2434,2011-09-02,33,29,24,22,21,19,79,64,49,1013,1012,1011,16,15,11,27,12,37.0,0.0,4,,215,2712920,52273,86.57,10.79,11.26
2,LAS,ORD,2434,2011-09-03,32,26,21,22,20,18,93,70,46,1013,1012,1011,16,15,8,32,10,39.0,0.51,6,Rain-Thunderstorm,270,2712920,52273,93.79,10.79,11.26
3,LAS,ORD,2434,2011-09-04,23,19,14,19,13,9,93,70,47,1017,1013,1010,16,16,16,35,19,45.0,0.0,5,,312,2712920,52273,93.79,10.79,11.26
4,LAS,ORD,2434,2011-09-05,18,15,11,10,7,6,77,64,51,1021,1020,1017,16,16,16,42,24,56.0,0.0,3,,5,2712920,52273,83.52,10.79,11.26


In [4]:
X_encoded = X_df
external_data = ext_data[['DateOfDeparture', 'Departure', 'Arrival', 'Distance',
                                'dep_encod', 'ar_encod', 'Revenue']]
X_encoded = pd.merge(
            X_encoded, external_data, how='left',
            left_on=['DateOfDeparture', 'Arrival', 'Departure'],
            right_on=['DateOfDeparture', 'Arrival', 'Departure'],
            sort=False)

X_encoded.head()

Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd,Distance,dep_encod,ar_encod,Revenue
0,2012-06-19,ORD,DFW,12.875,9.812647,1292,11.29,11.09,45616
1,2012-09-10,LAS,DEN,14.285714,9.466734,1010,10.79,10.68,52262
2,2012-10-05,DEN,LAX,10.863636,9.035883,1387,10.68,11.43,57739
3,2011-10-09,ATL,ORD,11.48,7.990202,977,10.99,11.26,52273
4,2012-02-21,DEN,SFO,11.45,9.517159,1554,10.68,11.23,57739


In [5]:
X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Departure']))
X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Arrival'], prefix='a'))
X_encoded = X_encoded.drop('Departure', axis=1)
X_encoded = X_encoded.drop('Arrival', axis=1)

X_encoded.loc[:, 'ATL': 'SFO'] = X_encoded.loc[:, 'ATL': 'SFO'].mul(X_encoded.loc[:, 'dep_encod'], axis='rows')
arr = X_encoded.loc[:, 'a_ATL': 'a_SFO'].mul(- 1 * X_encoded.loc[:, 'ar_encod'], axis='rows')

arr.columns = X_encoded.loc[:, 'ATL': 'SFO'].columns
X_encoded.loc[:, 'ATL': 'SFO'] = X_encoded.loc[:, 'ATL': 'SFO'].add(arr, axis=1)

X_encoded.drop(['dep_encod', 'ar_encod'], axis=1, inplace=True)
X_encoded.drop(X_encoded.loc[:, 'a_ATL': 'a_SFO'].columns, axis=1, inplace=True)
X_encoded.head()

Unnamed: 0,DateOfDeparture,WeeksToDeparture,std_wtd,Distance,Revenue,ATL,BOS,CLT,DEN,DFW,DTW,EWR,IAH,JFK,LAS,LAX,LGA,MCO,MIA,MSP,ORD,PHL,PHX,SEA,SFO
0,2012-06-19,12.875,9.812647,1292,45616,0.0,0.0,0.0,0.0,-11.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.29,0.0,0.0,0.0,0.0
1,2012-09-10,14.285714,9.466734,1010,52262,0.0,0.0,0.0,-10.68,0.0,0.0,0.0,0.0,0.0,10.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2012-10-05,10.863636,9.035883,1387,57739,0.0,0.0,0.0,10.68,0.0,0.0,0.0,0.0,0.0,0.0,-11.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011-10-09,11.48,7.990202,977,52273,10.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.26,0.0,0.0,0.0,0.0
4,2012-02-21,11.45,9.517159,1554,57739,0.0,0.0,0.0,10.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.23


In [6]:
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

In [7]:
X_encoded['DateOfDeparture'] = pd.to_datetime(X_encoded['DateOfDeparture'])
X_encoded['year'] = X_encoded['DateOfDeparture'].dt.year
X_encoded['month'] = X_encoded['DateOfDeparture'].dt.month
X_encoded['day'] = X_encoded['DateOfDeparture'].dt.day
X_encoded['weekday'] = X_encoded['DateOfDeparture'].dt.weekday
X_encoded['week'] = X_encoded['DateOfDeparture'].dt.week
X_encoded['n_days'] = X_encoded['DateOfDeparture'].apply(lambda date: (date - pd.to_datetime("2011-09-01")).days)
X_encoded.head()

Unnamed: 0,DateOfDeparture,WeeksToDeparture,std_wtd,Distance,Revenue,ATL,BOS,CLT,DEN,DFW,DTW,EWR,IAH,JFK,LAS,LAX,LGA,MCO,MIA,MSP,ORD,PHL,PHX,SEA,SFO,year,month,day,weekday,week,n_days
0,2012-06-19,12.875,9.812647,1292,45616,0.0,0.0,0.0,0.0,-11.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.29,0.0,0.0,0.0,0.0,2012,6,19,1,25,292
1,2012-09-10,14.285714,9.466734,1010,52262,0.0,0.0,0.0,-10.68,0.0,0.0,0.0,0.0,0.0,10.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2012,9,10,0,37,375
2,2012-10-05,10.863636,9.035883,1387,57739,0.0,0.0,0.0,10.68,0.0,0.0,0.0,0.0,0.0,0.0,-11.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2012,10,5,4,40,400
3,2011-10-09,11.48,7.990202,977,52273,10.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.26,0.0,0.0,0.0,0.0,2011,10,9,6,40,38
4,2012-02-21,11.45,9.517159,1554,57739,0.0,0.0,0.0,10.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.23,2012,2,21,1,8,173


In [8]:
X_encoded = encode(X_encoded, 'month', 12)
X_encoded = encode(X_encoded, 'week', 52)
X_encoded = encode(X_encoded, 'day', 365)
X_encoded = encode(X_encoded, 'weekday', 6)
X_encoded = X_encoded.drop(['DateOfDeparture', 'month', 'day', 'weekday', 'week', 'std_wtd'], axis=1)
X_encoded.head()

Unnamed: 0,WeeksToDeparture,Distance,Revenue,ATL,BOS,CLT,DEN,DFW,DTW,EWR,IAH,JFK,LAS,LAX,LGA,MCO,MIA,MSP,ORD,PHL,PHX,SEA,SFO,year,n_days,month_sin,month_cos,week_sin,week_cos,day_sin,day_cos,weekday_sin,weekday_cos
0,12.875,1292,45616,0.0,0.0,0.0,0.0,-11.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.29,0.0,0.0,0.0,0.0,2012,292,1.224647e-16,-1.0,0.120537,-0.992709,0.32127,0.946988,0.8660254,0.5
1,14.285714,1010,52262,0.0,0.0,0.0,-10.68,0.0,0.0,0.0,0.0,0.0,10.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2012,375,-1.0,-1.83697e-16,-0.970942,-0.239316,0.171293,0.98522,0.0,1.0
2,10.863636,1387,57739,0.0,0.0,0.0,10.68,0.0,0.0,0.0,0.0,0.0,0.0,-11.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2012,400,-0.8660254,0.5,-0.992709,0.120537,0.085965,0.996298,-0.8660254,-0.5
3,11.48,977,52273,10.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.26,0.0,0.0,0.0,0.0,2011,38,-0.8660254,0.5,-0.992709,0.120537,0.154309,0.988023,-2.449294e-16,1.0
4,11.45,1554,57739,0.0,0.0,0.0,10.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.23,2012,173,0.8660254,0.5,0.822984,0.568065,0.353676,0.935368,0.8660254,0.5


In [9]:
X_encoded.loc[X_encoded['WeeksToDeparture'] <= 9.524, 'WeeksToDeparture'] = 0
X_encoded.loc[(X_encoded['WeeksToDeparture'] > 9.524) & (X_encoded['WeeksToDeparture'] <= 11.3), 'WeeksToDeparture'] = 1
X_encoded.loc[(X_encoded['WeeksToDeparture'] > 11.3) & (X_encoded['WeeksToDeparture'] <= 13.24), 'WeeksToDeparture'] = 2
X_encoded.loc[ X_encoded['WeeksToDeparture'] > 13.24, 'WeeksToDeparture'] = 3
X_encoded['WeeksToDeparture'] = X_encoded['WeeksToDeparture'].astype(int)
X_encoded.head()

Unnamed: 0,WeeksToDeparture,Distance,Revenue,ATL,BOS,CLT,DEN,DFW,DTW,EWR,IAH,JFK,LAS,LAX,LGA,MCO,MIA,MSP,ORD,PHL,PHX,SEA,SFO,year,n_days,month_sin,month_cos,week_sin,week_cos,day_sin,day_cos,weekday_sin,weekday_cos
0,2,1292,45616,0.0,0.0,0.0,0.0,-11.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.29,0.0,0.0,0.0,0.0,2012,292,1.224647e-16,-1.0,0.120537,-0.992709,0.32127,0.946988,0.8660254,0.5
1,3,1010,52262,0.0,0.0,0.0,-10.68,0.0,0.0,0.0,0.0,0.0,10.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2012,375,-1.0,-1.83697e-16,-0.970942,-0.239316,0.171293,0.98522,0.0,1.0
2,1,1387,57739,0.0,0.0,0.0,10.68,0.0,0.0,0.0,0.0,0.0,0.0,-11.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2012,400,-0.8660254,0.5,-0.992709,0.120537,0.085965,0.996298,-0.8660254,-0.5
3,2,977,52273,10.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.26,0.0,0.0,0.0,0.0,2011,38,-0.8660254,0.5,-0.992709,0.120537,0.154309,0.988023,-2.449294e-16,1.0
4,2,1554,57739,0.0,0.0,0.0,10.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.23,2012,173,0.8660254,0.5,0.822984,0.568065,0.353676,0.935368,0.8660254,0.5


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_array, test_size=0.2, random_state=0)

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LarsCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.linear_model import LassoLars, LassoLarsCV, Ridge, RidgeCV

from sklearn.model_selection import cross_val_score, KFold, GridSearchCV

In [22]:
models = []
models.append(("RdmF", RandomForestRegressor(n_estimators=50, max_depth=30, max_features=5) ))
models.append(("RidCV", Ridge(alpha= 0.001) ))
models.append(("LGBM", LGBMRegressor(num_leaves=30,
                    boosting_type='gbdt',
                    objective='regression',
                    learning_rate=0.1,
                    max_depth=-1,
                    n_estimators=400,
                    bagging_fraction=0.52,
                    feature_fraction=0.63,
                    max_bin=255) ))
models.append(("LasCV", Lasso(alpha=1e-4, normalize=True, max_iter=1e5) ))
models.append(("ElNCV", ElasticNetCV() ))
models.append(("LaLaCV", LassoLarsCV() ))
models.append(("XGB", xgb.XGBRegressor(base_score=0.5, colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.16, max_delta_step=0,
       max_depth=7, min_child_weight=5, missing=None, n_estimators=2450,
       n_jobs=1, nthread=4, objective='reg:squarederror', random_state=0,
       reg_alpha=0.001, reg_lambda=0, seed=None,
       silent=True, subsample=.9) ))

In [23]:
kfold = KFold(n_splits=10)
 
def getCVResult(models, X_learning, Y_learning):
  for name, model in models:
     cv_results = cross_val_score(model, X_learning, Y_learning, scoring='neg_mean_squared_error', cv=kfold)
     rmsd_scores = np.sqrt(-cv_results)
     print("\n[%s] Mean: %.8f Std. Dev.: %8f" %(name, rmsd_scores.mean(), rmsd_scores.std()))
 
getCVResult(models, X_encoded, y_array)


[RdmF] Mean: 0.49317011 Std. Dev.: 0.025922

[RidCV] Mean: 0.90317637 Std. Dev.: 0.023295

[LGBM] Mean: 0.33999221 Std. Dev.: 0.023688

[LasCV] Mean: 0.91087210 Std. Dev.: 0.024711





[ElNCV] Mean: 0.98022149 Std. Dev.: 0.033017





[LaLaCV] Mean: 0.90331161 Std. Dev.: 0.023232


KeyboardInterrupt: 