In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import statsmodels.api as sm

from preprocessor import prepare_data, get_features 
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from lightgbm import LGBMRegressor

# Load the pre-split train/test sets and the list of feature names from the
# project-local `preprocessor` module.
# NOTE(review): the recorded output suggests prepare_data() prints the timestamp
# range of each split and that the split is chronological (test follows train)
# -- confirm in preprocessor.py.
X_train, X_test, y_train, y_test = prepare_data()
features = get_features()

Train data(range):
2019-04-11 21:49:48
2019-06-14 19:06:40
Test data(range):
2019-06-14 19:07:10
2019-08-22 19:05:05


In [67]:
# Collapse both target containers to flat 1-D Python lists, which is the shape
# the regressors' fit/score calls expect for y.
y_train, y_test = (
    list(np.array(target).ravel()) for target in (y_train, y_test)
)

## Gradient Boosting Model(s)

#### GradientBoostingRegressor

In [18]:
# Quantile levels for the prediction band: the lower/upper models estimate the
# 10th and 90th percentiles of the target (a nominal 80% band).
lower_alpha = 0.1
upper_alpha = 0.9

# Tree settings shared by all three models, so they differ only in their loss.
gbr_params = dict(n_estimators=200, max_depth=2)

lower_model = GradientBoostingRegressor(loss="quantile", alpha=lower_alpha, **gbr_params)
# Point estimate: rely on the default loss, which is least squares in every
# scikit-learn release. Spelling it as loss="ls" was deprecated in 1.0 and
# raises an error from 1.2 onwards, while the new name "squared_error" is not
# accepted by older releases.
mid_model = GradientBoostingRegressor(**gbr_params)
upper_model = GradientBoostingRegressor(loss="quantile", alpha=upper_alpha, **gbr_params)

In [19]:
# Train the three boosters (lower quantile, point estimate, upper quantile)
# on the training split, in that order.
for booster in (lower_model, mid_model, upper_model):
    booster.fit(X_train, y_train)

# Observed test targets go in column 0; each model's test-set prediction is
# appended next to them under its own column name.
predictions = pd.DataFrame(y_test)
for column, booster in (
    ("lower", lower_model),
    ("mid", mid_model),
    ("upper", upper_model),
):
    predictions[column] = booster.predict(X_test)

In [20]:
# Observed target (column 0) next to the lower / mid / upper predictions for each test row.
predictions

Unnamed: 0,0,lower,mid,upper
0,43.55,40.724993,41.240929,57.956430
1,60.30,41.374993,53.254432,60.248332
2,52.45,40.724993,56.316602,77.922834
3,41.20,41.374993,49.818499,60.300004
4,52.45,40.724993,56.656248,77.922834
...,...,...,...,...
12495,90.50,56.425357,78.886848,102.396529
12496,47.30,41.374993,41.951513,60.248332
12497,47.30,41.374993,39.761983,60.300004
12498,60.30,41.374993,41.264174,60.300004


In [23]:
# R^2 (coefficient of determination) of the point-estimate model on the test split.
mid_model.score(X_test, y_test)

0.8503742287773248

#### LightGBM Regressor

In [9]:
# Baseline LightGBM: 500 boosting rounds, every other hyper-parameter left at
# the library default. fit() returns the estimator, so it can be chained.
lgbm = LGBMRegressor(n_estimators=500).fit(X_train, y_train)

# R^2 on the test split.
lgbm.score(X_test, y_test)

0.8985835744395778

In [33]:
# Smaller LightGBM (150 boosting rounds) with a fixed seed. This rebinds `lgbm`,
# replacing the 500-round baseline fitted in the previous cell.
lgbm = LGBMRegressor(n_estimators=150, random_state=1)
# Disabled experiment, kept for reference: recursive feature elimination down to
# 20 features. (n_features_to_select must be passed by keyword in current scikit-learn.)
# rfe = RFE(lgbm, n_features_to_select=20)
# x_rfe_train = rfe.fit_transform(X_train, y_train)
# x_rfe_test = rfe.transform(X_test)
lgbm.fit(X_train, y_train)
# R^2 on the test split.
lgbm.score(X_test, y_test)

0.8965714902659896

In [34]:
# One row per feature, labelled by feature name, holding the importance the
# fitted LightGBM model assigned to it.
features_imp_df = pd.DataFrame(
    data=list(lgbm.feature_importances_),
    index=features,
    columns=['importance'],
)
features_imp_df

Unnamed: 0,importance
travel_mins,613
originpop,118
destinationpop,37
days_to_holiday,224
days_from_holiday,183
distance,127
month,151
date,195
hour,716
minute,198


In [35]:
# Importance of the feature in row position 4. The frame is indexed by feature
# name, so a bare `[4]` on the Series relies on pandas' positional fallback for
# integer keys, which is deprecated in pandas 2.x; `.iloc` makes the positional
# lookup explicit.
features_imp_df['importance'].iloc[4]

183

In [43]:
# Find the features the model never used (importance == 0): their positional
# column indices (zeros_ind) and their names (zeros_feat).
# The comparison runs on a plain NumPy array so positions are unambiguous: the
# frame is indexed by feature name, and integer `[]` lookups on such a Series
# rely on a positional fallback that is deprecated in pandas 2.x.
importance_values = features_imp_df['importance'].to_numpy()
zeros_ind = np.flatnonzero(importance_values == 0).tolist()
zeros_feat = [features_imp_df.index[pos] for pos in zeros_ind]

# Show the positions that will be dropped (this is what the recorded cell output lists).
zeros_ind


[13, 14, 18, 19, 38, 43, 44, 46, 47, 50, 51]

In [47]:
# Drop the zero-importance columns (by position) from both splits.
x_train_new, x_test_new = (
    np.delete(split, zeros_ind, axis=1) for split in (X_train, X_test)
)

# Keep the feature-name list aligned with the remaining columns.
features_new = [name for name in features if name not in zeros_feat]

In [49]:
# Same configuration as the 150-round model, refit on the pruned feature set.
# fit() returns the estimator, so construction and training are chained.
lgbm_new = LGBMRegressor(n_estimators=150, random_state=1).fit(x_train_new, y_train)

# R^2 on the pruned test split.
lgbm_new.score(x_test_new, y_test)

0.8965714902659896

In [50]:
# Importance table for the model trained on the pruned feature set, one row per
# remaining feature name.
features_imp_df_new = pd.DataFrame(
    data=list(lgbm_new.feature_importances_),
    index=features_new,
    columns=['importance'],
)
features_imp_df_new

Unnamed: 0,importance
travel_mins,613
originpop,118
destinationpop,37
days_to_holiday,224
days_from_holiday,183
distance,127
month,151
date,195
hour,716
minute,198


In [53]:
lgbm_rfe = LGBMRegressor(n_estimators=500, random_state=1)

# Recursive feature elimination with scikit-learn's defaults: with no
# n_features_to_select given, RFE keeps half of the input features.
rfe = RFE(estimator=lgbm_rfe)
rfe.fit(x_train_new, y_train)
x_rfe_train = rfe.transform(x_train_new)
x_rfe_test = rfe.transform(x_test_new)

# RFE trains its own clone of the estimator, so `lgbm_rfe` itself is still
# unfitted here and has to be trained on the reduced matrix before scoring.
lgbm_rfe.fit(x_rfe_train, y_train)
lgbm_rfe.score(x_rfe_test, y_test)

0.8839082627109873

In [68]:
# Lasso regression with the regularisation strength (alpha) chosen by LassoCV's
# built-in cross-validation on the training split.
reg = LassoCV()
reg.fit(X_train, y_train)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
# NOTE: the "best score" printed here is R^2 on the training data itself,
# not a cross-validated or held-out score.
print("Best score using built-in LassoCV: %f" %reg.score(X_train,y_train))
# Coefficients labelled by feature name; a coefficient of exactly zero means
# the Lasso penalty removed that feature from the model.
coef = pd.Series(reg.coef_, index = features)



Best alpha using built-in LassoCV: 0.038388
Best score using built-in LassoCV: 0.846730


travel_mins                         -5.210111
originpop                            0.000000
destinationpop                       0.036691
days_to_holiday                      0.375557
days_from_holiday                   -0.100838
distance                            16.518755
month                                0.000000
date                                 0.168319
hour                                 0.066405
minute                              -0.795344
is_overnight                        -0.944869
days_to_trip                        -2.105170
origin_BARCELONA                     0.769103
origin_MADRID                       -0.000000
origin_PONFERRADA                   -1.065732
origin_SEVILLA                      -2.289379
origin_VALENCIA                      0.000000
destination_BARCELONA                0.967656
destination_MADRID                   0.000000
destination_PONFERRADA              -0.923972
destination_SEVILLA                 -2.446279
destination_VALENCIA              