#### Data import

In [134]:
import pandas as pd

train_df = pd.read_csv('../data/cleaned_train.csv')
test_df = pd.read_csv('../data/cleaned_test.csv')

#### Model import
I tried XGBoost and LGBM

In [135]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error

Checking dataframe

In [136]:
train_df

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,count,date,years,months,days,hours,is_night
0,1,0,0,1,9.84,81,0.0000,16,0,2011,1,1,0,1
1,1,0,0,1,9.02,80,0.0000,40,0,2011,1,1,1,1
2,1,0,0,1,9.02,80,0.0000,32,0,2011,1,1,2,1
3,1,0,0,1,9.84,75,0.0000,13,0,2011,1,1,3,1
4,1,0,0,1,9.84,75,0.0000,1,0,2011,1,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8602,3,0,1,1,34.44,44,16.9979,533,565,2012,7,19,19,0
8603,3,0,1,1,33.62,49,11.0014,505,565,2012,7,19,20,1
8604,3,0,1,3,27.06,89,16.9979,332,565,2012,7,19,21,1
8605,3,0,1,3,27.06,89,16.9979,68,565,2012,7,19,22,1


I tried oHe with season but the score was worst

#### Train, test split

In [137]:
y_train = train_df['count']
X_train = train_df.drop('count', axis=1)
y_test = test_df['count']
X_test = test_df.drop('count', axis=1)

In [138]:
X_train = np.log1p(X_train)
X_test = np.log1p(X_test)

In [139]:
X_train

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,date,years,months,days,hours,is_night
0,0.693147,0.0,0.000000,0.693147,2.383243,4.406719,0.000000,0.000000,7.606885,0.693147,0.693147,0.000000,0.693147
1,0.693147,0.0,0.000000,0.693147,2.304583,4.394449,0.000000,0.000000,7.606885,0.693147,0.693147,0.693147,0.693147
2,0.693147,0.0,0.000000,0.693147,2.304583,4.394449,0.000000,0.000000,7.606885,0.693147,0.693147,1.098612,0.693147
3,0.693147,0.0,0.000000,0.693147,2.383243,4.330733,0.000000,0.000000,7.606885,0.693147,0.693147,1.386294,0.693147
4,0.693147,0.0,0.000000,0.693147,2.383243,4.330733,0.000000,0.000000,7.606885,0.693147,0.693147,1.609438,0.693147
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8602,1.386294,0.0,0.693147,0.693147,3.567841,3.806662,2.890255,6.338594,7.607381,2.079442,2.995732,2.995732,0.000000
8603,1.386294,0.0,0.693147,0.693147,3.544432,3.912023,2.485023,6.338594,7.607381,2.079442,2.995732,3.044522,0.693147
8604,1.386294,0.0,0.693147,1.386294,3.334345,4.499810,2.890255,6.338594,7.607381,2.079442,2.995732,3.091042,0.693147
8605,1.386294,0.0,0.693147,1.386294,3.334345,4.499810,2.890255,6.338594,7.607381,2.079442,2.995732,3.135494,0.693147


In [159]:
xgb = XGBRegressor(learning_rate=0.05, n_estimators=300, objective='reg:squarederror')

#### XGBoost Scores

In [160]:
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

train_score = xgb.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9764233205039001 
Test:  
 - r2 score:  0.9065076203996825 
 - mae:  44.2149629877711 
 - mse:  4439.575803661079 
 - rmse:  66.6301418553276 
 - mape:  0.30298208059147014 



In [142]:
import pickle
pickle.dump(xgb, open('xgb_best.pkl', 'wb'))

In [143]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [173]:
space={
        # 'max_depth': hp.quniform('max_depth', 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'learning_rate': hp.quniform('learning_rate', 0.05, 0.3, 0.1),
        'n_estimators': 200
    }

In [174]:
def hyperparameter_tuning(space):
    model=XGBRegressor(n_estimators = space['n_estimators'], 
                        # max_depth = int(space['max_depth']), 
                        gamma = int(space['gamma']), 
                        learning_rate = space['learning_rate'],
                        random_state=0)
    
    
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)], 
              early_stopping_rounds=10,
              verbose=False)

    pred = model.predict(X_test)
    r2 = r2_score(y_test, pred)
    mape = mean_absolute_percentage_error(y_true=y_test, y_pred=pred)
    print ("SCORE:", r2)
    
    #change the metric if you like
    return {'loss': - r2, 'mape': mape, 'status': STATUS_OK, 'model': model}

In [175]:
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=200,
            trials=trials)

print (best)

SCORE:                                                 
0.8909547395739124                                     
SCORE:                                                                            
0.8969468726437336                                                                
SCORE:                                                                            
0.8969468726437336                                                                
SCORE:                                                                            
0.8969468726437336                                                                
SCORE:                                                                            
0.8873446517419076                                                                
SCORE:                                                                            
0.8969468726437336                                                                
SCORE:                                                    

In [176]:
xgb = XGBRegressor(gamma=4.028722020022547, learning_rate=0.2, n_estimators=300)

xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

train_score = xgb.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9930190553260337 
Test:  
 - r2 score:  0.8422582560101468 
 - mae:  60.16573528962446 
 - mse:  7490.518829860594 
 - rmse:  86.54778350634172 
 - mape:  0.5482105614476233 



In [148]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

etr = ExtraTreesRegressor()
rf = RandomForestRegressor()

In [149]:
etr.fit(X_train, y_train)
pred = etr.predict(X_test)

train_score = etr.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9999999963804518 
Test:  
 - r2 score:  0.8800257738790589 
 - mae:  48.71924528301887 
 - mse:  5697.091823168056 
 - rmse:  75.479082023883 
 - mape:  0.3079211279755814 



In [150]:
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

train_score = rf.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9924353924427036 
Test:  
 - r2 score:  0.8805556445870635 
 - mae:  50.33924089512944 
 - mse:  5671.930401623519 
 - rmse:  75.312219470837 
 - mape:  0.3282972795290122 



Pretty low learning rate, i tried different number a estimators and 200 was generally the best, alpha at 10 : L1 regularization term on weights,
Gamma at 5 : specifies the minimum loss reduction required to make a split.

In [151]:
from sklearn.model_selection import learning_curve, StratifiedKFold, cross_val_score
import numpy as np

skf = StratifiedKFold(n_splits=5)
print(f'Cross val score: ', np.mean(cross_val_score(xgb, X_train, y_train, cv=skf)))



Cross val score:  0.9260979273215296


In [152]:
# import matplotlib.pyplot as plt


# N, train_score, val_score = learning_curve(xgb, X_train, y_train, train_sizes = np.linspace(0.1,1,50), cv=skf, scoring='r2')
# plt.plot(N, val_score.mean(axis=1), label='validation')
# plt.plot(N, train_score.mean(axis=1), label='train')

# plt.xlabel('train_sizes')
# plt.title('Xgb learning curve')
# plt.legend()
# plt.show() 