#### Data import

In [1]:
import pandas as pd

train_df = pd.read_csv('../data/cleaned_train.csv')
test_df = pd.read_csv('../data/cleaned_test.csv')

#### Model import
I tried XGBoost and LGBM

In [2]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error

Checking dataframe

In [3]:
train_df

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,count,date,years,months,days,hours,is_night
0,1,0,0,1,9.84,81,0.0000,16,0,2011,1,1,0,1
1,1,0,0,1,9.02,80,0.0000,40,0,2011,1,1,1,1
2,1,0,0,1,9.02,80,0.0000,32,0,2011,1,1,2,1
3,1,0,0,1,9.84,75,0.0000,13,0,2011,1,1,3,1
4,1,0,0,1,9.84,75,0.0000,1,0,2011,1,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8602,3,0,1,1,34.44,44,16.9979,533,565,2012,7,19,19,0
8603,3,0,1,1,33.62,49,11.0014,505,565,2012,7,19,20,1
8604,3,0,1,3,27.06,89,16.9979,332,565,2012,7,19,21,1
8605,3,0,1,3,27.06,89,16.9979,68,565,2012,7,19,22,1


I tried oHe with season but the score was worst

#### Train, test split

In [4]:
y_train = train_df['count']
X_train = train_df.drop('count', axis=1)
y_test = test_df['count']
X_test = test_df.drop('count', axis=1)

In [5]:
# X_train = np.log1p(X_train)
# X_test = np.log1p(X_test)

In [6]:
X_train

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,date,years,months,days,hours,is_night
0,1,0,0,1,9.84,81,0.0000,0,2011,1,1,0,1
1,1,0,0,1,9.02,80,0.0000,0,2011,1,1,1,1
2,1,0,0,1,9.02,80,0.0000,0,2011,1,1,2,1
3,1,0,0,1,9.84,75,0.0000,0,2011,1,1,3,1
4,1,0,0,1,9.84,75,0.0000,0,2011,1,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8602,3,0,1,1,34.44,44,16.9979,565,2012,7,19,19,0
8603,3,0,1,1,33.62,49,11.0014,565,2012,7,19,20,1
8604,3,0,1,3,27.06,89,16.9979,565,2012,7,19,21,1
8605,3,0,1,3,27.06,89,16.9979,565,2012,7,19,22,1


In [7]:
xgb = XGBRegressor(learning_rate=0.05, n_estimators=300, objective='reg:squarederror')

#### XGBoost Scores

In [8]:
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

train_score = xgb.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9764233205039001 
Test:  
 - r2 score:  0.9065087276133993 
 - mae:  44.20991021452268 
 - mse:  4439.523226550007 
 - rmse:  66.62974730966647 
 - mape:  0.3029791753786224 



In [10]:
lgb = LGBMRegressor(learning_rate=0.05, n_estimators=300)

In [11]:
lgb.fit(X_train, y_train)
pred = lgb.predict(X_test)

train_score = lgb.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9728417157606181 
Test:  
 - r2 score:  0.9091820766203951 
 - mae:  44.468155799477834 
 - mse:  4312.5766709383315 
 - rmse:  65.67021144277162 
 - mape:  0.43381342375523774 



In [11]:
import pickle
pickle.dump(xgb, open('xgb_best.pkl', 'wb'))

In [12]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [13]:
space={
        # 'max_depth': hp.quniform('max_depth', 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'learning_rate': hp.quniform('learning_rate', 0.05, 0.3, 0.1),
        'n_estimators': 200
    }

In [14]:
def hyperparameter_tuning(space):
    model=XGBRegressor(n_estimators = space['n_estimators'], 
                        # max_depth = int(space['max_depth']), 
                        gamma = int(space['gamma']), 
                        learning_rate = space['learning_rate'],
                        random_state=0)
    
    
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)], 
              early_stopping_rounds=10,
              verbose=False)

    pred = model.predict(X_test)
    r2 = r2_score(y_test, pred)
    mape = mean_absolute_percentage_error(y_true=y_test, y_pred=pred)
    print ("SCORE:", r2)
    
    #change the metric if you like
    return {'loss': - r2, 'mape': mape, 'status': STATUS_OK, 'model': model}

In [15]:
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=200,
            trials=trials)

print (best)

SCORE:                                                 
0.8969468726437336                                     
SCORE:                                                                            
0.8873606634293271                                                                
SCORE:                                                                            
0.890978158640229                                                                 
SCORE:                                                                            
0.8909780106250536                                                                
SCORE:                                                                            
0.8969468726437336                                                                
SCORE:                                                                            
0.8909780054314724                                                                
SCORE:                                                    

In [None]:
xgb = XGBRegressor(gamma=4.028722020022547, learning_rate=0.2, n_estimators=300)

xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

train_score = xgb.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9930190553260337 
Test:  
 - r2 score:  0.8421528258859963 
 - mae:  60.189096222815934 
 - mse:  7495.525280976253 
 - rmse:  86.57670172151543 
 - mape:  0.5490150592944412 



In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

etr = ExtraTreesRegressor()
rf = RandomForestRegressor()

In [None]:
etr.fit(X_train, y_train)
pred = etr.predict(X_test)

train_score = etr.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9999999951847235 
Test:  
 - r2 score:  0.8887005329356544 
 - mae:  47.09854760860026 
 - mse:  5285.162523957876 
 - rmse:  72.69912326815142 
 - mape:  0.33890006174339016 



In [None]:
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

train_score = rf.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9924659890701303 
Test:  
 - r2 score:  0.8780272285705293 
 - mae:  50.7566213251426 
 - mse:  5791.9946744186045 
 - rmse:  76.10515537346078 
 - mape:  0.32509935843831644 



Pretty low learning rate, i tried different number a estimators and 200 was generally the best, alpha at 10 : L1 regularization term on weights,
Gamma at 5 : specifies the minimum loss reduction required to make a split.

In [None]:
# from sklearn.model_selection import learning_curve, StratifiedKFold, cross_val_score
# import numpy as np

# skf = StratifiedKFold(n_splits=5)
# print(f'Cross val score: ', np.mean(cross_val_score(xgb, X_train, y_train, cv=skf)))



In [None]:
# import matplotlib.pyplot as plt


# N, train_score, val_score = learning_curve(xgb, X_train, y_train, train_sizes = np.linspace(0.1,1,50), cv=skf, scoring='r2')
# plt.plot(N, val_score.mean(axis=1), label='validation')
# plt.plot(N, train_score.mean(axis=1), label='train')

# plt.xlabel('train_sizes')
# plt.title('Xgb learning curve')
# plt.legend()
# plt.show() 