#### Data import

In [150]:
import pandas as pd
import numpy as np
train_df = pd.read_csv('../data/cleaned_train.csv')
test_df = pd.read_csv('../data/cleaned_test.csv')

#### Model import
I tried XGBoost and LGBM

In [151]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error

Checking dataframe

In [152]:
train_df

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,count,date,years,months,days,hours,is_night
0,1,0,0,1,9.84,81,0.0000,16,0,2011,1,1,0,0
1,1,0,0,1,9.02,80,0.0000,40,0,2011,1,1,1,0
2,1,0,0,1,9.02,80,0.0000,32,0,2011,1,1,2,0
3,1,0,0,1,9.84,75,0.0000,13,0,2011,1,1,3,0
4,1,0,0,1,9.84,75,0.0000,1,0,2011,1,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8602,3,0,1,1,34.44,44,16.9979,533,565,2012,7,19,19,1
8603,3,0,1,1,33.62,49,11.0014,505,565,2012,7,19,20,1
8604,3,0,1,3,27.06,89,16.9979,332,565,2012,7,19,21,1
8605,3,0,1,3,27.06,89,16.9979,68,565,2012,7,19,22,1


#### Train, test split

In [153]:
y_train = train_df['count']
X_train = train_df.drop('count', axis=1)
y_test = test_df['count']
X_test = test_df.drop('count', axis=1)

In [154]:
X_train['windspeed'] = np.log1p(X_train['windspeed'])
X_test['windspeed'] = np.log1p(X_test['windspeed'])

In [155]:
from sklearn.preprocessing import QuantileTransformer

# X_train['wind_class'] = pd.qcut(X_train['windspeed'], 5, labels=False)
# X_test['wind_class'] = pd.qcut(X_test['windspeed'], 5, labels=False)
# X_train['humid_class'] = pd.qcut(X_train['humidity'], 5, labels=False)
# X_test['humid_class'] = pd.qcut(X_test['humidity'], 5, labels=False)

qt = QuantileTransformer(output_distribution='normal', random_state=0)
X_train['wind_class'] = qt.fit_transform(np.array(X_train['windspeed']).reshape(-1, 1))
X_test['wind_class'] = qt.fit_transform(np.array(X_test['windspeed']).reshape(-1, 1))
X_train['humid_class'] = qt.fit_transform(np.array(X_train['humidity']).reshape(-1, 1))
X_test['humid_class'] = qt.fit_transform(np.array(X_test['humidity']).reshape(-1, 1))

In [156]:
X_train

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,date,years,months,days,hours,is_night,wind_class,humid_class
0,1,0,0,1,9.84,81,0.000000,0,2011,1,1,0,0,-5.199338,0.851675
1,1,0,0,1,9.02,80,0.000000,0,2011,1,1,1,0,-5.199338,0.812654
2,1,0,0,1,9.02,80,0.000000,0,2011,1,1,2,0,-5.199338,0.812654
3,1,0,0,1,9.84,75,0.000000,0,2011,1,1,3,0,-5.199338,0.583496
4,1,0,0,1,9.84,75,0.000000,0,2011,1,1,4,0,-5.199338,0.583496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8602,3,0,1,1,34.44,44,2.890255,565,2012,7,19,19,1,0.539431,-0.754664
8603,3,0,1,1,33.62,49,2.485023,565,2012,7,19,20,1,-0.156199,-0.506348
8604,3,0,1,3,27.06,89,2.890255,565,2012,7,19,21,1,0.539431,1.467860
8605,3,0,1,3,27.06,89,2.890255,565,2012,7,19,22,1,0.539431,1.467860


In [157]:
xgb = XGBRegressor(learning_rate=0.05, n_estimators=200, objective='reg:squarederror')

In [158]:
# from sklearn.preprocessing import MinMaxScaler, SplineTransformer
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# all_col = X_train.columns

# # st = SplineTransformer(degree=2, knots=3)
# mms = MinMaxScaler()

# preprocessor = ColumnTransformer([('mms', mms, all_col)])
# pipe = Pipeline(steps=[('pr', preprocessor), ('xgb', xgb)])

#### XGBoost Scores

In [159]:
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

train_score = xgb.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9710249984390836 
Test:  
 - r2 score:  0.904147149383726 
 - mae:  44.75663529848414 
 - mse:  4551.665046147832 
 - rmse:  67.4660288304257 
 - mape:  0.3355524893141537 



In [160]:
lgb = LGBMRegressor()

In [161]:
lgb.fit(X_train, y_train)
pred = lgb.predict(X_test)

train_score = lgb.score(X_train, y_train)
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred, squared=True)
rmse = mean_squared_error(y_test, pred, squared=False)
mape = mean_absolute_percentage_error(y_test, pred)

print(f'Train r2 score: ', train_score, '\n'
      'Test: ', '\n'
      ' - r2 score: ',  r2, '\n'
      ' - mae: ',  mae, '\n'
      ' - mse: ',  mse, '\n'
      ' - rmse: ',  rmse, '\n'
      ' - mape: ',  mape, '\n'
      )

Train r2 score:  0.9665800122106316 
Test:  
 - r2 score:  0.9163446488376521 
 - mae:  43.56281006431541 
 - mse:  3972.4550220547535 
 - rmse:  63.02741484508748 
 - mape:  0.5562470357425673 



In [162]:
import pickle
pickle.dump(xgb, open('xgb_best.pkl', 'wb'))
pickle.dump(lgb, open('lgb_best.pkl', 'wb'))

In [163]:
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [164]:
# space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
#         'gamma': hp.uniform ('gamma', 1,9),
#         'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
#         'reg_lambda' : hp.uniform('reg_lambda', 0,1),
#         'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
#         'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
#         'n_estimators': 180,
#         'seed': 0
#     }

In [165]:
# def hyperparameter_tuning(space):
#     model=XGBRegressor(n_estimators = space['n_estimators'], 
#                         # max_depth = int(space['max_depth']), 
#                         gamma = int(space['gamma']), 
#                         learning_rate = space['learning_rate'],
#                         random_state=0)
    
    
#     model.fit(X_train, y_train,
#               eval_set=[(X_train, y_train), (X_test, y_test)], 
#               early_stopping_rounds=10,
#               verbose=False)

#     pred = model.predict(X_test)
#     r2 = r2_score(y_test, pred)
#     mape = mean_absolute_percentage_error(y_true=y_test, y_pred=pred)
#     print ("SCORE:", r2)
    
#     #change the metric if you like
#     return {'loss': - r2, 'mape': mape, 'status': STATUS_OK, 'model': model}

In [166]:
# trials = Trials()
# best = fmin(fn=hyperparameter_tuning,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=200,
#             trials=trials)

# print (best)

In [167]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

etr = ExtraTreesRegressor()
rf = RandomForestRegressor()

In [168]:
# etr.fit(X_train, y_train)
# pred = etr.predict(X_test)

# train_score = etr.score(X_train, y_train)
# r2 = r2_score(y_test, pred)
# mae = mean_absolute_error(y_test, pred)
# mse = mean_squared_error(y_test, pred, squared=True)
# rmse = mean_squared_error(y_test, pred, squared=False)
# mape = mean_absolute_percentage_error(y_test, pred)

# print(f'Train r2 score: ', train_score, '\n'
#       'Test: ', '\n'
#       ' - r2 score: ',  r2, '\n'
#       ' - mae: ',  mae, '\n'
#       ' - mse: ',  mse, '\n'
#       ' - rmse: ',  rmse, '\n'
#       ' - mape: ',  mape, '\n'
#       )

In [169]:
# rf.fit(X_train, y_train)
# pred = rf.predict(X_test)

# train_score = rf.score(X_train, y_train)
# r2 = r2_score(y_test, pred)
# mae = mean_absolute_error(y_test, pred)
# mse = mean_squared_error(y_test, pred, squared=True)
# rmse = mean_squared_error(y_test, pred, squared=False)
# mape = mean_absolute_percentage_error(y_test, pred)

# print(f'Train r2 score: ', train_score, '\n'
#       'Test: ', '\n'
#       ' - r2 score: ',  r2, '\n'
#       ' - mae: ',  mae, '\n'
#       ' - mse: ',  mse, '\n'
#       ' - rmse: ',  rmse, '\n'
#       ' - mape: ',  mape, '\n'
#       )

Pretty low learning rate, i tried different number a estimators and 200 was generally the best, alpha at 10 : L1 regularization term on weights,
Gamma at 5 : specifies the minimum loss reduction required to make a split.

In [170]:
# from sklearn.model_selection import learning_curve, StratifiedKFold, cross_val_score
# import numpy as np

# skf = StratifiedKFold(n_splits=5)
# print(f'Cross val score: ', np.mean(cross_val_score(xgb, X_train, y_train, cv=skf)))

In [171]:
# import matplotlib.pyplot as plt


# N, train_score, val_score = learning_curve(xgb, X_train, y_train, train_sizes = np.linspace(0.1,1,50), cv=skf, scoring='r2')
# plt.plot(N, val_score.mean(axis=1), label='validation')
# plt.plot(N, train_score.mean(axis=1), label='train')

# plt.xlabel('train_sizes')
# plt.title('Xgb learning curve')
# plt.legend()
# plt.show() 