## Train a model with bike rental data using XGBoost algorithm
### Training log1p(count) dataset
###  Model is trained with XGBoost installed in notebook instance
###  In the later examples, we will train using SageMaker's XGBoost algorithm

In [None]:
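# Install xgboost in the notebook instance (uncomment the command below to run it).
# NOTE: the command pins 0.90, but the recorded output of this notebook reports
# xgboost 1.3.3 - adjust the pinned version to match your environment.
#### Command to install xgboost
# !pip install xgboost==0.90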

In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# XGBoost 
import xgboost as xgb

import matplotlib.pyplot as plt

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [2]:
print(xgb.__version__)

1.3.3


<h2>Kaggle Bike Sharing Demand Dataset</h2>

Modified 'count' to log1p(count) for training

A log transform can be used when the target represents a count (that is, it takes only non-negative values).

The model now predicts log1p(count), so predictions must be converted back to an actual count using expm1(predicted_target).

Reference:
https://www.kaggle.com/apapiu/predicting-bike-sharing-with-xgboost by Alexandru Papiu

To download the dataset, sign in to Kaggle and download it from this link:
https://www.kaggle.com/c/bike-sharing-demand/data <br>


Input Features: ['season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']<br>
Target Feature: [<b>log1p('count')</b>]<br>
Objective: <q>You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period (Ref: Kaggle.com)</q>

In [3]:
# Locations of the input files, relative to this notebook.
# column_list_file holds the comma-separated column names; the train and
# validation CSVs are later read with those names because they have no header row.
column_list_file = '../Data/bike_train_column_list.txt'
train_file = '../Data/bike_train.csv'
validation_file = '../Data/bike_validation.csv'
# The test file is later read with a 'datetime' column that is parsed as dates.
test_file = '../Data/bike_test.csv'

In [4]:
# Read the comma-separated column names used for the header-less CSV files.
# Each name is stripped so that a trailing newline (or stray spaces around the
# commas) in the file cannot end up embedded in a column name.
with open(column_list_file, 'r') as f:
    columns = [name.strip() for name in f.read().split(',')]

In [5]:
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour']

In [6]:
# The CSV files carry no header row, so the column names are supplied explicitly.
df_train, df_validation = (
    pd.read_csv(csv_path, names=columns)
    for csv_path in (train_file, validation_file)
)

In [7]:
df_train.head()

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,4.477337,3,0,0,2,26.24,30.305,73,7.0015,2011,9,3,5,0
1,5.517453,3,0,1,1,32.8,34.85,33,7.0015,2012,8,13,0,14
2,5.814131,4,0,0,1,15.58,19.695,40,11.0014,2011,11,5,5,17
3,6.43615,3,0,1,1,32.8,37.88,55,12.998,2012,8,9,3,19
4,4.26268,2,0,1,1,13.94,17.425,76,7.0015,2011,4,14,3,6


In [8]:
df_validation.head()

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,6.095825,3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3,8
1,5.961005,2,0,0,1,32.8,37.88,55,12.998,2011,6,11,5,13
2,1.098612,1,0,1,1,14.76,16.665,40,19.9995,2011,2,14,0,2
3,3.89182,1,0,1,1,9.02,9.09,47,36.9974,2011,2,8,1,10
4,4.025352,4,0,0,1,10.66,15.15,87,0.0,2011,12,4,6,8


In [9]:
# Column 0 is the target (log1p of count); every remaining column is a feature.
# Series.to_numpy() replaces Series.ravel(), which is deprecated in recent
# pandas releases; both yield a 1-D NumPy array of the target values.
X_train = df_train.iloc[:, 1:]  # Features: 1st column onwards
y_train = df_train.iloc[:, 0].to_numpy()  # Target: 0th column

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

In [10]:
# XGBoost Training Parameter Reference: 
#   https://xgboost.readthedocs.io/en/latest/parameter.html
# Earlier configuration, kept for reference:
#regressor = xgb.XGBRegressor(max_depth=5,eta=0.1,subsample=0.7,num_round=150)
# Only max_depth and n_estimators are set explicitly here; every other
# hyperparameter is left unset so the library default applies.
regressor = xgb.XGBRegressor(max_depth=5,n_estimators=150)

In [11]:
regressor

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=5,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=150, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)

In [12]:
# Train the model and evaluate RMSE on both datasets at each boosting round.
# The order of eval_set matters: the first tuple (training data) is reported as
# validation_0 and the second (validation data) as validation_1 in the log.
regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])

[0]	validation_0-rmse:3.06460	validation_1-rmse:3.07128
[1]	validation_0-rmse:2.18561	validation_1-rmse:2.18977
[2]	validation_0-rmse:1.57973	validation_1-rmse:1.58729
[3]	validation_0-rmse:1.15916	validation_1-rmse:1.16584
[4]	validation_0-rmse:0.86291	validation_1-rmse:0.87348
[5]	validation_0-rmse:0.67593	validation_1-rmse:0.68940
[6]	validation_0-rmse:0.55710	validation_1-rmse:0.57295
[7]	validation_0-rmse:0.47210	validation_1-rmse:0.49133
[8]	validation_0-rmse:0.41153	validation_1-rmse:0.43610
[9]	validation_0-rmse:0.37590	validation_1-rmse:0.40111
[10]	validation_0-rmse:0.34717	validation_1-rmse:0.37443
[11]	validation_0-rmse:0.33554	validation_1-rmse:0.36382
[12]	validation_0-rmse:0.32311	validation_1-rmse:0.35253
[13]	validation_0-rmse:0.31561	validation_1-rmse:0.34656
[14]	validation_0-rmse:0.31065	validation_1-rmse:0.34307
[15]	validation_0-rmse:0.30379	validation_1-rmse:0.33729
[16]	validation_0-rmse:0.30120	validation_1-rmse:0.33574
[17]	validation_0-rmse:0.29738	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=150, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
df_train['count'].describe()

In [None]:
eval_result = regressor.evals_result()

In [None]:
training_rounds = range(len(eval_result['validation_0']['rmse']))

In [None]:
# Plot the RMSE recorded at every boosting round for both evaluation sets.
for eval_name, series_label in (('validation_0', 'Training Error'),
                                ('validation_1', 'Validation Error')):
    plt.scatter(x=training_rounds, y=eval_result[eval_name]['rmse'], label=series_label)

plt.title('Training Vs Validation Error')
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
xgb.plot_importance(regressor)
plt.show()

In [None]:
# Updated - Changed to validation dataset
# Compare actual vs predicted performance with dataset not seen by the model before
df = pd.read_csv(validation_file,names=columns)

In [None]:
df.head()

In [None]:
X_test = df.iloc[:,1:]
print(X_test[:5])

In [None]:
result = regressor.predict(X_test)

In [None]:
result[:5]

In [None]:
df.head()

In [None]:
df['count_predicted'] = result

In [None]:
df.head()

In [None]:
# Negative values can be predicted. The predictions are still in log1p(count)
# space at this point, so check their range before converting back to counts.
df['count_predicted'].describe()

In [None]:
df[df['count_predicted'] < 0]

In [None]:
def adjust_count(x):
    """Clamp a value at zero.

    Returns 0 when ``x`` is negative; otherwise returns ``x`` unchanged.
    """
    return 0 if x < 0 else x

In [None]:
df['count_predicted'] = df['count_predicted'].map(adjust_count)

In [None]:
df[df['count_predicted'] < 0]

In [None]:
# Both columns hold log1p(count) values; apply expm1 (the inverse of log1p)
# to convert the actual and the predicted values back to rental counts.
df['count'] = df['count'].map(np.expm1)
df['count_predicted'] = df['count_predicted'].map(np.expm1)

In [None]:
# Actual Vs Predicted
# Both series are plotted against the row index of the validation data frame.
plt.plot(df['count'], label='Actual')
plt.plot(df['count_predicted'],label='Predicted')
plt.xlabel('Sample')
plt.ylabel('Count')
# Show only samples 100 to 150 so the individual points remain readable
plt.xlim([100,150])
plt.title('Validation Dataset - Predicted Vs. Actual')
plt.legend()
plt.show()

In [None]:
# Over prediction and Under Prediction needs to be balanced
# Validation data residuals (df was loaded from validation_file).
# Positive residual: the model under-estimated; negative: it over-estimated.
residuals = (df['count'] - df['count_predicted'])

plt.hist(residuals)
plt.grid(True)
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.title('Residuals Distribution')
# Red vertical reference line (axvline is called without x, so it uses its default position)
plt.axvline(color='r')
plt.show()

In [None]:
# Share of samples the model under-estimated (actual > predicted) versus the rest.
# The mean of a boolean mask is the fraction of True values. Unlike indexing
# value_counts() with True/False, this does not raise KeyError when one of the
# two outcomes never occurs. Exact matches (residual == 0) are reported under
# "Over Estimation", exactly as before.
under_estimated = residuals > 0
print(' Under Estimation: {0:.2f}'.format(under_estimated.mean()))
print(' Over  Estimation: {0:.2f}'.format((~under_estimated).mean()))

In [None]:
# Root Mean Squared Error between actual and predicted values:
# mean_squared_error gives the MSE, and **.5 takes its square root.
import sklearn.metrics as metrics
print("RMSE: {0:.2f}".format(metrics.mean_squared_error(df['count'],
                                                    df['count_predicted'])**.5))

In [None]:
# Metric used by Kaggle
def compute_rmsle(y_true, y_pred):
    """Return the Root Mean Squared Logarithmic Error (RMSLE).

    Computed as sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2)).

    Args:
        y_true: Actual values; any array-like (list, ndarray, pandas Series).
        y_pred: Predicted values; array-like compatible in shape with y_true.

    Returns:
        The RMSLE as a NumPy floating-point scalar.
    """
    # np.asarray accepts any array-like and leaves an existing ndarray as-is
    # (no copy), replacing the exact-type comparison used previously.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    squared_log_error = (np.log1p(y_pred) - np.log1p(y_true)) ** 2
    return np.average(squared_log_error) ** .5

In [None]:
print("RMSLE: {0:.2f}".format(compute_rmsle(df['count'],df['count_predicted'])))

In [None]:
# Prepare Data for Submission to Kaggle
df_test = pd.read_csv(test_file,parse_dates=['datetime'])

In [None]:
df_test.head()

In [None]:
X_test =  df_test.iloc[:,1:] # Exclude datetime for prediction

In [None]:
X_test.head()

In [None]:
result = regressor.predict(X_test)

In [None]:
result[:5]

In [None]:
np.expm1(result)

In [None]:
# Convert result to actual count.
# Predictions are in log1p(count) space. Clamp negative predictions at 0 before
# applying expm1, matching the validation workflow, so that no negative rental
# counts are produced.
df_test["count"] = np.expm1(np.maximum(result, 0))

In [None]:
df_test.head()

In [None]:
df_test[df_test["count"] < 0]

In [None]:
df_test[['datetime','count']].to_csv('../Data/predicted_count.csv',index=False)