In [1]:
#importing libraries 

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error 
from sklearn import linear_model 

In [2]:
#Loading the Data Set

filePath = '/cxldata/datasets/project/bikes.csv'
bikesData = pd.read_csv(filePath)

In [3]:
bikesData.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [4]:
#Cleaning the data

columnsToDrop=['instant','casual','registered','atemp','dteday']

bikesData = bikesData.drop(columnsToDrop,axis=1)

In [5]:
# Feature Scaling the Data

columnsToScale=['temp', 'hum', 'windspeed']
# Feature Scaling the Data

columnsToScale=['temp', 'hum', 'windspeed']

#instance of Standard Scalar
scaler = StandardScaler()

bikesData[columnsToScale] = scaler.fit_transform(bikesData[columnsToScale])
bikesData[columnsToScale].describe()

#Adding a new fetaure'DayCount' to DataSet 
bikesData['dayCount']=pd.Series(range(bikesData.shape[0]))/24

In [6]:
#Scaled Data
bikesData.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt,dayCount
0,1,0,1,0,0,6,0,1,-1.334648,0.947372,-1.553889,16,0.0
1,1,0,1,1,0,6,0,1,-1.438516,0.895539,-1.553889,40,0.041667
2,1,0,1,2,0,6,0,1,-1.438516,0.895539,-1.553889,32,0.083333
3,1,0,1,3,0,6,0,1,-1.334648,0.63637,-1.553889,13,0.125
4,1,0,1,4,0,6,0,1,-1.334648,0.63637,-1.553889,1,0.166667


In [7]:
#Splitting the data into training and tese set by stratified sampling
from sklearn.model_selection import train_test_split
np.random.seed(42)

train_set,test_set = train_test_split(bikesData,test_size = 0.30,random_state=42)


In [8]:
#Sorting the values based on DayCount
train_set.sort_values("dayCount",axis=0,inplace=True)
test_set.sort_values("dayCount",axis=0,inplace=True)

#printing the number of instances in train_set and test_set
len(train_set)
len(test_set)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


5214

In [9]:
#Defining Utility Functions
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [10]:
trainingCols = train_set.drop(['cnt'],axis=1)
trainingLabels=train_set['cnt']

In [11]:
#importing models from sklearn
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

#importing XGBoost
from xgboost import XGBRegressor

In [12]:
#Training Decision Tree Model
dec_reg=DecisionTreeRegressor(random_state=42)
dt_mae_scores= -cross_val_score(dec_reg,trainingCols,trainingLabels,cv=10,scoring="neg_mean_absolute_error")
display_scores(dt_mae_scores)



Scores: [42.79622021 50.31963846 37.03615448 44.15365653 47.07723911 72.03782895
 58.36019737 48.22615132 50.94407895 96.88898026]
Mean: 54.78401456342171
Standard deviation: 16.70953413912157


In [13]:
dt_mse_scores=np.sqrt(-cross_val_score(dec_reg,trainingCols,trainingLabels,cv=10,scoring="neg_mean_squared_error"))
display_scores(dt_mse_scores)

Scores: [ 65.10279464  77.670527    60.59989749  73.66268699  75.76702279
 113.26870416  96.665763    81.28550338  86.87354694 149.4085296 ]
Mean: 88.03049759858746
Standard deviation: 25.035783082114


In [14]:
#Training Random Forest Model
forest_reg = RandomForestRegressor(n_estimators=150,random_state=42)
rf_mae_scores= -cross_val_score(forest_reg,trainingCols,trainingLabels,cv=10,scoring="neg_mean_absolute_error")
display_scores(rf_mae_scores)

Scores: [33.33921665 33.5634292  28.51544782 31.74855656 36.55779239 57.78830592
 40.91410636 40.72524123 37.58651864 84.75860197]
Mean: 42.54972167412677
Standard deviation: 15.998616999257116


In [15]:
#Training and Checking error using MSE 
rf_mse_error=np.sqrt(-cross_val_score(forest_reg,trainingCols,trainingLabels,cv=10,scoring="neg_mean_squared_error"))
display_scores(rf_mse_error)

Scores: [ 45.56103408  50.97710002  43.43059587  52.22561489  60.45598311
  94.23444802  66.14917355  65.25550321  61.67555223 132.00386791]
Mean: 67.19688728966834
Standard deviation: 25.55764715717203


In [17]:
#Training Linear Regression Model
lin_reg=LinearRegression()
lr_mae_scores= -cross_val_score(lin_reg,trainingCols,trainingLabels,cv=10,scoring="neg_mean_absolute_error")
display_scores(lr_mae_scores)


Scores: [ 66.96340699  80.48809095 113.84704981  93.17230086  76.11197672
  96.5220689  133.13798218 158.02254734 158.90195479 127.15674717]
Mean: 110.43241256942201
Standard deviation: 31.426965705295725


In [18]:
lr_mse_scores = np.sqrt(-cross_val_score(lin_reg,trainingCols,trainingLabels,cv=10,scoring="neg_mean_squared_error"))
display_scores(lr_mse_scores)

Scores: [ 84.63836676 111.12038541 131.88324414 119.16350622 105.17621319
 127.72562924 174.97188817 187.31691741 205.60028279 164.30585678]
Mean: 141.1902290118175
Standard deviation: 37.55565075919573


LinearRegresion()
Mean: 141.1902290118175
Standard deviation: 37.55565075919573
    
RandomForestRegressor()
Mean: 67.19688728966834
Standard deviation: 25.557647157172
    
DecisionTreeRegressor()
Mean: 88.03049759858746
Standard deviation: 25.035783082114 


In [21]:
#Fine Tuning The Model using Best HyperParameters using GridSearchCV 
from sklearn.model_selection import GridSearchCV

In [22]:
param_grid = [ 
    {
        'n_estimators': [120,150],
        'max_features' : [10,12],
        'max_depth' : [15,28]
    }
]

In [23]:
grid_search = GridSearchCV(forest_reg,param_grid,cv=5,scoring = "neg_mean_squared_error")

In [24]:
grid_search.fit(trainingCols,trainingLabels)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=150, n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs

In [25]:
#Printing  the 'best estimator' and 'best hyperparameters'
print(grid_search.best_estimator_)
print(grid_search.best_params_)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=28, max_features=10, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=150, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
{'max_depth': 28, 'max_features': 10, 'n_estimators': 150}


In [28]:
#Knowing Feature Importances
feature_importances=grid_search.best_estimator_.feature_importances_

In [27]:
print(feature_importances)

[0.00424888 0.00145493 0.00570279 0.58348648 0.00215107 0.01790669
 0.06993018 0.01688336 0.09373438 0.03176755 0.00907719 0.16365649]


In [30]:
#Preparing to test the final model on Test dataset

