In [1]:
from data_preprocessing import decompress_pickle, compressed_pickle
import pandas as pd
import os

# load data
df = decompress_pickle("../data/preprocessed/BikeRental_complete.pbz2")

from pandas_profiling import ProfileReport
profile = ProfileReport(df, title='Bike Share Rental Pandas Profiling Report', explorative = True, dark_mode = True)
profile.to_file(output_file='Bike Share Rental Pandas Profiling Report.html')

In [2]:
df.head()

Unnamed: 0,datetime,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,2011-01-01 00:00:00,2011-01-01,1.0,0.0,1,0,0.0,5,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0
1,2011-01-01 01:00:00,2011-01-01,1.0,0.0,1,1,0.0,5,0.0,1.0,0.22,0.2727,0.8,0.0,8.0,32.0,40.0
2,2011-01-01 02:00:00,2011-01-01,1.0,0.0,1,2,0.0,5,0.0,1.0,0.22,0.2727,0.8,0.0,5.0,27.0,32.0
3,2011-01-01 03:00:00,2011-01-01,1.0,0.0,1,3,0.0,5,0.0,1.0,0.24,0.2879,0.75,0.0,3.0,10.0,13.0
4,2011-01-01 04:00:00,2011-01-01,1.0,0.0,1,4,0.0,5,0.0,1.0,0.24,0.2879,0.75,0.0,0.0,1.0,1.0


In [4]:
X = df.drop('cnt', axis = 1)
X = X.drop('datetime', axis = 1)
X = X.drop('dteday', axis = 1)
X = X.drop('registered', axis = 1)
X = X.drop('casual', axis=1)
#X = X.drop('yr', axis=1)
Y = df['cnt']


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=0)
print(Y_train.shape)
print(Y_test.shape)

Y_train_mean = Y_train.mean()
print("Y_train_mean =", Y_train_mean)
Y_train_meandev = sum((Y_train-Y_train_mean)**2)
print("Y_train_meandev =", Y_train_meandev)
Y_test_meandev = sum((Y_test-Y_train_mean)**2)
print("Y_test_meandev =", Y_test_meandev)

(14035,)
(3509,)
Y_train_mean = 187.66539347058855
Y_train_meandev = 459541132.40605336
Y_test_meandev = 116954039.11552408


In [8]:
from sklearn.model_selection import TimeSeriesSplit
tss = TimeSeriesSplit()

for train_index, test_index in tss.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index,:]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

Y_train_mean = Y_train.mean()
print("Y_train_mean =", Y_train_mean)
Y_train_meandev = sum((Y_train-Y_train_mean)**2)
print("Y_train_meandev =", Y_train_meandev)
Y_test_meandev = sum((Y_test-Y_train_mean)**2)
print("Y_test_meandev =", Y_test_meandev)


Y_train_mean = 177.86125170998633
Y_train_meandev = 429659156.4202601
Y_test_meandev = 148598422.69268876


In [11]:
# create report dataframe
report = pd.DataFrame(columns=['Model','R2.Train','R2.Test'])

In [12]:
################
#     OLS      #
################

from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, Y_train)
Y_train_pred = lm.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = lm.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)

R2 = 0.4014026119945089
Pseudo-R2 = 0.3564842588430778


In [13]:
# OLS with Cross Validation and Grid Search
from sklearn.linear_model import LinearRegression
lmCV = LinearRegression()
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'fit_intercept':[True,False]
}
CV_olsmodel = GridSearchCV(estimator=lmCV, param_grid=param_grid, cv=10)
CV_olsmodel.fit(X_train, Y_train)
print(CV_olsmodel.best_params_)
lmCV = lmCV.set_params(**CV_olsmodel.best_params_)
lmCV.fit(X_train, Y_train)
Y_train_pred = lmCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = lmCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['OLS RegressionCV', r2, pseudor2]

{'fit_intercept': False}
R2 = 0.40138392312863835
Pseudo-R2 = 0.35610452597060305


In [14]:
####################
# Ridge Regression #
####################

from sklearn.linear_model import Ridge
ridgereg = Ridge(alpha=2)
ridgereg.fit(X_train, Y_train)
Y_train_pred = ridgereg.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = ridgereg.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)


R2 = 0.4014007828305891
Pseudo-R2 = 0.35644490880869306


In [15]:
# find best lambda (alphas)
from sklearn.linear_model import Ridge
ridgeregCV = Ridge()
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'alpha': [25,10,4,2,1.0,0.8,0.5,0.3,0.2,0.1,0.05,0.02,0.01]
}
CV_rrmodel = GridSearchCV(estimator=ridgeregCV, param_grid=param_grid, cv=10)
CV_rrmodel.fit(X_train, Y_train)
print(CV_rrmodel.best_params_)
ridgeregCV = ridgeregCV.set_params(**CV_rrmodel.best_params_)
ridgeregCV.fit(X_train, Y_train)
Y_train_pred = ridgeregCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = ridgeregCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Ridge RegressionCV', r2, pseudor2]

{'alpha': 25}
R2 = 0.401232188280927
Pseudo-R2 = 0.35494181456372287


In [16]:
#############################
# Support Vector Regression #
#############################

# linear kernel
from sklearn.svm import SVR
LinSVRreg = SVR(kernel='linear', C=1.0, epsilon=0.1)
LinSVRreg.fit(X_train, Y_train)
Y_train_pred = LinSVRreg.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = LinSVRreg.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Support Vector RegressionCV', r2, pseudor2]

R2 = 0.34406280877315265
Pseudo-R2 = 0.25487956438543224


In [17]:
# radial kernel
RbfSVRreg = SVR(kernel='rbf', C=1.0, epsilon=0.1)
RbfSVRreg.fit(X_train, Y_train)
Y_train_pred = RbfSVRreg.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = RbfSVRreg.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Support Vector RegressionCV', r2, pseudor2]

R2 = 0.38574885793733726
Pseudo-R2 = 0.28329777895635433


In [None]:
from sklearn.svm import SVR
RbfSVRregCV = SVR()
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'kernel': ["linear", "rbf"], 
    'C': [1, 3, 5, 8, 10],
    'epsilon': [0.0, 0.025, 0.05, 0.075, 0.1],
    'gamma' : [0., 1., 2., 3., 4.]
}
CV_svrmodel = GridSearchCV(estimator=RbfSVRregCV, param_grid=param_grid, cv=10)
CV_svrmodel.fit(X_train, Y_train)
print(CV_svrmodel.best_params_)
RbfSVRregCV = RbfSVRregCV.set_params(**CV_svrmodel.best_params_)
RbfSVRregCV.fit(X_train, Y_train)
Y_train_pred = RbfSVRregCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = RbfSVRregCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Support Vector RegressionCV', r2, pseudor2]

In [18]:
##################
# Neural Network #
##################

from sklearn.neural_network import MLPRegressor
NNetRreg = MLPRegressor(solver='lbfgs', max_iter=10000, hidden_layer_sizes=(10,), random_state=0)
NNetRreg.fit(X_train, Y_train)
Y_train_pred = NNetRreg.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = NNetRreg.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Neural NetworkCV', r2, pseudor2]

R2 = 0.5882970964463183
Pseudo-R2 = 0.5589824018612105


In [None]:
from sklearn.neural_network import MLPRegressor
NNetRregCV = MLPRegressor(solver='lbfgs', max_iter=10000, random_state=0)
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'learning_rate': ["constant", "invscaling", "adaptive"],
    'hidden_layer_sizes': [(5,), (8,), (10,), (13,)],
    'alpha': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.1],
    'activation': ["logistic", "relu", "tanh"]
}
CV_nnmodel = GridSearchCV(estimator=NNetRregCV, param_grid=param_grid, cv=10)
CV_nnmodel.fit(X_train, Y_train)
print(CV_nnmodel.best_params_)
NNetRregCV = NNetRregCV.set_params(**CV_nnmodel.best_params_)
NNetRregCV.fit(X_train, Y_train)
Y_train_pred = NNetRregCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = NNetRregCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Neural NetworkCV', r2, pseudor2]

In [19]:
#################
# Random Forest #
#################

from sklearn.ensemble import RandomForestRegressor
RForreg = RandomForestRegressor(n_estimators=500, random_state=0)
RForreg.fit(X_train, Y_train)
Y_train_pred = RForreg.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = RForreg.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Random ForestCV', r2, pseudor2]

R2 = 0.9931131619571687
Pseudo-R2 = 0.8898634544501338


In [None]:
from sklearn.ensemble import RandomForestRegressor
RForregCV = RandomForestRegressor(random_state=0)
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'max_depth': [ 4.,  5.,  6.,  7.,  8.],
    'n_estimators': [ 10,  50,  100, 150, 200]
}
CV_rfmodel = GridSearchCV(estimator=RForregCV, param_grid=param_grid, cv=10)
CV_rfmodel.fit(X_train, Y_train)
print(CV_rfmodel.best_params_)
RForregCV = RForregCV.set_params(**CV_rfmodel.best_params_)
RForregCV.fit(X_train, Y_train)
Y_train_pred = RForregCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = RForregCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Random ForestCV', r2, pseudor2]


In [20]:
#####################
# Gradient Boosting #
#####################

from sklearn.ensemble import GradientBoostingRegressor
GBoostreg = GradientBoostingRegressor(random_state=0)
GBoostreg.fit(X_train, Y_train)
Y_train_pred = GBoostreg.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = GBoostreg.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Gradient BoostingCV', r2, pseudor2]

R2 = 0.8821010671844156
Pseudo-R2 = 0.7904120645466031


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
GBoostregCV = GradientBoostingRegressor(random_state=0)
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'max_depth': [ 3., 4., 5.],
    'subsample': [0.7, 0.8, 0.9],
    'n_estimators': [50, 100,150],
    'learning_rate': [0.1, 0.2, 0.3]
}
CV_gbmodel = GridSearchCV(estimator=GBoostregCV, param_grid=param_grid, cv=10)
CV_gbmodel.fit(X_train, Y_train)
print(CV_gbmodel.best_params_)
GBoostregCV = GBoostregCV.set_params(**CV_gbmodel.best_params_)
GBoostregCV.fit(X_train, Y_train)
Y_train_pred = GBoostregCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = GBoostregCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Gradient BoostingCV', r2, pseudor2]

In [21]:
################
# Final Report #
################

print(report)

                         Model  R2.Train   R2.Test
0             OLS RegressionCV  0.401384  0.356105
1           Ridge RegressionCV  0.401232  0.354942
2  Support Vector RegressionCV  0.344063  0.254880
3  Support Vector RegressionCV  0.385749  0.283298
4             Neural NetworkCV  0.588297  0.558982
5              Random ForestCV  0.993113  0.889863
6          Gradient BoostingCV  0.882101  0.790412


In [6]:
from data_preprocessing import decompress_pickle, compressed_pickle
import pandas as pd
df1 = decompress_pickle("../data/interim/ArtificalRentals13.pbz2")

In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2555541 entries, 0 to 583665
Data columns (total 9 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Duration              int64         
 1   Start date            datetime64[ns]
 2   End date              datetime64[ns]
 3   Start station number  int64         
 4   Start station         object        
 5   End station number    int64         
 6   End station           object        
 7   Bike number           object        
 8   Member type           object        
dtypes: datetime64[ns](2), int64(3), object(4)
memory usage: 195.0+ MB


In [11]:
df1[13000:13050]

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
13000,1171,2013-01-04 19:27:18,2013-01-04 19:46:50,31220,US Dept of State / Virginia Ave & 21st St NW,31109,7th & T St NW,W00253,Member
13001,2524,2013-01-04 19:27:20,2013-01-04 20:09:24,31400,Georgia & New Hampshire Ave NW,31610,Eastern Market / 7th & North Carolina Ave SE,W00406,Casual
13002,321,2013-01-04 19:27:32,2013-01-04 19:32:54,31615,6th & H St NE,31603,1st & M St NE,W01231,Member
13003,830,2013-01-04 19:27:33,2013-01-04 19:41:23,31116,California St & Florida Ave NW,31113,Columbia Rd & Belmont St NW,W01321,Member
13004,471,2013-01-04 19:27:43,2013-01-04 19:35:34,31037,Ballston Metro / N Stuart & 9th St N,31026,Washington Blvd & 10th St N,W20959,Member
13005,404,2013-01-04 19:27:48,2013-01-04 19:34:33,31214,17th & Corcoran St NW,31201,15th & P St NW,W00903,Member
13006,373,2013-01-04 19:28:01,2013-01-04 19:34:15,31214,17th & Corcoran St NW,31201,15th & P St NW,W01032,Member
13007,384,2013-01-04 19:28:08,2013-01-04 19:34:32,31619,Lincoln Park / 13th & East Capitol St NE,31613,Eastern Market Metro / Pennsylvania Ave & 7th ...,W01453,Member
13008,566,2013-01-04 19:28:11,2013-01-04 19:37:38,31107,Lamont & Mt Pleasant NW,31214,17th & Corcoran St NW,W00438,Member
13009,365,2013-01-04 19:28:16,2013-01-04 19:34:22,31619,Lincoln Park / 13th & East Capitol St NE,31613,Eastern Market Metro / Pennsylvania Ave & 7th ...,W00186,Member


In [2]:
from data_preprocessing import decompress_pickle, compressed_pickle
df5 = decompress_pickle("../data/preprocessed/BikeRental_complete.pbz2")

In [3]:
df5.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17544 entries, 0 to 17543
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    17544 non-null  datetime64[ns]
 1   dteday      17544 non-null  object        
 2   season      17544 non-null  float64       
 3   yr          17544 non-null  float64       
 4   mnth        17544 non-null  int64         
 5   hr          17544 non-null  int64         
 6   holiday     17544 non-null  float64       
 7   weekday     17544 non-null  int64         
 8   workingday  17544 non-null  float64       
 9   weathersit  17544 non-null  float64       
 10  temp        17544 non-null  float64       
 11  atemp       17544 non-null  float64       
 12  hum         17544 non-null  float64       
 13  windspeed   17544 non-null  float64       
 14  casual      17544 non-null  float64       
 15  registered  17544 non-null  float64       
 16  cnt         17544 non-

In [4]:
df5['cnt'].sort_values()

389        1.0
9244       1.0
1924       1.0
1922       1.0
8187       1.0
         ...  
15209    967.0
14850    968.0
14873    970.0
15089    976.0
14898    977.0
Name: cnt, Length: 17544, dtype: float64