# UCI BikeDataset

Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. 

Source:

Hadi Fanaee-T

Laboratory of Artificial Intelligence and Decision Support (LIAAD), University of Porto
INESC Porto, Campus da FEUP
Rua Dr. Roberto Frias, 378
4200 - 465 Porto, Portugal

Original Source: http://capitalbikeshare.com/system-data
Weather Information: http://www.freemeteo.com
Holiday Schedule: http://dchr.dc.gov/page/holiday-schedule

In [65]:
#Load the dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import resample
bike_df = pd.read_csv('day.csv')
#print(bike_df.head())
#bike_X = bike_df[['temp','atemp','hum','windspeed']]
bike_X = bike_df[['weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
X_train,X_test,y_train,y_test = train_test_split(bike_X,bike_y,test_size = 0.2,random_state=123)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

lmodel = linear_model.LinearRegression()
lmodel.fit(X_train_scaled,y_train)
print("Linear Model score:",lmodel.score(X_test_scaled,y_test))

#BootStrap
from sklearn.utils import resample
ols = linear_model.LinearRegression()
coefs = []
for i in range(1000):
    x_boot, y_boot = resample(X_train_scaled, y_train)
    ols.fit(x_boot, y_boot)
    coefs.append(ols.coef_[0])
    

coefs = np.array(coefs)
print("The mean of the coefficients are: " + str(np.mean(coefs, 0)))
print("The standard error of the coefficients are: " + str(np.std(coefs, 0)))

#Variance Check
import pprint
print('Shape: (%d, %d)' %bike_X.shape)
print('Variation:')
pprint.pprint(dict(zip(['weekday','temp','hum','windspeed'], np.var(X_train_scaled, 0))))

#selectKBest
import sklearn.feature_selection as fs
best1 = fs.SelectKBest(fs.f_regression, k=3).fit_transform(bike_X, bike_y)
print(best1)
print(bike_X.head(2))

pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(X_train, y_train.values.ravel())

pred = clf.predict(X_test)
print(r2_score(y_test, pred))
print(mean_squared_error(y_test, pred))

Linear Model score: 0.25346763326
The mean of the coefficients are: [  142.31441692  1270.58329498  -432.94890326  -428.32754376]
The standard error of the coefficients are: [ 60.00880358  56.73398608  66.27205789  58.24249268]
Shape: (731, 4)
Variation:
{'hum': 1.0, 'temp': 0.99999999999999978, 'weekday': 1.0, 'windspeed': 1.0}
[[ 0.344167  0.805833  0.160446]
 [ 0.363478  0.696087  0.248539]
 [ 0.196364  0.437273  0.248309]
 ..., 
 [ 0.253333  0.752917  0.124383]
 [ 0.255833  0.483333  0.350754]
 [ 0.215833  0.5775    0.154846]]
   weekday      temp       hum  windspeed
0        6  0.344167  0.805833   0.160446
1        0  0.363478  0.696087   0.248539


  y = column_or_1d(y, warn=True)


0.437314143493
1920112.1028


In [7]:
#Just use Linear Regression on the entire dataset
# Features Used'season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed' 
import pandas as pd
from sklearn import linear_model
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
lmodel = linear_model.LinearRegression()
#fitting the entire dataset
lmodel.fit(bike_X,bike_y)
print("Coefficients",lmodel.coef_)
print("Score",lmodel.score(bike_X,bike_y))
print("Intercept",lmodel.intercept_)

Coefficients [[  409.07908645  -514.63202997    29.22325226   112.79888456  -501.7800154
     29.22325226  5600.92745819 -2226.9877399  -3309.9013916 ]]
Score 0.525637463941
Intercept [ 3199.84781977]


In [3]:
#Linear Regression with a normalize set to True
import pandas as pd
from sklearn import linear_model
from sklearn import preprocessing
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
lmodel = linear_model.LinearRegression(normalize=True)
lmodel.fit(bike_X,bike_y)
print("Coefficients",lmodel.coef_)
print("Score",lmodel.score(bike_X,bike_y))
print("Intercept",lmodel.intercept_)

Coefficients [[  4.08499267e+02  -5.15471604e+02  -2.17631740e+16   1.10777665e+02
   -5.20597152e+02   2.17631740e+16   5.58875891e+03  -2.15651630e+03
   -3.28063329e+03]]
Score 0.525108312179
Intercept [ 3234.74187119]


In [12]:
#Linear Regression with train test split
import pandas as pd
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
lmodel = linear_model.LinearRegression()
X_train,X_test,y_train,y_test = train_test_split(bike_X,bike_y,test_size = 0.33,random_state=0)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

lmodel = linear_model.LinearRegression()
lmodel.fit(X_train_scaled,y_train)
print("Coefficients",lmodel.coef_)
print("Score",lmodel.score(X_test_scaled,y_test))
print("Intercept",lmodel.intercept_)

Coefficients [[  484.6497664   -104.21366915    40.54261733    18.88793038
   -133.43921633    40.54261733  1016.09543448  -385.71696106
   -339.15851857]]
Score 0.490797346444
Intercept [ 4508.80368098]


In [19]:
#Cross Validation Score
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection as ms
lmodel = linear_model.LinearRegression()
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
scores = ms.cross_val_score(estimator=lmodel, X=bike_X, y=bike_y,cv=100)

print('Scores: {}'.format(scores))
print('Score mean: {}'.format(scores.mean()))
print('Score std deviation: {}'.format(scores.std()))


Scores: [-18.02130518 -42.56843178  -3.8840055   -5.36171028 -18.26445499
 -20.0016797   -5.2278593  -12.53661785  -4.13083161  -3.88112967
 -15.93690418  -2.00233862  -1.89383714   0.1966595   -0.34919928
   0.7882722   -0.78118193   0.67995471  -8.88690346  -7.56028039
  -3.74086913  -7.08084566 -17.42488714  -5.52936384  -6.853751
 -37.62558403 -43.1653186  -28.43929519  -9.55721581  -0.69413986
  -2.45493091  -0.17935727  -2.49579596  -0.57098519  -0.35786104
  -2.77182844  -0.45803794  -0.48656548   0.23654649 -23.94441299
  -2.02510945  -1.6862533   -5.694404    -6.3403372   -1.05245032
  -2.44035334  -0.69715595  -1.48496996   0.75447907   0.09460869
   0.07576926  -5.06694128  -0.3042306    0.40507712  -0.35765624
  -0.11554943   0.5542764   -6.47675308  -5.290414    -1.32294941
 -27.40759062  -6.6334667   -3.48022359   0.47823174  -3.0325679
 -35.83558105  -0.2554945   -1.04571285  -5.55675661  -1.25095377
 -19.36310274  -1.86708057  -1.59205666  -1.42245295  -1.95761405
  -1.

In [24]:
#BootStrap
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection as ms
lmodel = linear_model.LinearRegression()
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
from sklearn import linear_model
ols = linear_model.LinearRegression()

from sklearn.utils import resample
coefs = []
for i in range(1000):
    x_boot, y_boot = resample(bike_X,bike_y)
    ols.fit(x_boot, y_boot)
    coefs.append(ols.coef_[0])

coefs = np.array(coefs)
print("The mean of the coefficients are: " + str(np.mean(coefs, 0)))
print("The standard error of the coefficients are: " + str(np.std(coefs, 0)))
ols.fit(bike_X,bike_y)
print("Coefficients with Entire dataset: ", ols.coef_)
print("Bootstrap estimates: ", np.mean(coefs, 0))
print("score",ols.score(bike_X,bike_y))

The mean of the coefficients are: [  4.12800166e+02  -5.06731165e+02  -1.38199773e+14   1.13280398e+02
  -4.88509042e+02   1.38199773e+14   5.59438378e+03  -2.27831639e+03
  -3.30813659e+03]
The standard error of the coefficients are: [  4.73959858e+01   3.10650384e+02   2.36681141e+16   1.16369606e+02
   1.28965378e+02   2.36681141e+16   2.96899970e+02   5.37815113e+02
   6.90508727e+02]
Coefficients with Entire dataset:  [[  409.07908645  -514.63202997    29.22325226   112.79888456  -501.7800154
     29.22325226  5600.92745819 -2226.9877399  -3309.9013916 ]]
Bootstrap estimates:  [  4.12800166e+02  -5.06731165e+02  -1.38199773e+14   1.13280398e+02
  -4.88509042e+02   1.38199773e+14   5.59438378e+03  -2.27831639e+03
  -3.30813659e+03]
score 0.525637463941


In [25]:
#Feature Selection
#Variance Calculation
import pandas as pd
import numpy as np
import pprint
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
print('Shape: (%d, %d)' %bike_X.shape)
print('Variation:')
pprint.pprint(dict(zip(['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed'], np.var(bike_X, 0))))

Shape: (731, 9)
Variation:
{'holiday': 0.027902485398447901,
 'hum': 0.020258296051448844,
 'season': 1.2322044460580086,
 'temp': 0.033461829055414519,
 'weathersit': 0.2965036744822282,
 'weekday': 4.0136724049846553,
 'windspeed': 0.0059977039274154778,
 'workingday': 0.21614601364994582}


In [32]:
#Feature Selection
#chi2 
import pandas as pd
import numpy as np
import sklearn.feature_selection as fs
from sklearn import linear_model
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
best6 = fs.SelectKBest(fs.chi2, k=6).fit_transform(bike_X,bike_y)
print(best6.shape)
lmodel = linear_model.LinearRegression()
lmodel.fit(best6,bike_y)
print("chi2 6 model score",lmodel.score(best6,bike_y))

(731, 6)
chi2 6 model score 0.271998141307


In [45]:
#Feature Selection 
#f_regression
import pandas as pd
import numpy as np
import sklearn.feature_selection as fs
from sklearn import linear_model
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
best8 = fs.SelectKBest(fs.f_regression, k=8).fit_transform(bike_X,bike_y.values.ravel())
print(best8.shape)
lmodel = linear_model.LinearRegression()
lmodel.fit(best8,bike_y)
print("f_regression 7 model score",lmodel.score(best8,bike_y))

(731, 8)
f_regression 7 model score 0.524955563803


In [52]:
#Regularization
#Ridge
import pandas as pd
import numpy as np
from sklearn import linear_model
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
ridge = linear_model.Ridge(alpha=.001)
ridge.fit(bike_X,bike_y)
#print coefficients
print("Coefficients:",ridge.coef_) 
print("Intercept:",ridge.intercept_)
print("The determination of ridge regression is: %.4f" %ridge.score(bike_X,bike_y))

Coefficients: [[  409.09723196  -514.60958458    29.22440423   112.81135374
   -501.87625185    29.22440422  5600.63976424 -2226.46956137
  -3308.93079923]]
Intercept: [ 3199.55299584]
The determination of ridge regression is: 0.5256


In [53]:
#Regularization
#Lasso
import pandas as pd
import numpy as np
from sklearn import linear_model
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
lasso = linear_model.Lasso(alpha=.001)
lasso.fit(bike_X,bike_y)
#print coefficients
print("Coefficients:",lasso.coef_) 
print("Intercept:",lasso.intercept_)
print("The determination of Lasso regression is: %.4f" %lasso.score(bike_X,bike_y))

Coefficients: [  4.09080740e+02  -5.14597279e+02   5.51439958e+01   1.12799375e+02
  -5.01800213e+02   3.30310208e+00   5.60088946e+03  -2.22685772e+03
  -3.30967905e+03]
Intercept: [ 3199.76359248]
The determination of ridge regression is: 0.5256


In [55]:
#Regularization
#ElasticNet
import pandas as pd
import numpy as np
from sklearn import linear_model
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
elastic = linear_model.ElasticNet(alpha = 1, l1_ratio = 0.5)
elastic.fit(bike_X,bike_y)
#print coefficients
print("Coefficients:",elastic.coef_) 
print("Intercept:",elastic.intercept_)
print("The determination of ElasticNet regression is: %.4f" %elastic.score(bike_X,bike_y))

Coefficients: [ 493.03314207  -34.00502298   32.23960262   74.29812715 -396.35529532
   31.95814664  343.37653494  -49.05760396  -46.93853084]
Intercept: [ 3453.88642332]
The determination of ElasticNet regression is: 0.2460


In [59]:
#GridSearchCV
#Regularization
#ElasticNet
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
knn = neighbors.KNeighborsClassifier()
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
model = linear_model.Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(bike_X,bike_y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.score(bike_X,bike_y))

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
-0.669449305887
0.525637463941


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 5.134829142128534e-35
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 4.5257423090005406e-17
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 6.239990141881245e-36
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 1.446264090564664e-17


In [61]:
#PCA
from sklearn.decomposition import PCA
pca = PCA()
import pandas as pd
import numpy as np
from sklearn import linear_model
bike_df = pd.read_csv('day.csv')
bike_X = bike_df[['season','holiday','weekday','workingday','weathersit','weekday','temp','hum','windspeed']]
bike_y = bike_df[['cnt']]
pca.set_params(n_components=8)
pca.fit(bike_X)
pca.explained_variance_ratio_
bike_X2= pca.transform(bike_X)
lmodel = linear_model.LinearRegression()
lmodel.fit(bike_X2,bike_y)
print("PCA Score",lmodel.score(bike_X2,bike_y))


PCA Score 0.525637463941
