In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, Imputer, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
data = pd.read_csv("hour.csv")
data.head()


Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [3]:

data = data[['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt']]
data.head()
data.tail(n=10)



Unnamed: 0,season,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
17369,1,12,14,0,1,1,2,0.28,0.2727,0.45,0.2239,62,185,247
17370,1,12,15,0,1,1,2,0.28,0.2879,0.45,0.1343,69,246,315
17371,1,12,16,0,1,1,2,0.26,0.2576,0.48,0.194,30,184,214
17372,1,12,17,0,1,1,2,0.26,0.2879,0.48,0.0896,14,150,164
17373,1,12,18,0,1,1,2,0.26,0.2727,0.48,0.1343,10,112,122
17374,1,12,19,0,1,1,2,0.26,0.2576,0.6,0.1642,11,108,119
17375,1,12,20,0,1,1,2,0.26,0.2576,0.6,0.1642,8,81,89
17376,1,12,21,0,1,1,1,0.26,0.2576,0.6,0.1642,7,83,90
17377,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61
17378,1,12,23,0,1,1,1,0.26,0.2727,0.65,0.1343,12,37,49


In [4]:
hour_dummy_data = data[['season', 'mnth', 'hr', 'weekday', 'weathersit']].astype(str)
hour_data_dummies = pd.get_dummies(hour_dummy_data)
data1 = pd.concat([data[['holiday','workingday','temp','atemp','hum','windspeed']], hour_data_dummies], axis = 1)
data1.head()

Unnamed: 0,holiday,workingday,temp,atemp,hum,windspeed,season_1,season_2,season_3,season_4,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_2,weathersit_3,weathersit_4
0,0,0,0.24,0.2879,0.81,0.0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,0,0,0.22,0.2727,0.8,0.0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,0,0,0.22,0.2727,0.8,0.0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,0,0,0.24,0.2879,0.75,0.0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,0,0,0.24,0.2879,0.75,0.0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0


In [5]:
hour_target = data[['casual', 'registered', 'cnt']]
hour_target.head()

Unnamed: 0,casual,registered,cnt
0,3,13,16
1,8,32,40
2,5,27,32
3,3,10,13
4,0,1,1


In [6]:

X_train1, X_test1, y_train1, y_test1 = train_test_split(data1, hour_target['casual'], test_size=0.2)
X_train2, X_test2, y_train2, y_test2 = train_test_split(data1, hour_target['registered'], test_size=0.2)
X_train3, X_test3, y_train3, y_test3 = train_test_split(data1, hour_target['cnt'], test_size=0.2)

In [7]:
# Linear Regression Model with default hyper-parameters
lin1 = LinearRegression()
lin2 = LinearRegression()
lin3 = LinearRegression()

# KNN Regression with default hyper-parameters
knn1 = KNeighborsRegressor()
knn2 = KNeighborsRegressor()
knn3 = KNeighborsRegressor()

# Fit the linear regression to our data
lin1.fit(X_train1, y_train1)
lin2.fit(X_train2, y_train2)
lin3.fit(X_train3, y_train3)

# Fit the KNN regression to our data
knn1.fit(X_train1, y_train1)
knn2.fit(X_train2, y_train2)
knn3.fit(X_train3, y_train3)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [8]:
# Getting predicted values on our test data and checking RMSE to see which model is most accurate
lin_pred1 = lin1.predict(X_test1)
lin_pred2 = lin2.predict(X_test2)
lin_pred3 = lin3.predict(X_test3)

knn_pred1 = knn1.predict(X_test1)
knn_pred2 = knn2.predict(X_test2)
knn_pred3 = knn3.predict(X_test3)

print('RMSE for linear regression on casual hourly rides: ' + str(np.sqrt(mean_squared_error(lin_pred1, y_test1))) + ', R^2: ' + str(r2_score(lin_pred1, y_test1)))
print('RMSE for KNN regression on casual hourly rides: ' + str(np.sqrt(mean_squared_error(knn_pred1, y_test1))) + ', R^2: ' + str(r2_score(knn_pred1, y_test1)))
## Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % lin1.score(X_test1, y_test1))


print('RMSE for linear regression on registered hourly rides: ' + str(np.sqrt(mean_squared_error(lin_pred2, y_test2))) + ', R^2: ' + str(r2_score(lin_pred2, y_test2)))
print('RMSE for KNN regression on registered hourly rides: ' + str(np.sqrt(mean_squared_error(knn_pred2, y_test2))) + ', R^2: ' + str(r2_score(knn_pred2, y_test2)))
## Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % lin2.score(X_test2, y_test2))


print('RMSE for linear regression on total hourly rides: ' + str(np.sqrt(mean_squared_error(lin_pred3, y_test3))) + ', R^2: ' + str(r2_score(lin_pred3, y_test3)))
print('RMSE for KNN regression on total hourly rides: ' + str(np.sqrt(mean_squared_error(knn_pred3, y_test3))) + ', R^2: ' + str(r2_score(knn_pred3, y_test3)))
## Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % lin3.score(X_test3, y_test3))



RMSE for linear regression on casual hourly rides: 32.3072644418, R^2: 0.261164179858
RMSE for KNN regression on casual hourly rides: 21.9476778696, R^2: 0.72535829432
Variance score: 0.56
RMSE for linear regression on registered hourly rides: 97.1850464661, R^2: 0.351696767061
RMSE for KNN regression on registered hourly rides: 89.3576573781, R^2: 0.439451937724
Variance score: 0.61
RMSE for linear regression on total hourly rides: 109.155534333, R^2: 0.439834325483
RMSE for KNN regression on total hourly rides: 94.197527081, R^2: 0.606576331049
Variance score: 0.63
