In [3]:
#importing the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Data Preprocessing

In [2]:
#reading the data
data = pd.read_csv('hour.csv')

In [3]:
data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [4]:
#checking for missing values
data.isnull().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [5]:
#finding categorical columns
catCols = []
for i in data.columns:
    if data[i].dtype == 'object':
        catCols.append(i)
print("Categorical columns are ", catCols)

Categorical columns are  ['dteday']


In [6]:
#dropping the categorical columns
data.drop(catCols, axis = 1, inplace = True)

In [7]:
#dropping instant and cnt columns
data.drop(['instant', 'cnt'], axis = 1, inplace = True)

In [8]:
#adding more features
for col in data.columns:
    if col == 'casual' or col == 'registered':
        continue
    newColSquared = col + '^2'
    newColCubed = col + '^3'
    newColSqrt = col + '^0.5'
    newColCurt = col + '^0.33'
    data[newColSquared] = data[col] ** 2
    data[newColCubed] = data[col] ** 3
    data[newColSqrt] = data[col] ** (0.5)
    data[newColCurt] = data[col] ** (1/3)

In [9]:
#creating a seperate copy for 'casual' dataset and 'registered' dataset
dataCasual = data.copy()
dataRegistered = data.copy()
dataCasual.drop('registered', axis = 1, inplace = True)
dataRegistered.drop('casual', axis = 1, inplace = True)

In [10]:
#computing pairwise correlation
corrMatCasual = dataCasual.corr()
corrMatRegistered = dataRegistered.corr()

In [11]:
#checking correlation between 'casual' column and data
casualInfo = corrMatCasual['casual'].sort_values(ascending = False)
casualInfo

casual      1.000000
temp^2      0.462839
temp        0.459616
atemp^2     0.457776
atemp       0.454080
              ...   
hum^0.33   -0.326064
hum^3      -0.331050
hum^0.5    -0.338252
hum^2      -0.342882
hum        -0.347028
Name: casual, Length: 61, dtype: float64

In [12]:
#checking correlation between 'registered' column and data
registerdInfo = corrMatRegistered['registered'].sort_values(ascending = False)
registerdInfo

registered    1.000000
hr^0.5        0.409746
hr^0.33       0.400490
hr            0.374141
temp          0.335361
                ...   
hum^0.33     -0.251604
hum^0.5      -0.263105
hum^3        -0.272241
hum          -0.273933
hum^2        -0.276670
Name: registered, Length: 61, dtype: float64

In [13]:
#dropping all the columns with correlation less than 0.15 for 'casual' dataset
dropColsCasual = []
for idx, val in zip(casualInfo.index, casualInfo):
    if idx != 'casual':
        if abs(val) < 0.15:
            dropColsCasual.append(idx)
dataCasual.drop(dropColsCasual, axis = 1, inplace = True)

In [14]:
#dropping all the columns with correlation less than 0.15 for 'registered' dataset
dropColsRegistered = []
for idx, val in zip(registerdInfo.index, registerdInfo):
    if idx != 'registered':
        if abs(val) < 0.15:
            dropColsRegistered.append(idx)
dataRegistered.drop(dropColsRegistered, axis = 1, inplace = True)

In [15]:
#splitting the dataCasual dataset
yCasual = dataCasual['casual']
XCasual = dataCasual.drop('casual', axis = 1)
#normalizing the dataset
XCasual = (XCasual - XCasual.mean(axis = 0)) / XCasual.std(axis = 0)
XCasual_train, XCasual_val, yCasual_train, yCasual_val = train_test_split(XCasual, yCasual, random_state = 42)

In [16]:
#converting dataframe to numpy array for dataCasual dataset
XBCasual_train = np.concatenate((XCasual_train.to_numpy(), np.ones((XCasual_train.shape[0], 1))), axis = 1)
XBCasual_val = np.concatenate((XCasual_val.to_numpy(), np.ones((XCasual_val.shape[0], 1))), axis = 1)
yCasual_train = yCasual_train.to_numpy()
yCasual_val = yCasual_val.to_numpy()

In [17]:
#splitting the dataRegistered dataset
yReg = dataRegistered['registered']
XReg = dataRegistered.drop('registered', axis = 1)
#normalizing the dataset
XReg = (XReg - XReg.mean(axis = 0)) / XReg.std(axis = 0)
XReg_train, XReg_val, yReg_train, yReg_val = train_test_split(XReg, yReg, random_state = 42)

In [18]:
#converting dataframe to numpy array for dataRegistered dataset
XBReg_train = np.concatenate((XReg_train.to_numpy(), np.ones((XReg_train.shape[0], 1))), axis = 1)
XBReg_val = np.concatenate((XReg_val.to_numpy(), np.ones((XReg_val.shape[0], 1))), axis = 1)
yReg_train = yReg_train.to_numpy()
yReg_val = yReg_val.to_numpy()

# Modelling

In [19]:
def model(X, y, lr, alpha, epochs, batch_size):
    w = np.random.rand(X.shape[1],)
    for e in range(epochs):
        miniBatchNo = np.random.randint(int(len(X) / batch_size))
        Xi = X[miniBatchNo * batch_size : (miniBatchNo+1) * batch_size]
        yi = y[miniBatchNo * batch_size : (miniBatchNo+1) * batch_size]
        a = np.dot(np.identity(Xi.shape[1]), alpha)
        g = np.dot(np.dot(Xi.T, Xi), w) - np.dot(Xi.T, yi) + np.dot(a,w)
        w -= (lr * g)
        diff = yi - np.dot(Xi, w)
        mse = np.dot(diff.T, diff) / y.shape[0]
#         print("MSE for " + str(e) + " epoch is " + str(mse))
    return w

#  K-fold cross validation

In [20]:
def kcv(k, X, y, lr, alpha, epochs, batch_size):
    dataSize = int(len(X) / k)
    mseOP = np.inf
    for i in range(k):
        X_val = X[i * dataSize:(i+1) * dataSize]
        y_val = y[i * dataSize:(i+1) * dataSize]
        X_train = np.concatenate((X[:i * dataSize], X[(i+1) * dataSize:]), axis = 0)
        y_train = np.concatenate((y[:i * dataSize], y[(i+1) * dataSize:]), axis = 0)
        for b in batch_size:
            for l in lr:
                for a in alpha:
                    w = model(X_train, y_train, l, a, epochs, b)
                    pred = np.dot(X_val, w)
                    diff = y_val - pred
                    mse = np.dot(diff.T, diff) / y_val.shape[0]
                    if mse < mseOP:
                        mseOP = mse
                        op = {
                            'error' : mse,
                            'w' : w,
                            'lr' : l,
                            'alpha' : a,
                            'epochs' : epochs,
                            'batchSize' : b
                        }
    
    return op

In [28]:
#getting optimal parameter values for casual bike dataset
casualParams = kcv(10, XBCasual_train, yCasual_train, [0.0001, 0.00001, 0.000001], [0.5, 1, 5, 10], 10000, [1, 10, 100])
casualParams

{'error': 1098.0733251094864,
 'w': array([ 40.78210304,  -3.22109906,  -0.19127645,   9.2612318 ,
         10.2675932 , -10.79007035,   1.17949859, -38.91005541,
         21.46633126, -16.62081941,   3.10909164,  -3.05098946,
         -3.16822814,  -2.92549187,  -2.78729254,  -1.00442638,
          0.39041853,   0.51421569,  11.31228304,  -3.11616733,
         -0.77478825,  -5.90251916,   6.11629738, -12.79035288,
          4.3196464 ,   1.18050184,  -6.49741228,   4.12046169,
         -3.41306367,   5.59031694,  35.61336852]),
 'lr': 0.0001,
 'alpha': 0.5,
 'epochs': 10000,
 'batchSize': 100}

In [29]:
#getting optimal parameter values for registered bike dataset
regParams = kcv(10, XBReg_train, yReg_train, [0.0001, 0.00001, 0.000001], [0.5, 1, 5, 10], 10000, [1, 10, 100])
regParams

{'error': 10869.806416064517,
 'w': array([   6.48089878,    6.87558213,  141.2938836 ,   19.10123266,
          22.3851206 ,   -9.27216866,    7.59450075,    8.37572358,
           7.15907486,    6.67797598,    6.74464649,    6.64888738,
          -4.01994086,   26.41898417, -137.87850151,   59.1800198 ,
         -56.09957905,   23.00088417,   -8.76296551,   -4.12533125,
         -16.14881818,   10.57318371,  -31.12798348,    9.63074297,
           0.66935022,  -10.15925468,   -7.12386196,   -0.56449124,
          10.7756111 ,  152.92086408]),
 'lr': 0.0001,
 'alpha': 0.5,
 'epochs': 10000,
 'batchSize': 100}

In [30]:
#getting final weights for casual bike dataset
wCasual = model(XBCasual_train, yCasual_train, casualParams['lr'], casualParams['alpha'], casualParams['epochs'], casualParams['batchSize'])
wCasual

array([ 40.64555436,  -2.79711396,  -0.29939725,   9.22887872,
        10.38628545, -11.22334547,   1.26518654, -38.71548411,
        21.55433925, -16.34032612,   2.30582551,  -3.25967719,
        -2.91895285,  -2.76706835,  -3.06481016,  -1.98075684,
         1.00779203,   1.37310822,  11.18312769,  -3.10284326,
        -0.93788105,  -5.78883358,   5.91122552, -12.60416161,
         4.62016424,   0.85930262,  -6.61321795,   3.91521133,
        -3.55773683,   6.24891065,  35.70964181])

In [31]:
#getting final weights for registered bike dataset
wReg = model(XBReg_train, yReg_train, regParams['lr'], regParams['alpha'], regParams['epochs'], regParams['batchSize'])
wReg

array([   5.33014583,    7.43204389,  142.77760733,   19.59078741,
         23.1748703 ,   -8.6201584 ,    7.82614962,    9.40791686,
          7.62119431,    7.58263982,    7.80086353,    7.91121566,
         -2.35559959,   25.9685931 , -138.50292042,   60.18949104,
        -55.18089719,   23.04842997,   -9.15732305,   -4.01848277,
        -16.24126603,   11.52705411,  -31.10921519,   10.25653353,
          0.46234443,  -10.72388567,   -8.7287443 ,    0.33923214,
         11.17413588,  153.77439605])

# Results

In [32]:
#function to calculate MSE
def getScore(w, X, y):
    pred = np.dot(X, w)
    diff = y - pred
    mse = np.dot(diff.T, diff) / y.shape[0]
    return mse

In [33]:
#Score for Casual rental bikes
print("MSE for Casual Rental Bikes is " + str(getScore(wCasual, XBCasual_val, yCasual_val)))

MSE for Casual Rental Bikes is 1164.1368658837316


In [34]:
#Score for Registered rental bikes
print("MSE for Registered Rental Bikes is " + str(getScore(wReg, XBReg_train, yReg_train)))

MSE for Registered Rental Bikes is 12988.86686064716


In [14]:
x = np.array([5, -3, -1, 2])
a = np.array([4, -2, 6, -1])

In [17]:
np.dot(a.T, x)

18