In [114]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

def splitInputOutput(data):
    x, y = np.array(data[:,0:4], dtype=float), np.array(data[:,4:6],dtype=int)
    return x,y

def splitTestTrain(data):
    L = int(data.shape[0]/5)
    x, y = np.array(data[0:L,:], dtype=float), np.array(data[L:,:],dtype=float)
    return x,y


df = pd.read_csv('data_randomized.csv',sep=',',names=["Day", "Temp", "Humidity", "Wind", "Casual", "Registered","Total"],skiprows=1);
df["Day"] = df.transform(lambda x: x/365.)
        
    
polyError = 1e6
rbfError = 1e6
polyAlpha = 0
polyDegree = 0
rbfAlpha = 0
rbfGamma = 0

Test,Train = splitTestTrain(df.as_matrix())
kf = KFold(n_splits=5)

# search for best hyper-parameters
for a in [0.001, 0.01, 0.1, 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 3.0, 5.0, 10.0, 20.0]:
    for d in [1,2,3,4,5,6,9,12,15]:
        for train_index, test_index in kf.split(Train):
            X,y = splitInputOutput(Train[train_index])

            krr = KernelRidge(alpha=a,kernel='polynomial', degree=d)
            krr.fit(X,y)

            X,y = splitInputOutput(Train[test_index])
            result = krr.predict(X)

            rms = np.sqrt(0.5*np.sum((y[:,1] - result[:,1])**2)/result.shape[0])
            if rms < polyError:
                polyError = rms
                polyAlpha = a
                polyDegree = d

    for g in [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0]:
        for train_index, test_index in kf.split(Train):
            X,y = splitInputOutput(Train[train_index])

            krr = KernelRidge(alpha=a,kernel='rbf', gamma=g)
            krr.fit(X,y)

            X,y = splitInputOutput(Train[test_index])
            result = krr.predict(X)

            rms = np.sqrt(0.5*np.sum((y[:,1] - result[:,1])**2)/result.shape[0])
            if rms < rbfError:
                rbfError = rms
                rbfAlpha = a
                rbfGamma = g


print()
print("Alpha = ",polyAlpha," Degree = ",polyDegree)
print("RMS for best polynomial: ",polyError)
print()
print("Alpha = ",rbfAlpha," Gamma = ",rbfGamma)
print("RMS for best Gaussian: ",rbfError)

kf = KFold(n_splits=5, shuffle=True)
bestError = 1e6
print()

# use best hyper-parameters to train the model
if rbfError < polyError:
    print("Gaussian model with alpha = ",rbfAlpha,"and gamma = ",rbfGamma)
    for train_index, test_index in kf.split(Train):
        X,y = splitInputOutput(Train[train_index])
        krr = KernelRidge(alpha=rbfAlpha,kernel='rbf', gamma=rbfGamma)
        krr.fit(X,y)
        X,y = splitInputOutput(Train[test_index])
        result = krr.predict(X)
        rms = np.sqrt(0.5*np.sum((y[:,1] - result[:,1])**2)/result.shape[0])
        if rms<bestError:
            bestError = rms
            model = krr
else:
    print("Polynomial model with alpha = ",polyAlpha,"and degree = ",polyDegree)
    for train_index, test_index in kf.split(Train):
        X,y = splitInputOutput(Train[train_index])
        krr = KernelRidge(alpha=polyAlpha,kernel='polynomial', degree=polyDegree)
        krr.fit(X,y)
        X,y = splitInputOutput(Train[test_index])
        result = krr.predict(X)
        rms = np.sqrt(0.5*np.sum((y[:,1] - result[:,1])**2)/result.shape[0])
        if rms<bestError:
            bestError = rms
            model = krr

# apply model to the test data
X,y = splitInputOutput(Test)
result = model.predict(X)
rms = np.sqrt(0.5*np.sum((y[:,0] - result[:,0])**2)/result.shape[0])
print("RMS for non-registered riders (test data): ",rms)
rms = np.sqrt(0.5*np.sum((y[:,1] - result[:,1])**2)/result.shape[0])
print("RMS for registered riders (test data): ",rms)





Alpha =  0.01  Degree =  15
RMS for best polynomial:  463.562661648

Alpha =  0.1  Gamma =  10.0
RMS for best Gaussian:  473.44664461

Polynomial model with alpha =  0.01 and degree =  15
RMS for non-registered riders (test data):  415.870069258
RMS for registered riders (test data):  468.199858535
