## Support Vector Machine (Regression) - Seoul Bike Sharing Data

Carico moduli necessari:

In [2]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from utils.pyutils import rmse
from utils.pyutils import round_pred
from utils.pyutils import print_model_scores

import pickle
import pandas as pd
import numpy as np
import itertools

Carico dati:

In [2]:
bike_train = pd.read_csv("data/bike_train_dummy.csv")
bike_test = pd.read_csv("data/bike_test_dummy.csv")
bike_valid = pd.read_csv("data/bike_valid_dummy.csv")

In [3]:
print(bike_train.shape, bike_test.shape, bike_valid.shape)

(6307, 33) (876, 33) (1577, 33)


## Support Vector Regression - all predictors

Converti i dati in array numpy:

In [3]:
columns = bike_train.columns

X_train, y_train = np.array(bike_train[[col for col in columns if col != "rented_bike_count"]]), np.array(bike_train["rented_bike_count"])
X_test, y_test = np.array(bike_test[[col for col in columns if col != "rented_bike_count"]]), np.array(bike_test["rented_bike_count"])
X_valid, y_valid = np.array(bike_valid[[col for col in columns if col != "rented_bike_count"]]), np.array(bike_valid["rented_bike_count"])

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

### Modeling

#### Suppor Vector Regression 1 - linear kernel

In [4]:
bike_SVR1 = svm.SVR(kernel="linear")
bike_SVR1.fit(X_train, y_train)

SVR(kernel='linear')

Save / Load model

In [5]:
pkl_path = "models\\bike_SVR1.pkl"

# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_SVR1, file)

with open(pkl_path, 'rb') as file:
    bike_SVR1 = pickle.load(file)

#### RMSE

In [8]:
print_model_scores(bike_SVR1, X_train, X_valid, X_test, y_train, y_valid, y_test)

Training RMSE:  431.84145325382417
Validation RMSE:  436.97111632198636
Testing RMSE:  439.09611132093727


#### Suppor Vector Regression 2 - rbf kernel

##### Default model

In [9]:
bike_SVR2 = svm.SVR(kernel="rbf")
bike_SVR2.fit(X_train, y_train)

SVR()

In [10]:
print_model_scores(bike_SVR2, X_train, X_valid, X_test, y_train, y_valid, y_test)

Training RMSE:  585.5843898973519
Validation RMSE:  586.3957091511187
Testing RMSE:  592.6727615498329


##### Hyperparameter Search

In [23]:
gamma = [0.01, 0.1, 0.2, 0.5, 1]
C = [0.01, 0.1, 1, 10, 100]
epsilon = [0.05, 0.1, 0.15, 0.2, 0.5]

train_rmse = list()
valid_rmse = list()

hyper_grid = list(itertools.product(gamma, C, epsilon))

In [24]:
for hp in hyper_grid:
    model = svm.SVR(kernel="rbf", gamma=hp[0], C=hp[1], epsilon=hp[2])
    model.fit(X_train, y_train)

    pred_train = round_pred(model, X_train)
    pred_valid = round_pred(model, X_valid)
    
    train_rmse.append(rmse(y_train, pred_train))
    valid_rmse.append(rmse(y_valid, pred_valid))

In [25]:
hyper_grid_df = pd.DataFrame(hyper_grid)
hyper_grid_df.columns = ["gamma", "C", "epsilon"]
hyper_grid_df["train_rmse"] = train_rmse
hyper_grid_df["valid_rmse"] = valid_rmse

Carica hyperparameter grid da csv:

In [3]:
# hyper_grid_df.to_csv("models\\svm_rbf_hypergrid.csv")
hyper_grid_df = pd.read_csv("models\\svm_rbf_hypergrid.csv")
hyper_grid_df.sort_values(by = "valid_rmse").head(10)

Unnamed: 0.1,Unnamed: 0,gamma,C,epsilon,train_rmse,valid_rmse
49,49,0.1,100.0,0.5,318.59442,335.006462
48,48,0.1,100.0,0.2,318.592674,335.017029
45,45,0.1,100.0,0.05,318.592677,335.02461
46,46,0.1,100.0,0.1,318.591149,335.025303
47,47,0.1,100.0,0.15,318.588984,335.028075
24,24,0.01,100.0,0.5,363.546404,368.418994
21,21,0.01,100.0,0.1,363.575616,368.431657
23,23,0.01,100.0,0.2,363.564501,368.433728
22,22,0.01,100.0,0.15,363.5715,368.434209
20,20,0.01,100.0,0.05,363.577386,368.434451


##### Fit best model

In [28]:
bike_SVR2_tuned = svm.SVR(kernel="rbf", gamma=0.1, C=100, epsilon=0.5)
bike_SVR2_tuned.fit(X_train, y_train)

SVR(C=100, epsilon=0.5, gamma=0.1)

Save / Load model 

In [30]:
pkl_path = "models\\bike_SVR2_tuned.pkl"

# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_SVR2_tuned, file)

with open(pkl_path, 'rb') as file:
    bike_SVR2_tuned = pickle.load(file)

#### RMSE

In [32]:
print_model_scores(bike_SVR2_tuned, X_train, X_valid, X_test, y_train, y_valid, y_test)

Training RMSE:  318.59442027407897
Validation RMSE:  335.0064622361973
Testing RMSE:  344.28089970189075


#### Suppor Vector Regression 3 - sigmoid kernel

In [169]:
bike_SVR3 = svm.SVR(kernel="sigmoid")
bike_SVR3.fit(X_train, y_train)

SVR(kernel='sigmoid')

In [170]:
pred = bike_SVR3.predict(X_valid)
mean_squared_error(y_valid, pred) ** 0.5

539.6707047301923

##### Hyperparameter Search

In [49]:
gamma = [0.01, 0.1, 0.2, 0.5, 1]
C = [0.01, 0.1, 1, 10, 100]
epsilon = [0.05, 0.1, 0.15, 0.2, 0.5]

train_rmse = list()
valid_rmse = list()

hyper_grid = list(itertools.product(gamma, C, epsilon))

In [50]:
for hp in hyper_grid:
    model = svm.SVR(kernel="sigmoid", gamma=hp[0], C=hp[1], epsilon=hp[2])
    model.fit(X_train, y_train)

    pred_train = round_pred(model, X_train)
    pred_valid = round_pred(model, X_valid)
    
    train_rmse.append(rmse(y_train, pred_train))
    valid_rmse.append(rmse(y_valid, pred_valid))

In [51]:
hyper_grid_df = pd.DataFrame(hyper_grid)
hyper_grid_df.columns = ["gamma", "C", "epsilon"]
hyper_grid_df["train_rmse"] = train_rmse
hyper_grid_df["valid_rmse"] = valid_rmse

Carica hyperparameter grid da csv:

In [4]:
# hyper_grid_df.to_csv("models\\svm_sigmoid_hypergrid.csv")
hyper_grid_df = pd.read_csv("models\\svm_sigmoid_hypergrid.csv")
hyper_grid_df.sort_values(by = "valid_rmse").head(10)

Unnamed: 0.1,Unnamed: 0,gamma,C,epsilon,train_rmse,valid_rmse
24,24,0.01,100.0,0.5,452.807393,450.752335
22,22,0.01,100.0,0.15,452.80249,450.778801
23,23,0.01,100.0,0.2,452.809224,450.783042
21,21,0.01,100.0,0.1,452.910485,450.888577
20,20,0.01,100.0,0.05,452.943177,450.937786
19,19,0.01,10.0,0.5,477.588154,482.202667
16,16,0.01,10.0,0.1,477.624962,482.241735
17,17,0.01,10.0,0.15,477.623065,482.245286
15,15,0.01,10.0,0.05,477.638387,482.25066
18,18,0.01,10.0,0.2,477.635414,482.252697


##### Fit best model

In [53]:
bike_SVR3_tuned = svm.SVR(kernel="sigmoid", gamma=0.01, C=50, epsilon=0.5)
bike_SVR3_tuned.fit(X_train, y_train)

SVR(C=50, epsilon=0.5, gamma=0.01, kernel='sigmoid')

Save / Load model 

In [56]:
pkl_path = "models\\bike_SVR3_tuned.pkl"

# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_SVR3_tuned, file)

with open(pkl_path, 'rb') as file:
    bike_SVR3_tuned = pickle.load(file)

#### RMSE

In [58]:
print_model_scores(bike_SVR3_tuned, X_train, X_valid, X_test, y_train, y_valid, y_test)

Training RMSE:  445.9335125750761
Validation RMSE:  450.21146444089356
Testing RMSE:  452.0256324307607


#### Support Vector Regression 4 - polynomial kernel

##### Default Model

In [36]:
bike_SVR4 = svm.SVR(kernel="poly", degree=2)
bike_SVR4.fit(X_train, y_train)

SVR(degree=2, kernel='poly')

In [37]:
print_model_scores(bike_SVR4, X_train, X_valid, X_test, y_train, y_valid, y_test)

Training RMSE:  621.7912259612677
Validation RMSE:  622.8708489592332
Testing RMSE:  629.0123300981668


##### Hyperparameter Search

In [38]:
degrees = [2, 3, 4, 5]
C = [0.01, 0.1, 1, 10, 100]
epsilon = [0.05, 0.1, 0.15, 0.2, 0.5]

train_rmse = list()
valid_rmse = list()

hyper_grid = list(itertools.product(degrees, C, epsilon))

In [39]:
for hp in hyper_grid:
    model = svm.SVR(kernel="poly", degree=hp[0], C=hp[1], epsilon=hp[2])
    model.fit(X_train, y_train)

    pred_train = round_pred(model, X_train)
    pred_valid = round_pred(model, X_valid)
    
    train_rmse.append(rmse(y_train, pred_train))
    valid_rmse.append(rmse(y_valid, pred_valid))

In [41]:
hyper_grid_df = pd.DataFrame(hyper_grid)
hyper_grid_df.columns = ["degrees", "C", "epsilon"]
hyper_grid_df["train_rmse"] = train_rmse
hyper_grid_df["valid_rmse"] = valid_rmse

Carica hyperparameter grid da csv:

In [5]:
# hyper_grid_df.to_csv("models\\svm_poly_hypergrid.csv")
hyper_grid_df = pd.read_csv("models\\svm_poly_hypergrid.csv")
hyper_grid_df.sort_values(by = "valid_rmse").head(10)

Unnamed: 0.1,Unnamed: 0,degrees,C,epsilon,train_rmse,valid_rmse
46,46,3,100.0,0.1,356.270512,369.1005
45,45,3,100.0,0.05,356.269215,369.103982
48,48,3,100.0,0.2,356.269152,369.104666
49,49,3,100.0,0.5,356.259782,369.104857
47,47,3,100.0,0.15,356.271913,369.110253
74,74,4,100.0,0.5,369.068709,404.620124
73,73,4,100.0,0.2,369.066028,404.703828
72,72,4,100.0,0.15,369.065289,404.727099
71,71,4,100.0,0.1,369.064539,404.740398
70,70,4,100.0,0.05,369.061715,404.754067


##### Fit best model

In [45]:
bike_SVR4_tuned = svm.SVR(kernel="poly", degree=3, C=100, epsilon=0.5)
bike_SVR4_tuned.fit(X_train, y_train)

SVR(C=100, epsilon=0.5, kernel='poly')

Save/Load model

In [140]:
pkl_path = "models\\bike_SVR4_tuned.pkl"

# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_SVR2, file)

with open(pkl_path, 'rb') as file:
    bike_SVR4_tuned = pickle.load(file)

#### RMSE

In [48]:
print_model_scores(bike_SVR4_tuned, X_train, X_valid, X_test, y_train, y_valid, y_test)

Training RMSE:  356.2597823268401
Validation RMSE:  369.10485730753425
Testing RMSE:  377.59485034169387


## Support Vector Regression - selected predictors

Converti i dati in array numpy:

In [59]:
y_col = "rented_bike_count"
X_col = ["hour", "temperature", "humidity", "functioning_day_Yes", "seasons_Winter", 
"dew_point_temperature", "solar_radiation", "rainfall"]

X_train, y_train = np.array(bike_train[X_col]), np.array(bike_train[y_col])
X_test, y_test = np.array(bike_test[X_col]), np.array(bike_test[y_col])
X_valid, y_valid = np.array(bike_valid[X_col]), np.array(bike_valid[y_col])

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

#### Suppor Vector Regression - rbf kernel

##### Default model

In [60]:
bike_SVR5 = svm.SVR(kernel="rbf")
bike_SVR5.fit(X_train, y_train)

SVR()

In [61]:
print_model_scores(bike_SVR5, X_train, X_valid, X_test, y_train, y_valid, y_test)

Training RMSE:  470.84079212621964
Validation RMSE:  472.55705737973193
Testing RMSE:  479.465597444092


##### Hyperparameter Search

In [62]:
gamma = [0.01, 0.1, 0.2, 0.5, 1]
C = [0.01, 0.1, 1, 10, 100]
epsilon = [0.05, 0.1, 0.15, 0.2, 0.5]

train_rmse = list()
valid_rmse = list()

hyper_grid = list(itertools.product(gamma, C, epsilon))

In [63]:
for hp in hyper_grid:
    model = svm.SVR(kernel="rbf", gamma=hp[0], C=hp[1], epsilon=hp[2])
    model.fit(X_train, y_train)

    pred_train = round_pred(model, X_train)
    pred_valid = round_pred(model, X_valid)
    
    train_rmse.append(rmse(y_train, pred_train))
    valid_rmse.append(rmse(y_valid, pred_valid))

Carica hyperparameter grid da csv:

In [64]:
hyper_grid_df = pd.DataFrame(hyper_grid)
hyper_grid_df.columns = ["gamma", "C", "epsilon"]
hyper_grid_df["train_rmse"] = train_rmse
hyper_grid_df["valid_rmse"] = valid_rmse

In [6]:
# hyper_grid_df.to_csv("models\\svm5_hypergrid.csv")
hyper_grid_df = pd.read_csv("models\\svm5_hypergrid.csv")
hyper_grid_df.sort_values(by = "valid_rmse").head(10)

Unnamed: 0.1,Unnamed: 0,gamma,C,epsilon,train_rmse,valid_rmse
123,123,1.0,100.0,0.2,277.46537,286.647302
124,124,1.0,100.0,0.5,277.462064,286.649331
122,122,1.0,100.0,0.15,277.466526,286.657623
121,121,1.0,100.0,0.1,277.465228,286.661077
120,120,1.0,100.0,0.05,277.457963,286.663608
99,99,0.5,100.0,0.5,295.401647,300.079553
98,98,0.5,100.0,0.2,295.413063,300.085728
96,96,0.5,100.0,0.1,295.423729,300.10852
95,95,0.5,100.0,0.05,295.426565,300.109179
97,97,0.5,100.0,0.15,295.414636,300.112956


#### Fit Best Model

In [67]:
bike_SVR5_tuned = svm.SVR(kernel="rbf", gamma=1.0, C=100, epsilon=0.2)
bike_SVR5_tuned.fit(X_train, y_train)

SVR(C=100, epsilon=0.2, gamma=1.0)

Save/Load model

In [69]:
pkl_path = "models\\bike_SVR5_tuned.pkl"

# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_SVR5_tuned, file)

with open(pkl_path, 'rb') as file:
    bike_SVR5_tuned = pickle.load(file)

#### RMSE

In [71]:
print_model_scores(bike_SVR5_tuned, X_train, X_valid, X_test, y_train, y_valid, y_test)

Training RMSE:  277.46537000541804
Validation RMSE:  286.6473023892358
Testing RMSE:  280.0651343028168
