# Rozdział 2 

1. Wypróbuj regresor maszyny wektorów nośnych (sklearn.svm.SVR) przy użyciu różnych hiperparametrów, takich jak kernel="linear" (oraz różnych wartości hiperparametru C) lub kernel="rbf" (oraz różnych wartości hiperparametrów C i gamma). Na razie nie przejmuj się tym, że nie wiesz, do czego te hiperparametry służą. Jak się spisuje najlepszy predyktor maszyny wektorów nośnych?

We import the data and quickly move to the part where we use an algorithm. That code was discussed in the other notebook.

In [6]:
import pandas as pd
housing = pd.read_csv("housing.csv")

import numpy as np
housing["income_cat"] = pd.cut(housing["median_income"],
 bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
 labels=[1, 2, 3, 4, 5])

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
 strat_train_set = housing.loc[train_index]
 strat_test_set = housing.loc[test_index]
    
for set_ in (strat_train_set, strat_test_set):
 set_.drop("income_cat", axis=1, inplace=True)

housing = strat_train_set.copy()

housing["Pokoje_na_rodzinę"] = housing["total_rooms"]/housing["households"]
housing["Sypialnie_na_pokoje"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["Populacja_na_rodzinę"] = housing["population"]/housing["households"]
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

from sklearn.preprocessing import OneHotEncoder
housing_cat = housing[["ocean_proximity"]]
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()
cat_encoder.categories_

from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
 def __init__(self, add_bedrooms_per_room = True): # Żadnych zmiennych *args ani **kargs
     self.add_bedrooms_per_room = add_bedrooms_per_room
 def fit(self, X, y=None):
     return self # Nie robi nic innego
 def transform(self, X):
     Pokoje_na_rodzinę = X[:, rooms_ix] / X[:, households_ix]
     Populacja_na_rodzinę = X[:, population_ix] / X[:, households_ix]
     if self.add_bedrooms_per_room:
         Sypialnie_na_pokoje = X[:, bedrooms_ix] / X[:, rooms_ix]
         return np.c_[X, Pokoje_na_rodzinę, Populacja_na_rodzinę, Sypialnie_na_pokoje]
     else:
         return np.c_[X, Pokoje_na_rodzinę, Populacja_na_rodzinę]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
 ('imputer', SimpleImputer(strategy="median")),
 ('attribs_adder', CombinedAttributesAdder()),
 ('std_scaler', StandardScaler()),
 ])
housing_num_tr = num_pipeline.fit_transform(housing_num)

from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
 ("num", num_pipeline, num_attribs),
 ("cat", OneHotEncoder(), cat_attribs),
 ])
housing_prepared = full_pipeline.fit_transform(housing)

With simple $\textbf{scikit-learn}$ syntax we can easily use different model. Here we try support vector machine.

In [14]:
from sklearn.svm import SVR
svr_lin = SVR(kernel = "linear")
svr_lin.fit(housing_prepared, housing_labels)

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions with linear kernel:", svr_lin.predict(some_data_prepared))

from sklearn.metrics import mean_squared_error
housing_predictions_lin = svr_lin.predict(housing_prepared)
svr_lin_mse = mean_squared_error(housing_labels, housing_predictions_lin)
svr_lin_rmse = np.sqrt(svr_lin_mse)
print("RMSE for linear kernel:", svr_lin_rmse)

svr_rbf = SVR(kernel = "rbf")
svr_rbf.fit(housing_prepared, housing_labels)

print("Predictions with rbf kernel:", svr_rbf.predict(some_data_prepared))
print("True values:", list(some_labels))

housing_predictions_rbf = svr_rbf.predict(housing_prepared)
svr_rbf_mse = mean_squared_error(housing_labels, housing_predictions_rbf)
svr_rbf_rmse = np.sqrt(svr_rbf_mse)

print("Predictions of some values with linear kernel:", svr_lin.predict(some_data_prepared))
print("Predictions of some values with rbf kernel:", svr_rbf.predict(some_data_prepared))
print("True values of predictions above:", list(some_labels))
print("RMSE for linear kernel:", svr_lin_rmse)
print("RMSE for rbf kernel:" ,svr_rbf_rmse)


Predictions with linear kernel: [175421.01071251 190268.74555128 175653.26689489 165556.9789596
 183911.42788418]
RMSE for linear kernel: 111094.6308539982
Predictions with rbf kernel: [179072.49583029 179914.66366523 179110.86627073 178283.07121413
 179552.22286513]
True values: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
Predictions of some values with linear kernel: [175421.01071251 190268.74555128 175653.26689489 165556.9789596
 183911.42788418]
Predictions of some values with rbf kernel: [179072.49583029 179914.66366523 179110.86627073 178283.07121413
 179552.22286513]
True values of predictions above: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
RMSE for linear kernel: 111094.6308539982
RMSE for rbf kernel: 118580.68301157995


Now we will briefly search for best hyperparameters of this model.

In [28]:
from sklearn.model_selection import GridSearchCV
param_grid = param_grid = [
 {'kernel': ['linear'], 'C': [0.1, 0.5, 1, 2]},
 {'kernel': ['rbf'], 'C': [0.1, 0.5, 1, 2], 'gamma': [0.1, 0.2, 0.5, 1]},
]
svr_reg = SVR()
grid_search = GridSearchCV(svr_reg, param_grid, cv=5,
 scoring='neg_mean_squared_error',
 return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_
grid_search.best_estimator_

SVR(C=2, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [20]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

87703.7515098152 {'C': 8, 'kernel': 'linear'}
116681.25000751972 {'C': 8, 'kernel': 'rbf'}
82383.53727524489 {'C': 12, 'kernel': 'linear'}
115593.80141382401 {'C': 12, 'kernel': 'rbf'}
80017.67895855475 {'C': 16, 'kernel': 'linear'}
114539.66374455532 {'C': 16, 'kernel': 'rbf'}
78086.24318605234 {'C': 20, 'kernel': 'linear'}
113501.46927897457 {'C': 20, 'kernel': 'rbf'}


2. Spróbuj zastąpić klasę GridSearchCV obiektem RandomizedSearchCV

From previous exercise we see, that results for rbf kernel are consistently bad, but results for linear kernel get better with increasing of parameter C. Let us use $\textbf{sklearn.model_selection.RandomisedSearchCV}$ to search for $C \wave U(20, 28)$.

In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
svr_rndm = SVR()
distributions = {"C": uniform(loc = 20, scale = 20), "kernel": ["linear"]} #no gamma for rbf, needs a fix
rndmcv = RandomizedSearchCV(svr_rndm, distributions, random_state=0,  
                            scoring='neg_mean_squared_error', return_train_score=True)
rndmcv.fit(housing_prepared, housing_labels)
rndmcv.best_params_

{'C': 39.27325521002059, 'kernel': 'linear'}

In [27]:
cvres = rndmcv.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(-mean_score, params)

-0.5762074777370085 {'C': 30.976270078546495, 'kernel': 'linear'}
-0.5820809292115182 {'C': 34.30378732744839, 'kernel': 'linear'}
-0.5783479838922129 {'C': 32.05526752143288, 'kernel': 'linear'}
-0.5760619321277074 {'C': 30.897663659937937, 'kernel': 'linear'}
-0.5715959792741946 {'C': 28.473095986778095, 'kernel': 'linear'}
-0.5796889401877381 {'C': 32.91788226133312, 'kernel': 'linear'}
-0.5722300139154649 {'C': 28.75174422525385, 'kernel': 'linear'}
-0.5869570101787996 {'C': 37.835460015641594, 'kernel': 'linear'}
-0.5888510788693587 {'C': 39.27325521002059, 'kernel': 'linear'}
-0.569781688399187 {'C': 27.668830376515555, 'kernel': 'linear'}
