<a href="https://colab.research.google.com/github/AnHaiTrinh/handson-ml/blob/main/CaliforniaHousing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

import sklearn

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Notebook-wide font sizes: axis labels at 14pt, tick labels on both axes at 12pt.
mpl.rc("axes", labelsize=14)
mpl.rc(("xtick", "ytick"), labelsize=12)

In [None]:
import os
import tarfile
from six.moves import urllib
# Remote location of the dataset archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives the downloaded ``housing.tgz`` and the
        extracted files. It is created (including parents) when missing.
    """
    # exist_ok=True replaces the isdir()/makedirs() pair and its
    # check-then-create race.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: the "data" filter rejects
            # members that would be written outside housing_path.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older Python without extraction filters: plain extraction.
            housing_tgz.extractall(path=housing_path)

In [None]:
# Download and unpack the housing archive using the default URL and target directory.
fetch_housing_data()

In [None]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Load housing.csv from the default dataset directory into a DataFrame.
housing = load_housing_data()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for final evaluation. The fixed seed keeps the split,
# and every score derived from it, identical after Restart & Run All.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Training split: pull out the target column, then keep the remaining predictors.
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns=["median_house_value"])

In [None]:
# Test split: pull out the target column, then keep the remaining predictors.
housing_test_labels = test_set["median_house_value"].copy()
housing_test = test_set.drop(columns=["median_house_value"])

In [None]:
# Column names routed to the categorical and numeric branches of the preprocessing.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop(columns=cat_attribs)
num_attribs = housing_num.columns.tolist()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric columns: fill missing values with the column median, then standardize.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical column: one-hot encode. handle_unknown="ignore" encodes a category
# that was not present when the encoder was fitted as an all-zero row instead of
# raising during transform() (a rare category can land entirely in the test split).
cat_transformer = OneHotEncoder(handle_unknown="ignore")

# Apply each transformer to its own column subset and concatenate the results.
full_pipeline = ColumnTransformer([
    ("num", num_transformer, num_attribs),
    ("cat", cat_transformer, cat_attribs),
])

# Fit the preprocessing on the training predictors only; later cells reuse the
# fitted pipeline through transform().
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
# Baseline model: RBF-kernel SVR fitted on the prepared training data.
svm_reg = SVR(C=2, kernel="rbf")
svm_reg.fit(X=housing_prepared, y=housing_labels)

# Predict on the test predictors (run through the already-fitted preprocessing)
# and display the mean squared error.
housing_predict_labels = svm_reg.predict(
    full_pipeline.transform(housing_test)
)
mean_squared_error(y_true=housing_test_labels, y_pred=housing_predict_labels)

13340823316.367018

In [None]:
from sklearn.model_selection import GridSearchCV
# SVR requires C > 0 and gamma > 0, so the invalid zero values are dropped from
# both ranges (they made 90 of the 360 fits fail). gamma has no effect on the
# linear kernel, so it is only swept for "rbf"; a list of dicts gives each kernel
# its own sub-grid.
params_grid = [
    {"kernel": ["linear"],
     "C": np.arange(2, 12, 2)},
    {"kernel": ["rbf"],
     "C": np.arange(2, 12, 2),
     "gamma": np.linspace(0, 2, 10)[1:]},  # same spacing as before, minus 0.0
]
svm = SVR()
# 3-fold cross-validated search scored by negative MSE (higher is better).
grid_search = GridSearchCV(svm, params_grid, cv=3, scoring="neg_mean_squared_error",
                           return_train_score=True, n_jobs=-1, verbose=2)
grid_search.fit(housing_prepared, housing_labels)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
[CV] END ......................C=0, gamma=0.0, kernel=linear; total time=   0.0s
[CV] END ......................C=0, gamma=0.0, kernel=linear; total time=   0.0s
[CV] END ......................C=0, gamma=0.0, kernel=linear; total time=   0.0s
[CV] END .........................C=0, gamma=0.0, kernel=rbf; total time=   0.0s
[CV] END .........................C=0, gamma=0.0, kernel=rbf; total time=   0.0s
[CV] END .........................C=0, gamma=0.0, kernel=rbf; total time=   0.0s
[CV] END .......C=0, gamma=0.2222222222222222, kernel=linear; total time=   0.0s
[CV] END .......C=0, gamma=0.2222222222222222, kernel=linear; total time=   0.0s
[CV] END .......C=0, gamma=0.2222222222222222, kernel=linear; total time=   0.0s
[CV] END ..........C=0, gamma=0.2222222222222222, kernel=rbf; total time=   0.0s
[CV] END ..........C=0, gamma=0.2222222222222222, kernel=rbf; total time=   0.0s
[CV] END ..........C=0, gamma=0.22222222222222

[CV] END ..........C=2, gamma=1.3333333333333333, kernel=rbf; total time=  27.5s
[CV] END .......C=2, gamma=1.5555555555555554, kernel=linear; total time=  13.5s
[CV] END .......C=2, gamma=1.5555555555555554, kernel=linear; total time=33.1min
[CV] END .......C=2, gamma=1.5555555555555554, kernel=linear; total time=  13.4s
[CV] END ..........C=2, gamma=1.5555555555555554, kernel=rbf; total time=  27.9s
[CV] END ..........C=2, gamma=1.5555555555555554, kernel=rbf; total time=  27.9s
[CV] END ..........C=2, gamma=1.5555555555555554, kernel=rbf; total time=  28.4s
[CV] END .......C=2, gamma=1.7777777777777777, kernel=linear; total time=  13.3s
[CV] END .......C=2, gamma=1.7777777777777777, kernel=linear; total time=  13.5s
[CV] END .......C=2, gamma=1.7777777777777777, kernel=linear; total time=  13.6s
[CV] END ..........C=2, gamma=1.7777777777777777, kernel=rbf; total time=  27.1s
[CV] END ..........C=2, gamma=1.7777777777777777, kernel=rbf; total time=  29.5s
[CV] END ..........C=2, gamm

[CV] END ..........C=6, gamma=0.6666666666666666, kernel=rbf; total time=   7.6s
[CV] END .......C=6, gamma=0.8888888888888888, kernel=linear; total time=   3.5s
[CV] END .......C=6, gamma=0.8888888888888888, kernel=linear; total time=   3.6s
[CV] END .......C=6, gamma=0.8888888888888888, kernel=linear; total time=   3.7s
[CV] END ..........C=6, gamma=0.8888888888888888, kernel=rbf; total time=   7.5s
[CV] END ..........C=6, gamma=0.8888888888888888, kernel=rbf; total time=   7.5s
[CV] END ..........C=6, gamma=0.8888888888888888, kernel=rbf; total time=   7.4s
[CV] END .......C=6, gamma=1.1111111111111112, kernel=linear; total time=   3.5s
[CV] END .......C=6, gamma=1.1111111111111112, kernel=linear; total time=   3.4s
[CV] END .......C=6, gamma=1.1111111111111112, kernel=linear; total time=   3.5s
[CV] END ..........C=6, gamma=1.1111111111111112, kernel=rbf; total time=  12.7s
[CV] END ..........C=6, gamma=1.1111111111111112, kernel=rbf; total time=   8.0s
[CV] END ..........C=6, gamm

[CV] END ......C=10, gamma=0.2222222222222222, kernel=linear; total time=   3.5s
[CV] END ......C=10, gamma=0.2222222222222222, kernel=linear; total time=   3.6s
[CV] END ......C=10, gamma=0.2222222222222222, kernel=linear; total time=   3.6s
[CV] END .........C=10, gamma=0.2222222222222222, kernel=rbf; total time=   7.7s
[CV] END .........C=10, gamma=0.2222222222222222, kernel=rbf; total time=   7.7s
[CV] END .........C=10, gamma=0.2222222222222222, kernel=rbf; total time=   7.8s
[CV] END ......C=10, gamma=0.4444444444444444, kernel=linear; total time=   3.5s
[CV] END ......C=10, gamma=0.4444444444444444, kernel=linear; total time=   3.5s
[CV] END ......C=10, gamma=0.4444444444444444, kernel=linear; total time=   3.5s
[CV] END .........C=10, gamma=0.4444444444444444, kernel=rbf; total time=   7.9s
[CV] END .........C=10, gamma=0.4444444444444444, kernel=rbf; total time=   7.7s
[CV] END .........C=10, gamma=0.4444444444444444, kernel=rbf; total time=   7.7s
[CV] END ......C=10, gamma=0

90 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Thinkpad\jupyter\handson_ML\env\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Thinkpad\jupyter\handson_ML\env\lib\site-packages\sklearn\svm\_base.py", line 237, in fit
    raise ValueError(msg)
ValueError: gamma value must be > 0; 0.0 is invalid. Use a positive number or use 'auto' to set gamma to a value of 1 / n_features.

--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (

In [None]:
# Display the parameter combination with the best cross-validated score in the grid search.
grid_search.best_params_

{'C': 10, 'gamma': 0.2222222222222222, 'kernel': 'linear'}

In [None]:
# Score the best grid-search model on the held-out test set. The test predictors
# go through the already-fitted preprocessing (transform, not fit_transform).
housing_pred = grid_search.best_estimator_.predict(full_pipeline.transform(housing_test))
# mean_squared_error expects (y_true, y_pred); the value is the same either way
# for MSE, but the order now matches the signature.
mean_squared_error(housing_test_labels, housing_pred)

6488048900.022727

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal
# Distributions sampled by the randomized search: C is drawn log-uniformly from
# [0.5, 10] and gamma from an exponential distribution with scale 1.
params_dists = {
    "kernel": ["linear", "rbf"],
    "C": reciprocal(0.5, 10),
    "gamma": expon(scale=1.0),
}
# random_state fixes the 50 sampled candidates so the search is reproducible
# across kernel restarts.
rnd_search = RandomizedSearchCV(svm, params_dists, n_iter=50, cv=5,
                                scoring="neg_mean_squared_error", n_jobs=-1,
                                return_train_score=True, random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

In [None]:
# Display the sampled parameter combination with the best cross-validated score.
rnd_search.best_params_

{'C': 9.230035192340598, 'gamma': 2.150822659390505, 'kernel': 'linear'}

In [None]:
# Score the best randomized-search model on the held-out test set. The test
# predictors go through the already-fitted preprocessing (transform only).
housing_pred = rnd_search.best_estimator_.predict(full_pipeline.transform(housing_test))
# mean_squared_error expects (y_true, y_pred); the value is the same either way
# for MSE, but the order now matches the signature.
mean_squared_error(housing_test_labels, housing_pred)

6614528672.586175