# Fetch the housing data

In [90]:
import os
import tarfile
import urllib
import numpy as np
# Where the housing archive lives remotely, and the local directory it is
# downloaded into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded archive and its
        extracted contents. It is created if it does not exist yet.
    """
    # `import urllib` alone does not guarantee that the `request` submodule
    # is loaded, so import it explicitly right where it is needed.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only point housing_url at a source you trust.
        housing_tgz.extractall(path=housing_path)
fetch_housing_data()

# Load the housing data

In [91]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
housing = load_housing_data()

# Splitting of data
For this, the common `train_test_split` method could be used. However, if, for example, an expert says that `median_income` is the most influential feature, the split should be stratified on it. Since `median_income` is continuous rather than categorical, we first have to bin it in order to proceed with this approach.

In [92]:
# Bucket the continuous median income into five labelled categories so the
# data can be stratified on it.
housing["income_cat"] = pd.cut(
    x=housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=list(range(1, 6)),
)

In [93]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields *positional* row numbers, so select with .iloc; label-based
# .loc is only equivalent while `housing` keeps its default 0..n-1 index.
# n_splits=1, so a single next() replaces the one-iteration loop.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# .copy() makes both sets independent frames that can be modified safely.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [94]:
# The helper column was only needed for stratification. Rebinding (instead of
# dropping in place) together with errors="ignore" keeps this cell re-runnable:
# a second execution no longer raises KeyError for the already-removed column.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [95]:
# Separate the target column from the predictors of the training set.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

### SimpleImputer fills missing values with the median of each feature. This only works for numerical attributes, so drop ocean_proximity in this step

In [96]:
from sklearn.impute import SimpleImputer

# Imputer configured with the "median" strategy; it is fitted in a later cell.
imputer = SimpleImputer(strategy="median")


In [97]:
# Leave out the text column so only numeric attributes reach the imputer.
housing_num = housing.drop(columns="ocean_proximity")
imputer.fit(housing_num)
# Per-column medians of the numeric training data, shown as the cell output.
housing_num.median().to_numpy()

array([-118.51   ,   34.26   ,   29.     , 2119.     ,  433.     ,
       1164.     ,  408.     ,    3.54155])

In [98]:
# Wrap the imputed values back into a DataFrame that carries the original
# row and column labels.
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(
    data=X,
    index=housing_num.index,
    columns=housing_num.columns,
)

### One hot encoder for text data

In [99]:
from sklearn.preprocessing import OneHotEncoder
# Keep the categorical column as a one-column DataFrame (2-D input).
housing_cat = housing.loc[:, ["ocean_proximity"]]
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit(housing_cat).transform(housing_cat)
# Categories learned by the encoder, shown as the cell output.
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

### Adding attributes

In [100]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional column indices of the source attributes inside the numeric array.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    Adds ``rooms / households`` and ``population / households`` and,
    optionally, ``bedrooms / rooms``. The source columns are located through
    the module-level positional indices ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to append the bedrooms-per-room ratio as a third new column.
    """
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self  # nothing else to do
    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` may be an ndarray or any 2-D array-like such as a DataFrame;
        the result is always a NumPy array.
        """
        # Positional slicing X[:, i] only works on arrays, so coerce first.
        # This is a no-op for ndarray input and adds support for DataFrames.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]

        else:
            return np.c_[X, rooms_per_household, population_per_household]

# Try the transformer on the raw training values, without the optional
# bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.to_numpy())

### Combine the imputer, the attribute adder and the StandardScaler into a pipeline. All but the last estimator must be transformers, meaning they must have a fit_transform() method

In [101]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: impute missing values, append the ratio features,
# then standardise every column.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

## Applying transformation to all data

In [102]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the transformer.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline, the text column is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [103]:
from scipy import stats
from sklearn.metrics import mean_squared_error
def make_prediction_with_grid(grid_search, test_set=None, preprocessor=None,
                              confidence=0.95):
    """Report a fitted search's CV scores and its best model's test-set RMSE.

    Prints, in order: the best parameter combination, one
    ``<cv rmse> <params>`` line per evaluated candidate, the RMSE of the best
    estimator on the test set, and a t-based confidence interval for that
    RMSE.

    Parameters
    ----------
    grid_search : fitted search object
        Must expose ``cv_results_``, ``best_params_`` and ``best_estimator_``.
        Its mean test scores are negated before the square root is taken, so
        it is expected to be scored with a negated squared-error metric.
    test_set : DataFrame, optional
        Held-out rows including the ``median_house_value`` column. Defaults
        to the notebook-level ``strat_test_set``.
    preprocessor : fitted transformer, optional
        Object whose ``transform`` prepares the test predictors for the
        model. Defaults to the notebook-level ``full_pipeline``.
    confidence : float, default=0.95
        Confidence level of the reported interval.
    """
    # Fall back to the notebook globals at call time. Passing these in
    # explicitly protects against `full_pipeline` being rebound by a later
    # cell to something that no longer matches the model being evaluated.
    if test_set is None:
        test_set = strat_test_set
    if preprocessor is None:
        preprocessor = full_pipeline

    cvres = grid_search.cv_results_
    print(grid_search.best_params_)
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        # Flip the sign of the (negated) score before taking the root.
        print(np.sqrt(-mean_score), params)
    final_model = grid_search.best_estimator_

    X_test = test_set.drop("median_house_value", axis=1)
    y_test = test_set["median_house_value"].copy()

    X_test_prepared = preprocessor.transform(X_test)

    final_predictions = final_model.predict(X_test_prepared)

    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print(final_rmse)
    # Interval for the mean squared error, reported on the RMSE scale.
    squared_errors = (final_predictions - y_test) ** 2
    print(np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                             loc=squared_errors.mean(),
                             scale=stats.sem(squared_errors))))

### 1. Support Vector Machine

In [104]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
# Search space: two C values for each of the linear and RBF kernels.
param_grid = [
    dict(kernel=['linear'], C=[1, 10]),
    dict(kernel=['rbf'], C=[1, 10], gamma=['scale']),
]

svr = SVR()

grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

grid_search.fit(housing_prepared, housing_labels)
make_prediction_with_grid(grid_search)

{'C': 10, 'kernel': 'linear'}
112571.06378605746 {'C': 1, 'kernel': 'linear'}
84649.6069847477 {'C': 10, 'kernel': 'linear'}
118638.40200558837 {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
116126.659130923 {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
80641.57697382719
[77898.8231993  83294.06472142]


### 2. RandomizedSearchCV

In [105]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# Candidate values to sample from; each dict is a separate sub-space.
param_grid = [
    {'kernel': ['linear'], 'C': [ 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto', 0.01, 1]}
]


# Only n_iter=2 candidates are drawn at random, so fix random_state:
# without it a different pair is evaluated on every run and the reported
# scores cannot be reproduced.
grid_search = RandomizedSearchCV(svr, param_grid, cv=5,
                           scoring='neg_mean_squared_error', n_iter=2,
                           random_state=42,
                           return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)
make_prediction_with_grid(grid_search)

{'kernel': 'rbf', 'gamma': 'scale', 'C': 1}
118898.89058474178 {'kernel': 'rbf', 'gamma': 1, 'C': 1}
118638.40200558837 {'kernel': 'rbf', 'gamma': 'scale', 'C': 1}
116911.25458976308
[113743.25976841 119995.64057398]


### 3. Transformer - Select K Best features

In [121]:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, f_regression

class SelectKBestTransformer(BaseEstimator, TransformerMixin):
    """Transformer wrapper around ``SelectKBest`` scored with ``f_regression``.

    Parameters
    ----------
    k : int, default=5
        Passed to ``SelectKBest`` as its ``k`` argument.
    """

    def __init__(self, k=5):
        self.selector = None  # replaced by a SelectKBest instance in fit()
        self.k = k

    def fit(self, X, y):
        """Create a new ``SelectKBest`` and fit it on ``X`` and ``y``."""
        kbest = SelectKBest(k=self.k, score_func=f_regression)
        self.selector = kbest
        kbest.fit(X, y)
        return self

    def transform(self, X):
        """Delegate to the fitted selector's ``transform``."""
        return self.selector.transform(X)

    def get_feature_names_out(self, input_features=None):
        """Delegate to the fitted selector's ``get_feature_names_out``."""
        return self.selector.get_feature_names_out(input_features)


# Numeric branch: impute, add ratio features, standardise, then apply the
# k=5 feature selector to the numeric features.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
    ("feature_selection", SelectKBestTransformer(k=5)),
])
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# The labels are passed along because the feature selector's fit() needs them.
housing_prepared = full_pipeline.fit_transform(housing, housing_labels)

# Train an RBF-kernel SVR on the prepared training data.
final_model = SVR(kernel='rbf', C=10, gamma='scale')
final_model.fit(housing_prepared, housing_labels)

# Prepare the held-out set with the already-fitted preprocessing and predict.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

# t-interval for the mean squared error, reported on the RMSE scale.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
print(np.sqrt(stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)))

112348.65576898753
[109177.03154473 115433.16979655]


### 4. Single Pipeline

In [122]:
# Preprocessing for the numeric columns.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

col_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# One estimator from raw frame to prediction: preprocess, select k=5 features
# with f_regression, then regress with an RBF-kernel SVR.
full_pipeline = Pipeline(steps=[
    ("preprocessor", col_pipeline),
    ("feature_selection", SelectKBest(score_func=f_regression, k=5)),
    ("svr", SVR(kernel="rbf", C=10, gamma="scale")),
])

full_pipeline.fit(housing, housing_labels)

# Score the end-to-end pipeline on the held-out set; the raw predictors are
# passed straight to predict().
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

final_predictions = full_pipeline.predict(X_test)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

# t-interval for the mean squared error, reported on the RMSE scale.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
print(np.sqrt(stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)))

109468.27796355786
[106266.44648992 112579.08380786]


### 5. Preparation options in GridSearchCV

In [123]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import IsolationForest


class IsolationForestTransformer(BaseEstimator, TransformerMixin):
    """Drop the rows that an ``IsolationForest`` flags as outliers.

    Parameters
    ----------
    contamination : default=0.1
        Forwarded to ``IsolationForest`` as its ``contamination`` argument.
    random_state : default=None
        Forwarded to ``IsolationForest``; set it to make the fitted forest
        (and therefore the rows that get dropped) reproducible.

    Notes
    -----
    ``transform`` returns fewer rows than it receives. Anything that must
    stay aligned with ``X`` (for example a label vector) has to be filtered
    separately by the caller.
    """
    def __init__(self, contamination=0.1, random_state=None):
        self.contamination = contamination
        self.random_state = random_state
        self.model = None  # built in fit()

    def fit(self, X, y=None):
        """Fit a new ``IsolationForest`` on ``X``; ``y`` is ignored."""
        # Build the forest here rather than in __init__, so that parameters
        # changed through set_params() after construction (as a
        # hyper-parameter search does) are actually used.
        self.model = IsolationForest(contamination=self.contamination,
                                     random_state=self.random_state)
        self.model.fit(X)
        return self

    def transform(self, X):
        """Return only the rows of ``X`` that the forest predicts as inliers."""
        # Predict outliers and keep only inliers
        inliers = self.model.predict(X) == 1
        return X[inliers]
# SVR search space: two C values for each of the linear and RBF kernels.
param_grid = [
    dict(kernel=['linear'], C=[1, 10]),
    dict(kernel=['rbf'], C=[1, 10], gamma=['scale']),
]


num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# NOTE(review): `full` is constructed but never fitted or used below, and its
# last step is a bare IsolationForest rather than IsolationForestTransformer.
# The search therefore runs on data with no outlier filtering applied.
full = Pipeline(steps=[
    ("preprocessor", full_pipeline),
    ("outlier_detection", IsolationForest()),
])
housing_prepared = full_pipeline.fit_transform(housing)

svr = SVR()

grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

grid_search.fit(housing_prepared, housing_labels)
make_prediction_with_grid(grid_search)




{'C': 10, 'kernel': 'linear'}
112571.06378605746 {'C': 1, 'kernel': 'linear'}
84649.6069847477 {'C': 10, 'kernel': 'linear'}
118638.40200558837 {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
116126.659130923 {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
80641.57697382719
[77898.8231993  83294.06472142]
