You can skip the following section: it only covers downloading and preprocessing the dataset.

In [1]:
import os
import tarfile
import urllib
# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives both ``housing.tgz`` and its extracted
        contents; created if it does not exist.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # Refuse unsafe members (absolute paths, links escaping the
            # destination) on interpreters that support extraction filters.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [2]:
import pandas as pd
import numpy as np
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [3]:
# Download and unpack the archive, then read the CSV into a DataFrame.
fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH)
housing = load_housing_data(housing_path=HOUSING_PATH)

In [4]:
from sklearn.model_selection import train_test_split
# Purely random 80/20 split with a fixed seed so the split is reproducible.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Bucket median_income into five categories (labels 1-5); the last bin is
# open-ended. The resulting column is what the stratified split is based on.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# The splitter yields positional indices, so select rows with .iloc; .loc only
# gives the same rows while the index is the default 0..n-1 range.
# .copy() makes both sets independent frames that are safe to modify later.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [7]:
# income_cat was only needed for stratification; remove it from both sets.
# A non-in-place drop avoids mutating derived frames, and errors="ignore"
# keeps the cell safe to re-run once the column is already gone.
strat_train_set, strat_test_set = (
    set_.drop(columns="income_cat", errors="ignore")
    for set_ in (strat_train_set, strat_test_set)
)

In [8]:
# Separate the predictors from the target (median_house_value).
# Note: from here on `housing` refers to the training predictors only.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns=["median_house_value"])

## Custom Transformers

Let's create a custom transformer.

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
# Look the column positions up by name instead of hard-coding them, so the
# transformer below keeps working if the column order changes.
col_names = ("total_rooms", "total_bedrooms", "population", "households")
rooms_ix, bedrooms_ix, population_ix, households_ix = map(
    housing.columns.get_loc, col_names
)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array of housing attributes.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    Column positions come from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix``.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs: hyperparameters only
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with the extra ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *extra_columns)]
# Try the transformer on the raw values, without the bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(X=housing.values)
# Show the input array the transformer received.
housing.values

array([[-121.46, 38.52, 29.0, ..., 706.0, 2.1736, 'INLAND'],
       [-117.23, 33.09, 7.0, ..., 768.0, 6.3373, 'NEAR OCEAN'],
       [-119.04, 35.37, 44.0, ..., 300.0, 2.875, 'INLAND'],
       ...,
       [-122.72, 38.44, 48.0, ..., 172.0, 3.1797, '<1H OCEAN'],
       [-122.7, 38.31, 14.0, ..., 501.0, 4.1964, '<1H OCEAN'],
       [-122.14, 39.97, 27.0, ..., 197.0, 3.1319, 'INLAND']], dtype=object)

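The previous code defines a transformer that adds new combined features (rooms per household, population per household and, optionally, bedrooms per room). It inherits from scikit-learn's `BaseEstimator` and `TransformerMixin` base classes.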

## Transformation Pipeline

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numerical columns only: everything except the categorical ocean_proximity.
housing_num = housing.drop(columns="ocean_proximity")

# Impute with the median, add the combined ratio features, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

A `ColumnTransformer` lets us process the numerical and the categorical columns in a single step:

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route each group of columns to its own preprocessing.
num_attribs = list(housing_num)      # names of the numerical columns
cat_attribs = ["ocean_proximity"]    # the single categorical column
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)


## 1- Using SVM model on the data 

### GridSearchCV

Using the SVR estimator

In [13]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV 

SVR_reg = SVR()
# Two sub-grids: 4 linear candidates and 4 x 2 RBF candidates (12 in total).
params = [
    dict(C=[1, 10, 100, 1000], kernel=["linear"]),
    dict(C=[1, 10, 100, 1000], gamma=[0.001, 0.0001], kernel=["rbf"]),
]
svm_grid_search = GridSearchCV(
    estimator=SVR_reg,
    param_grid=params,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)

Disclaimer: it may take an eternity lol!

In [14]:
# Run the full grid search on the prepared training data (slow).
svm_grid_search.fit(X=housing_prepared, y=housing_labels)

In [15]:
# Hyperparameters of the best candidate found by the grid search.
svm_grid_search.best_params_

{'C': 1000, 'kernel': 'linear'}

In [16]:
# Display the estimator selected by the grid search.
svm_grid_search.best_estimator_

In [17]:
 # Print the cross-validated RMSE of every candidate next to its parameters.
 cvres = svm_grid_search.cv_results_
 # Scores are negated MSE (scoring="neg_mean_squared_error"): flip the sign
 # before taking the square root. The loop variable is deliberately not called
 # `params`, so it does not overwrite the notebook-level parameter grid.
 for mean_score, candidate_params in zip(cvres["mean_test_score"], cvres["params"]):
     print(np.sqrt(-mean_score), candidate_params)

112571.06378605746 {'C': 1, 'kernel': 'linear'}
84649.6069847477 {'C': 10, 'kernel': 'linear'}
71635.55363120146 {'C': 100, 'kernel': 'linear'}
70396.4975685597 {'C': 1000, 'kernel': 'linear'}
118924.31070087965 {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
118938.30756431246 {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
118795.07721129213 {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
118923.98270894501 {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
117620.69007715455 {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
118792.37077576606 {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
107354.76152040453 {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
117591.85547157719 {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}


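As we can see, the best result given by the SVR estimator is an RMSE of about 70,396 (linear kernel, `C=1000`). Since the best `C` is the largest value in the grid, larger values of `C` may improve the score further.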

### RandomizedSearchCV

In [35]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values: C and gamma each take six powers of ten from 1e-3 to 1e2.
params = {
    "C": np.logspace(-3, 2, num=6),
    "kernel": ["linear", "rbf"],
    "gamma": np.logspace(-3, 2, num=6),
}
random_search = RandomizedSearchCV(
    estimator=SVR_reg,
    param_distributions=params,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=1,
    return_train_score=True,
)

Disclaimer: it may take an eternity lol!

In [37]:
# Run the randomized search on the prepared training data (slow).
random_search.fit(X=housing_prepared, y=housing_labels)

In [39]:
# Hyperparameters of the best candidate found by the randomized search.
random_search.best_params_

{'kernel': 'linear', 'gamma': 10.0, 'C': 100.0}

In [40]:
# Display the estimator selected by the randomized search.
random_search.best_estimator_

Let's see how each candidate tried by the randomized search performed:

In [41]:
 # Print the cross-validated RMSE of every sampled candidate with its parameters.
 cvres = random_search.cv_results_
 # Scores are negated MSE (scoring="neg_mean_squared_error"): flip the sign
 # before taking the square root. The loop variable is deliberately not called
 # `params`, so it does not overwrite the notebook-level search space.
 for mean_score, candidate_params in zip(cvres["mean_test_score"], cvres["params"]):
     print(np.sqrt(-mean_score), candidate_params)

118939.4732834471 {'kernel': 'rbf', 'gamma': 1.0, 'C': 0.01}
118591.6498917307 {'kernel': 'rbf', 'gamma': 1.0, 'C': 10.0}
118931.92012214728 {'kernel': 'linear', 'gamma': 100.0, 'C': 0.001}
71635.55363120146 {'kernel': 'linear', 'gamma': 10.0, 'C': 100.0}
116181.25173057283 {'kernel': 'rbf', 'gamma': 0.1, 'C': 10.0}
118258.82445795466 {'kernel': 'linear', 'gamma': 0.01, 'C': 0.1}
118819.34364522224 {'kernel': 'rbf', 'gamma': 0.01, 'C': 1.0}
84649.6069847477 {'kernel': 'linear', 'gamma': 1.0, 'C': 10.0}
118939.81287990055 {'kernel': 'rbf', 'gamma': 10.0, 'C': 0.1}
118938.89503192669 {'kernel': 'rbf', 'gamma': 100.0, 'C': 10.0}


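As we can see, the best estimator has an RMSE of about 71,635. Note that the sampled `gamma` has no effect on the linear-kernel candidates: the score for `C=100` is identical to the one obtained in the grid search.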

## 3. Custom Transformer

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    """Return the sorted indices of the ``k`` largest values in ``arr``.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of scores (e.g. feature importances).
    k : int
        Number of indices to return; must satisfy ``0 <= k <= len(arr)``.

    Returns
    -------
    numpy.ndarray
        Indices of the ``k`` largest entries, in ascending index order.

    Raises
    ------
    ValueError
        If ``k`` is negative or larger than the number of values.
    """
    values = np.asarray(arr)
    if not 0 <= k <= values.size:
        raise ValueError(
            f"k must be between 0 and {values.size}, got {k}"
        )
    if k == 0:
        # -0 == 0, so the slice [-k:] below would return *every* index.
        return np.array([], dtype=np.intp)
    return np.sort(np.argpartition(values, -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the ``k`` columns with the highest supplied importances.

    The importances are passed in by the caller; this transformer does not
    compute them.
    """

    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k

    def fit(self, X, y=None):
        """Store the indices of the top-``k`` features and return ``self``."""
        top_indices = indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = top_indices
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected during ``fit``."""
        selected = self.feature_indices_
        return X[:, selected]

The above transformer only selects the k most important features, based on the feature importances it is given. It does not compute the feature importances by itself. Therefore, we need to create a pipeline that computes the feature importances first, then feeds them to the TopFeatureSelector: