# Basic Models

In [1]:
from common import get_dataset

X_train, y_train, X_test, y_test = get_dataset()

## Median Income as the Only Predictor

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


# Items here are placeholder to be changed by GridSearchCV param_grid
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])

param_grid = [
    { 'scaler': [StandardScaler(), FunctionTransformer(lambda x: x)] },
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train[['median_income']], y_train.values.ravel())

-model.best_score_ / 1000

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END ......scaler=StandardScaler();, score=-85282.540 total time=   0.0s
[CV 3/5] END ......scaler=StandardScaler();, score=-83432.254 total time=   0.0s
[CV 4/5] END ......scaler=StandardScaler();, score=-83217.398 total time=   0.0s
[CV 2/5] END scaler=FunctionTransformer(func=<function <lambda> at 0x76de47f8eca0>);, score=-82301.434 total time=   0.0s[CV 3/5] END .regressor=LinearRegression();, score=-83432.254 total time=   0.0s

[CV 4/5] END .regressor=LinearRegression();, score=-83217.398 total time=   0.0s
[CV 5/5] END .regressor=LinearRegression();, score=-83988.052 total time=   0.0s
[CV 5/5] END ......scaler=StandardScaler();, score=-83988.052 total time=   0.0s
[CV 4/5] END scaler=FunctionTransformer(func=<function <lambda> at 0x7c0bfc186ca0>);, score=-83217.398 total time=   0.0s
[CV 5/5] END scaler=FunctionTransformer(func=<function <lambda> at 0x7f6b94d8eca0>);, score=-83988.052 total time=   0.0s
[CV 1/

np.float64(83.64433563670974)

In [3]:
import pandas as pd


pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,param_regressor,param_regressor__kernel,param_regressor__max_iter,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.018487,0.002818,0.006986,0.003981,StandardScaler(),,,,,{'scaler': StandardScaler()},-85282.539998,-82301.433834,-83432.254147,-83217.39839,-83988.051815,-83644.335637,983.106664,1
1,0.015373,0.00302,0.005523,0.002496,FunctionTransformer(func=<function <lambda> at...,,,,,{'scaler': FunctionTransformer(func=<function ...,-85282.539998,-82301.433834,-83432.254147,-83217.39839,-83988.051815,-83644.335637,983.106664,1
2,0.010204,0.004934,0.004523,0.002186,,LinearRegression(),,,,{'regressor': LinearRegression()},-85282.539998,-82301.433834,-83432.254147,-83217.39839,-83988.051815,-83644.335637,983.106664,1
5,0.026719,0.002512,0.003118,0.000104,,SGDRegressor(),,1000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-85288.538147,-82292.28144,-83481.176626,-83230.068394,-83997.449275,-83657.902776,985.236983,4
6,0.019508,0.004094,0.003064,7.5e-05,,SGDRegressor(),,2000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-85309.500078,-82294.128954,-83446.080337,-83245.131917,-84035.761089,-83666.120475,994.519026,5
10,0.01032,0.001139,0.014998,0.003621,,KNeighborsRegressor(),,,,{'regressor': KNeighborsRegressor()},-90462.267025,-89942.176901,-88491.815617,-89911.585192,-90623.754055,-89886.319758,751.521757,6
9,4.271842,0.392689,0.129787,0.030448,,RandomForestRegressor(),,,100.0,"{'regressor': RandomForestRegressor(), 'regres...",-95064.471024,-97005.729738,-94661.514962,-96159.168578,-97985.649427,-96175.306746,1224.150724,7
8,2.208856,0.173761,0.063237,0.005459,,RandomForestRegressor(),,,50.0,"{'regressor': RandomForestRegressor(), 'regres...",-95428.31896,-97531.433466,-94672.630001,-96089.478611,-98021.949283,-96348.762064,1258.739784,8
7,0.430902,0.012197,0.017008,0.001585,,RandomForestRegressor(),,,10.0,"{'regressor': RandomForestRegressor(), 'regres...",-97104.955513,-98938.324745,-96238.227344,-97412.04797,-99393.18014,-97817.347142,1175.179051,9
3,15.379985,0.39823,2.037282,0.125917,,SVR(),linear,,,"{'regressor': SVR(), 'regressor__kernel': 'lin...",-116360.54463,-112267.954073,-114255.526599,-112996.948377,-113515.819993,-113879.358735,1400.372747,10


In [4]:
model.best_estimator_

## Using All Feature for Prediction

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV


numerical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessing = ColumnTransformer([
    ('numerical_features', numerical_features_pipeline, X_train.select_dtypes(include=['number']).columns),
    ('categorical_features', categorical_features_pipeline, ['ocean_proximity']),
])

pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # regressor placeholder to be changed by GridSearchCV param_grid
    ('regressor', LinearRegression()),
])

param_grid = [
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train, y_train.values.ravel())

-model.best_score_ / 1000

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[CV 1/5] END .regressor=LinearRegression();, score=-68822.547 total time=   0.1s
[CV 2/5] END .regressor=LinearRegression();, score=-68175.309 total time=   0.1s
[CV 5/5] END .regressor=LinearRegression();, score=-67951.483 total time=   0.1s
[CV 3/5] END .regressor=LinearRegression();, score=-68000.955 total time=   0.1s
[CV 4/5] END .regressor=LinearRegression();, score=-68657.576 total time=   0.1s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68782.260 total time=   0.1s
[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68179.881 total time=   0.3s
[CV 3/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68034.860 total time=   0.2s
[CV 4/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68796.852 total time=   0.1s
[CV 5/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68183.149 total time=   0.1s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-68796.815 to

np.float64(49.51906305731363)

In [6]:
pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor,param_regressor__kernel,param_regressor__max_iter,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,8.818842,0.342342,0.073641,0.015564,RandomForestRegressor(),,,50.0,"{'regressor': RandomForestRegressor(), 'regres...",-49901.482653,-49024.831025,-49205.956481,-49750.556753,-49712.488374,-49519.063057,340.455125,1
7,13.470451,0.51894,0.082368,0.003242,RandomForestRegressor(),,,100.0,"{'regressor': RandomForestRegressor(), 'regres...",-49688.287031,-49249.42187,-49340.607116,-49658.754471,-49763.705621,-49540.155222,205.099451,2
5,1.896045,0.197843,0.03013,0.003525,RandomForestRegressor(),,,10.0,"{'regressor': RandomForestRegressor(), 'regres...",-51675.615536,-51131.276841,-52358.835695,-51808.289724,-52283.149856,-51851.43353,446.146827,3
8,0.065218,0.013469,0.252954,0.018862,KNeighborsRegressor(),,,,{'regressor': KNeighborsRegressor()},-61356.607488,-62284.950131,-60440.419169,-62050.053393,-62896.858137,-61805.777663,842.103456,4
0,0.049175,0.008496,0.014501,0.004048,LinearRegression(),,,,{'regressor': LinearRegression()},-68822.546519,-68175.309287,-68000.954696,-68657.575745,-67951.482921,-68321.573834,353.561352,5
3,0.154567,0.049041,0.011686,0.000591,SGDRegressor(),,1000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-68782.260364,-68179.880867,-68034.859926,-68796.851529,-68183.148531,-68395.400243,326.285948,6
4,0.129394,0.015927,0.013082,0.002131,SGDRegressor(),,2000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-68796.815159,-68115.163431,-68028.604978,-69016.02514,-68245.588935,-68440.439529,392.856558,7
1,17.519157,0.43843,2.401479,0.190019,SVR(),linear,,,"{'regressor': SVR(), 'regressor__kernel': 'lin...",-115037.662545,-110966.13523,-112979.932217,-111604.306435,-112350.66032,-112587.739349,1400.634542,8
2,21.416794,0.520568,5.027597,0.180154,SVR(),rbf,,,"{'regressor': SVR(), 'regressor__kernel': 'rbf'}",-120712.213006,-116632.242617,-118635.613113,-117152.118175,-117735.471938,-118173.53177,1433.12079,9


In [7]:
model.best_estimator_

## Add Outlier Score

In [8]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV


class OutlierScoreTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, contamination='auto', random_state=None):
    self.contamination = contamination
    self.random_state = random_state
    self.isolation_forest = IsolationForest(contamination=self.contamination,
                                            random_state=self.random_state)

  def fit(self, X, y=None):
    # Fit Isolation Forest on the specified features
    self.isolation_forest.fit(X)
    return self

  def transform(self, X):
    # Predict anomaly scores and add them as a new column
    anomaly_scores = self.isolation_forest.decision_function(X)
    return np.hstack([X, anomaly_scores.reshape(-1, 1)])

  def get_feature_names_out(self, input_features=None):
    # Return the names of all original features plus the new one
    if input_features is None:
        raise ValueError("input_features must be provided to get_feature_names_out.")
    return list(input_features) + ['outlier_score']

numerical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('outlier_flagger', OutlierScoreTransformer()),
    ('scaler', StandardScaler()),
])

categorical_features_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessing = ColumnTransformer([
    ('numerical_features', numerical_features_pipeline, X_train.select_dtypes(include=['number']).columns),
    ('categorical_features', categorical_features_pipeline, ['ocean_proximity']),
])

pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # regressor placeholder to be changed by GridSearchCV param_grid
    ('regressor', LinearRegression()),
])

param_grid = [
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train, y_train.values.ravel())

-model.best_score_ / 1000


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[CV 5/5] END .regressor=LinearRegression();, score=-67962.260 total time=   0.6s
[CV 1/5] END .regressor=LinearRegression();, score=-68831.567 total time=   0.8s
[CV 2/5] END .regressor=LinearRegression();, score=-68206.101 total time=   0.9s
[CV 3/5] END .regressor=LinearRegression();, score=-67964.286 total time=   1.0s
[CV 4/5] END .regressor=LinearRegression();, score=-68637.070 total time=   1.2s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68931.731 total time=   1.0s
[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68426.869 total time=   0.9s
[CV 4/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68859.773 total time=   0.7s
[CV 3/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68122.512 total time=   0.9s
[CV 5/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68172.213 total time=   0.7s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-68807.005 to

np.float64(49.76395323425239)

In [9]:
model.best_estimator_

## Drop Outliers

In [10]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV


outliers_model = IsolationForest(contamination='auto', random_state=42)
outlier_labels = outliers_model.fit_predict(X_train.select_dtypes(include=['number']))
X_train_no_outliers = X_train[outlier_labels == 1]
y_train_no_outliers = y_train[outlier_labels == 1]

numerical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessing = ColumnTransformer([
    ('numerical_features', numerical_features_pipeline, X_train.select_dtypes(include=['number']).columns),
    ('categorical_features', categorical_features_pipeline, ['ocean_proximity']),
])

pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # regressor placeholder to be changed by GridSearchCV param_grid
    ('regressor', LinearRegression()),
])

param_grid = [
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train_no_outliers, y_train_no_outliers.values.ravel())

-model.best_score_ / 1000

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END .regressor=LinearRegression();, score=-64584.788 total time=   0.0s
[CV 2/5] END .regressor=LinearRegression();, score=-64632.123 total time=   0.0s
[CV 3/5] END .regressor=LinearRegression();, score=-66205.763 total time=   0.1s
[CV 5/5] END .regressor=LinearRegression();, score=-64100.509 total time=   0.0s
[CV 4/5] END .regressor=LinearRegression();, score=-65899.615 total time=   0.1s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-64548.413 total time=   0.2s
[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-64638.913 total time=   0.3s
[CV 3/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-66188.311 total time=   0.1s
[CV 5/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-64380.958 total time=   0.1s
[CV 4/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-66124.578 total time=   0.2s
[CV 1/5] END regressor=SG

np.float64(48.18032421052097)

In [11]:
model.best_estimator_

## Cluster Similarity Along Coordinates

In [12]:
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import rbf_kernel


class ClusterSimilarityTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, n_clusters=5, gamma=1.0, random_state=None):
    self.n_clusters = n_clusters
    self.gamma = gamma
    self.random_state = random_state
    self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)

  def fit(self, X, y=None):
    # Fit KMeans on the specified features
    self.kmeans.fit(X)
    return self

  def transform(self, X):
    return rbf_kernel(X, self.kmeans.cluster_centers_, gamma=self.gamma)

  def get_feature_names_out(self, names=None):
      return [f"cluster_similarity_{i}" for i in range(self.n_clusters)]

numerical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessing = ColumnTransformer([
    ('cluster_similarity', ClusterSimilarityTransformer(n_clusters=5, random_state=42), ['latitude', 'longitude']),
    ('numerical_features', numerical_features_pipeline, X_train.select_dtypes(include=['number']).columns),
    ('categorical_features', categorical_features_pipeline, ['ocean_proximity']),
])

pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # regressor placeholder to be changed by GridSearchCV param_grid
    ('regressor', LinearRegression()),
])

param_grid = [
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train, y_train.values.ravel())

-model.best_score_ / 1000

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 3/5] END .regressor=LinearRegression();, score=-66719.141 total time=   0.1s
[CV 5/5] END .regressor=LinearRegression();, score=-66571.372 total time=   0.2s
[CV 4/5] END .regressor=LinearRegression();, score=-66964.305 total time=   0.2s
[CV 1/5] END .regressor=LinearRegression();, score=-67383.788 total time=   0.2s
[CV 2/5] END .regressor=LinearRegression();, score=-66517.620 total time=   0.2s


[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-66575.853 total time=   0.3s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-67402.456 total time=   0.4s
[CV 3/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-67062.698 total time=   0.3s
[CV 4/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-67269.247 total time=   0.3s
[CV 5/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-66813.565 total time=   0.2s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-67632.555 total time=   0.3s
[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-66518.701 total time=   0.3s
[CV 3/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-66923.412 total time=   0.3s
[CV 4/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-67196.533 total time=   0.2s
[CV 5/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-66743.299

np.float64(47.57764460521399)

In [13]:
model.best_estimator_

In [None]:
feature_importance = model.best_estimator_['regressor'].feature_importances_


pd.DataFrame({
    'feature': model.best_estimator_['preprocessing'].get_feature_names_out(),
    'importance': feature_importance
}).sort_values(by='importance', ascending=False)



Unnamed: 0,feature,importance
12,numerical_features__median_income,0.474532
14,categorical_features__ocean_proximity_INLAND,0.142077
6,numerical_features__latitude,0.051161
5,numerical_features__longitude,0.044827
7,numerical_features__housing_median_age,0.044248
3,cluster_similarity__cluster_similarity_3,0.044078
2,cluster_similarity__cluster_similarity_2,0.03464
1,cluster_similarity__cluster_similarity_1,0.033254
0,cluster_similarity__cluster_similarity_0,0.03072
10,numerical_features__population,0.022428


### Test on test set

In [40]:
from sklearn.metrics import root_mean_squared_error


test_predictions = model.predict(X_test)

rmse = root_mean_squared_error(y_test, test_predictions)

rmse / 1000

48.4303798464121

## RandomSearchCV with Median Age Cluster Similarity

In [50]:
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics.pairwise import rbf_kernel
from scipy.stats import randint


class ClusterSimilarityTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, n_clusters=5, gamma=1.0, random_state=None):
    self.n_clusters = n_clusters
    self.gamma = gamma
    self.random_state = random_state
    self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)

  def fit(self, X, y=None):
    # Fit KMeans on the specified features
    self.kmeans.fit(X)
    return self

  def transform(self, X):
    return rbf_kernel(X, self.kmeans.cluster_centers_, gamma=self.gamma)

  def get_feature_names_out(self, names=None):
      return [f"cluster_similarity_{i}" for i in range(self.n_clusters)]

numerical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessing = ColumnTransformer([
    ('coordinates_cluster_similarity', ClusterSimilarityTransformer(n_clusters=5, random_state=42), ['latitude', 'longitude']),
    ('housing_median_age_cluster_similarity', ClusterSimilarityTransformer(n_clusters=5, random_state=42), ['housing_median_age']),
    ('numerical_features', numerical_features_pipeline, X_train.select_dtypes(include=['number']).columns),
    ('categorical_features', categorical_features_pipeline, ['ocean_proximity']),
])

pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # regressor placeholder to be changed by GridSearchCV param_grid
    ('regressor', LinearRegression()),
])

param_grid = [
    {'preprocessing__coordinates_cluster_similarity__n_clusters': randint(2, 20)},
    {'preprocessing__housing_median_age_cluster_similarity__n_clusters': randint(2, 10)},
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': randint(1000, 2000),
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__max_features': randint(2, 100)
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = RandomizedSearchCV(
  estimator=pipeline,
  param_distributions=param_grid,
  cv=5,
  n_iter=20,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  random_state=42,
)

model.fit(X_train, y_train.values.ravel())

-model.best_score_ / 1000

np.float64(46.271697024322535)

In [52]:
model.best_estimator_['regressor'].feature_importances_

pd.DataFrame({
    'feature': model.best_estimator_['preprocessing'].get_feature_names_out(),
    'importance': model.best_estimator_['regressor'].feature_importances_
}).sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
17,numerical_features__median_income,0.238208
19,categorical_features__ocean_proximity_INLAND,0.087133
10,numerical_features__longitude,0.07817
0,coordinates_cluster_similarity__cluster_simila...,0.072628
11,numerical_features__latitude,0.071768
1,coordinates_cluster_similarity__cluster_simila...,0.067329
3,coordinates_cluster_similarity__cluster_simila...,0.066511
4,coordinates_cluster_similarity__cluster_simila...,0.06412
2,coordinates_cluster_similarity__cluster_simila...,0.060803
13,numerical_features__total_rooms,0.032394
