# Basic Models

In [123]:
from common import get_dataset

X_train, y_train, X_test, y_test = get_dataset()

## Median Income as the Only Predictor

In [124]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


# Items here are placeholder to be changed by GridSearchCV param_grid
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])

param_grid = [
    { 'scaler': [StandardScaler(), FunctionTransformer(lambda x: x)] },
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train[['median_income']], y_train.values.ravel())

-model.best_score_ / 1000

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END ......scaler=StandardScaler();, score=-85282.540 total time=   0.0s
[CV 2/5] END ......scaler=StandardScaler();, score=-82301.434 total time=   0.0s
[CV 3/5] END ......scaler=StandardScaler();, score=-83432.254 total time=   0.0s
[CV 4/5] END ......scaler=StandardScaler();, score=-83217.398 total time=   0.0s
[CV 1/5] END scaler=FunctionTransformer(func=<function <lambda> at 0x733c5d1aeac0>);, score=-85282.540 total time=   0.0s
[CV 5/5] END ......scaler=StandardScaler();, score=-83988.052 total time=   0.0s
[CV 2/5] END scaler=FunctionTransformer(func=<function <lambda> at 0x769d2083a020>);, score=-82301.434 total time=   0.0s
[CV 3/5] END scaler=FunctionTransformer(func=<function <lambda> at 0x7dff9aef8680>);, score=-83432.254 total time=   0.0s
[CV 4/5] END scaler=FunctionTransformer(func=<function <lambda> at 0x79d1266ff100>);, score=-83217.398 total time=   0.0s
[CV 5/5] END scaler=FunctionTransformer(func=<

np.float64(83.64433563670974)

In [125]:
import pandas as pd


pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,param_regressor,param_regressor__kernel,param_regressor__max_iter,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005562,0.00047,0.002721,0.000167,StandardScaler(),,,,,{'scaler': StandardScaler()},-85282.539998,-82301.433834,-83432.254147,-83217.39839,-83988.051815,-83644.335637,983.106664,1
1,0.003636,0.000189,0.002289,0.000383,FunctionTransformer(func=<function <lambda> at...,,,,,{'scaler': FunctionTransformer(func=<function ...,-85282.539998,-82301.433834,-83432.254147,-83217.39839,-83988.051815,-83644.335637,983.106664,1
2,0.005584,0.000475,0.003044,0.000177,,LinearRegression(),,,,{'regressor': LinearRegression()},-85282.539998,-82301.433834,-83432.254147,-83217.39839,-83988.051815,-83644.335637,983.106664,1
6,0.03836,0.016146,0.004818,0.00201,,SGDRegressor(),,2000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-85287.986888,-82294.679032,-83453.343688,-83200.820573,-84025.84147,-83652.53433,990.106715,4
5,0.04242,0.019237,0.005928,0.003191,,SGDRegressor(),,1000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-85408.351667,-82296.746538,-83486.236701,-83237.203784,-83987.401178,-83683.187973,1022.609114,5
10,0.009054,0.000608,0.011586,0.000213,,KNeighborsRegressor(),,,,{'regressor': KNeighborsRegressor()},-90462.267025,-89942.176901,-88491.815617,-89911.585192,-90623.754055,-89886.319758,751.521757,6
9,4.070227,0.404308,0.122291,0.021252,,RandomForestRegressor(),,,100.0,"{'regressor': RandomForestRegressor(), 'regres...",-95210.093962,-97000.441941,-94939.639786,-96095.501544,-98118.581994,-96272.851845,1172.558653,7
8,2.148955,0.16891,0.061791,0.003455,,RandomForestRegressor(),,,50.0,"{'regressor': RandomForestRegressor(), 'regres...",-95252.443565,-97483.97088,-94788.834845,-96240.640765,-98281.839558,-96409.545922,1315.598089,8
7,0.405545,0.030406,0.016531,0.000966,,RandomForestRegressor(),,,10.0,"{'regressor': RandomForestRegressor(), 'regres...",-96557.212732,-98213.644278,-96542.941019,-97399.755297,-99248.185069,-97592.347679,1033.214155,9
3,14.455749,0.19949,1.917663,0.061765,,SVR(),linear,,,"{'regressor': SVR(), 'regressor__kernel': 'lin...",-116360.54463,-112267.954073,-114255.526599,-112996.948377,-113515.819993,-113879.358735,1400.372747,10


In [126]:
model.best_estimator_

## Using All Feature for Prediction

In [127]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV


numerical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessing = ColumnTransformer([
    ('numerical_features', numerical_features_pipeline, X_train.select_dtypes(include=['number']).columns),
    ('categorical_features', categorical_features_pipeline, ['ocean_proximity']),
])

pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # regressor placeholder to be changed by GridSearchCV param_grid
    ('regressor', LinearRegression()),
])

param_grid = [
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train, y_train.values.ravel())

-model.best_score_ / 1000

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END .regressor=LinearRegression();, score=-68822.547 total time=   0.0s
[CV 2/5] END .regressor=LinearRegression();, score=-68175.309 total time=   0.1s


[CV 4/5] END .regressor=LinearRegression();, score=-68657.576 total time=   0.1s
[CV 3/5] END .regressor=LinearRegression();, score=-68000.955 total time=   0.1s
[CV 5/5] END .regressor=LinearRegression();, score=-67951.483 total time=   0.1s
[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68270.699 total time=   0.2s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68826.617 total time=   0.2s
[CV 3/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68027.610 total time=   0.2s
[CV 4/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68820.352 total time=   0.2s
[CV 5/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68225.851 total time=   0.1s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-68808.036 total time=   0.1s
[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-68184.828 total time=   0.2s
[CV 4/5] END regressor=SGDRegressor(), regr

np.float64(49.350282268958544)

In [128]:
pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor,param_regressor__kernel,param_regressor__max_iter,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,12.698811,0.998558,0.077562,0.009423,RandomForestRegressor(),,,100.0,"{'regressor': RandomForestRegressor(), 'regres...",-49525.124436,-49187.016244,-48989.981932,-49575.112676,-49474.176057,-49350.282269,224.917893,1
6,8.451873,0.221029,0.073537,0.01719,RandomForestRegressor(),,,50.0,"{'regressor': RandomForestRegressor(), 'regres...",-49384.223399,-49439.244629,-49464.971749,-49551.235789,-49656.870413,-49499.309196,95.449946,2
5,1.842259,0.121596,0.027484,0.003487,RandomForestRegressor(),,,10.0,"{'regressor': RandomForestRegressor(), 'regres...",-52326.23765,-51915.992243,-52365.774261,-51647.685479,-52001.641011,-52051.466129,267.642867,3
8,0.078662,0.029624,0.289336,0.112943,KNeighborsRegressor(),,,,{'regressor': KNeighborsRegressor()},-61356.607488,-62284.950131,-60440.419169,-62050.053393,-62896.858137,-61805.777663,842.103456,4
0,0.046462,0.01521,0.01361,0.001313,LinearRegression(),,,,{'regressor': LinearRegression()},-68822.546519,-68175.309287,-68000.954696,-68657.575745,-67951.482921,-68321.573834,353.561352,5
4,0.14262,0.052047,0.014318,0.004384,SGDRegressor(),,2000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-68808.035583,-68184.827663,-68080.864129,-68856.797764,-68204.853102,-68427.075648,333.982314,6
3,0.166561,0.035633,0.018402,0.008602,SGDRegressor(),,1000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-68826.616978,-68270.698945,-68027.609556,-68820.351852,-68225.851429,-68434.225752,328.195626,7
1,15.602258,0.063211,2.140753,0.048326,SVR(),linear,,,"{'regressor': SVR(), 'regressor__kernel': 'lin...",-115037.662545,-110966.13523,-112979.932217,-111604.306435,-112350.66032,-112587.739349,1400.634542,8
2,19.374288,0.168743,4.859166,0.147286,SVR(),rbf,,,"{'regressor': SVR(), 'regressor__kernel': 'rbf'}",-120712.213006,-116632.242617,-118635.613113,-117152.118175,-117735.471938,-118173.53177,1433.12079,9


In [129]:
model.best_estimator_

## Add Outlier Score

In [130]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV


class OutlierScoreTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, contamination='auto', random_state=None):
    self.contamination = contamination
    self.random_state = random_state
    self.isolation_forest = IsolationForest(contamination=self.contamination,
                                            random_state=self.random_state)

  def fit(self, X, y=None):
    # Fit Isolation Forest on the specified features
    self.isolation_forest.fit(X)
    return self

  def transform(self, X):
    # Predict anomaly scores and add them as a new column
    anomaly_scores = self.isolation_forest.decision_function(X)
    return np.hstack([X, anomaly_scores.reshape(-1, 1)])

  def get_feature_names_out(self, input_features=None):
    # Return the names of all original features plus the new one
    if input_features is None:
        raise ValueError("input_features must be provided to get_feature_names_out.")
    return list(input_features) + ['outlier_score']

numerical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('outlier_flagger', OutlierScoreTransformer()),
    ('scaler', StandardScaler()),
])

categorical_features_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessing = ColumnTransformer([
    ('numerical_features', numerical_features_pipeline, X_train.select_dtypes(include=['number']).columns),
    ('categorical_features', categorical_features_pipeline, ['ocean_proximity']),
])

pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # regressor placeholder to be changed by GridSearchCV param_grid
    ('regressor', LinearRegression()),
])

param_grid = [
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train, y_train.values.ravel())

-model.best_score_ / 1000


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[CV 5/5] END .regressor=LinearRegression();, score=-67946.337 total time=   0.6s
[CV 4/5] END .regressor=LinearRegression();, score=-68658.718 total time=   0.7s
[CV 1/5] END .regressor=LinearRegression();, score=-68824.817 total time=   0.9s
[CV 2/5] END .regressor=LinearRegression();, score=-68182.748 total time=   0.9s
[CV 3/5] END .regressor=LinearRegression();, score=-68003.977 total time=   0.9s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68827.962 total time=   0.9s
[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68192.301 total time=   1.2s
[CV 3/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68097.981 total time=   0.8s
[CV 4/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68978.066 total time=   0.8s
[CV 5/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68320.926 total time=   0.7s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=2000;, score=-68770.490 to

np.float64(49.723762757570526)

In [131]:
model.best_estimator_

## Drop Outliers

In [132]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV


outliers_model = IsolationForest(contamination='auto', random_state=42)
outlier_labels = outliers_model.fit_predict(X_train.select_dtypes(include=['number']))
X_train_no_outliers = X_train[outlier_labels == 1]
y_train_no_outliers = y_train[outlier_labels == 1]

numerical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessing = ColumnTransformer([
    ('numerical_features', numerical_features_pipeline, X_train.select_dtypes(include=['number']).columns),
    ('categorical_features', categorical_features_pipeline, ['ocean_proximity']),
])

pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # regressor placeholder to be changed by GridSearchCV param_grid
    ('regressor', LinearRegression()),
])

param_grid = [
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train_no_outliers, y_train_no_outliers.values.ravel())

-model.best_score_ / 1000

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END .regressor=LinearRegression();, score=-64584.788 total time=   0.0s
[CV 2/5] END .regressor=LinearRegression();, score=-64632.123 total time=   0.0s
[CV 3/5] END .regressor=LinearRegression();, score=-66205.763 total time=   0.1s
[CV 4/5] END .regressor=LinearRegression();, score=-65899.615 total time=   0.0s
[CV 5/5] END .regressor=LinearRegression();, score=-64100.509 total time=   0.0s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-64662.994 total time=   0.1s
[CV 3/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-66352.733 total time=   0.2s
[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-64645.126 total time=   0.3s
[CV 4/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-66149.186 total time=   0.2s
[CV 5/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-64439.610 total time=   0.2s
[CV 1/5] END regressor=SG

np.float64(48.17690455495954)

In [133]:
model.best_estimator_