# Basic Models

In [16]:
from common import get_dataset

X_train, y_train, X_test, y_test = get_dataset()

## Median Income as the Only Predictor

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


# Items here are placeholder to be changed by GridSearchCV param_grid
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])

param_grid = [
    { 'scaler': [StandardScaler(), FunctionTransformer(lambda x: x)] },
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train[['median_income']], y_train.values.ravel())

-model.best_score_ / 1000

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 4/5] END ......scaler=StandardScaler();, score=-83217.398 total time=   0.0s
[CV 3/5] END .regressor=LinearRegression();, score=-83432.254 total time=   0.0s
[CV 2/5] END ......scaler=StandardScaler();, score=-82301.434 total time=   0.0s
[CV 4/5] END .regressor=LinearRegression();, score=-83217.398 total time=   0.0s
[CV 5/5] END ......scaler=StandardScaler();, score=-83988.052 total time=   0.0s
[CV 5/5] END .regressor=LinearRegression();, score=-83988.052 total time=   0.0s
[CV 1/5] END scaler=FunctionTransformer(func=<function <lambda> at 0x70bbd3396ca0>);, score=-85282.540 total time=   0.0s
[CV 1/5] END .regressor=LinearRegression();, score=-85282.540 total time=   0.0s
[CV 1/5] END ......scaler=StandardScaler();, score=-85282.540 total time=   0.0s
[CV 2/5] END scaler=FunctionTransformer(func=<function <lambda> at 0x757bca386ca0>);, score=-82301.434 total time=   0.0s
[CV 4/5] END scaler=FunctionTransformer(func=<f

np.float64(83.64433563670974)

In [21]:
import pandas as pd


pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,param_regressor,param_regressor__kernel,param_regressor__max_iter,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.020573,0.006523,0.0053,0.001986,StandardScaler(),,,,,{'scaler': StandardScaler()},-85282.539998,-82301.433834,-83432.254147,-83217.39839,-83988.051815,-83644.335637,983.106664,1
1,0.017736,0.003623,0.00545,0.000674,FunctionTransformer(func=<function <lambda> at...,,,,,{'scaler': FunctionTransformer(func=<function ...,-85282.539998,-82301.433834,-83432.254147,-83217.39839,-83988.051815,-83644.335637,983.106664,1
2,0.01586,0.010516,0.004157,0.00114,,LinearRegression(),,,,{'regressor': LinearRegression()},-85282.539998,-82301.433834,-83432.254147,-83217.39839,-83988.051815,-83644.335637,983.106664,1
6,0.033711,0.010928,0.005284,0.003386,,SGDRegressor(),,2000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-85286.002718,-82301.306001,-83482.277091,-83261.493674,-83995.110355,-83665.237968,979.02898,4
5,0.033682,0.009022,0.004512,0.001838,,SGDRegressor(),,1000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-85280.252832,-82334.324386,-83482.558583,-83290.323962,-83989.155282,-83675.323009,965.189114,5
10,0.016781,0.004637,0.020107,0.006023,,KNeighborsRegressor(),,,,{'regressor': KNeighborsRegressor()},-90462.267025,-89942.176901,-88491.815617,-89911.585192,-90623.754055,-89886.319758,751.521757,6
9,4.261766,0.516691,0.128622,0.031025,,RandomForestRegressor(),,,100.0,"{'regressor': RandomForestRegressor(), 'regres...",-95222.831828,-97266.120062,-94701.453057,-95787.287194,-98046.248142,-96204.788057,1258.249737,7
8,2.062187,0.074324,0.065537,0.004504,,RandomForestRegressor(),,,50.0,"{'regressor': RandomForestRegressor(), 'regres...",-95263.215905,-97203.083405,-94838.051772,-96124.674506,-97792.344407,-96244.273999,1119.82833,8
7,0.432609,0.022682,0.020499,0.008274,,RandomForestRegressor(),,,10.0,"{'regressor': RandomForestRegressor(), 'regres...",-96201.12541,-97861.127924,-95720.154142,-97619.121971,-99594.222361,-97399.150362,1366.341389,9
3,14.620004,0.208442,2.102802,0.155913,,SVR(),linear,,,"{'regressor': SVR(), 'regressor__kernel': 'lin...",-116360.54463,-112267.954073,-114255.526599,-112996.948377,-113515.819993,-113879.358735,1400.372747,10


In [22]:
model.best_estimator_

## Using All Feature for Prediction

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV


numerical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

column_transformer = ColumnTransformer([
    ('numerical_features', numerical_features_pipeline, X_train.select_dtypes(include=['number']).columns),
    ('categorical_features', categorical_features_pipeline, ['ocean_proximity']),
])

pipeline = Pipeline([
    ('preprocessing', column_transformer),
    # regressor placeholder to be changed by GridSearchCV param_grid
    ('regressor', LinearRegression()),
])

param_grid = [
    {'regressor': [LinearRegression()]},
    {
      'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
    },
    {
      'regressor': [SGDRegressor()],
        'regressor__max_iter': [1000, 2000],
    },
    {
      'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [10, 50, 100]
    },
    { 'regressor': [KNeighborsRegressor()] }
]

model = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  scoring='neg_root_mean_squared_error',
  n_jobs=-1,
  verbose=3,
)

model.fit(X_train, y_train.values.ravel())

-model.best_score_ / 1000

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END .regressor=LinearRegression();, score=-68822.547 total time=   0.1s
[CV 2/5] END .regressor=LinearRegression();, score=-68175.309 total time=   0.1s
[CV 4/5] END .regressor=LinearRegression();, score=-68657.576 total time=   0.1s
[CV 3/5] END .regressor=LinearRegression();, score=-68000.955 total time=   0.1s
[CV 5/5] END .regressor=LinearRegression();, score=-67951.483 total time=   0.1s
[CV 2/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68152.653 total time=   0.2s
[CV 1/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68810.877 total time=   0.3s
[CV 3/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68165.175 total time=   0.2s
[CV 4/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68846.957 total time=   0.2s
[CV 5/5] END regressor=SGDRegressor(), regressor__max_iter=1000;, score=-68206.693 total time=   0.2s
[CV 1/5] END regressor=SG

np.float64(49.438185606079195)

In [32]:
pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor,param_regressor__kernel,param_regressor__max_iter,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,13.655305,0.921096,0.078354,0.007513,RandomForestRegressor(),,,100.0,"{'regressor': RandomForestRegressor(), 'regres...",-49620.600581,-49335.439848,-49115.769134,-49445.392606,-49673.725861,-49438.185606,201.65091,1
6,10.666056,0.81519,0.087652,0.02554,RandomForestRegressor(),,,50.0,"{'regressor': RandomForestRegressor(), 'regres...",-49706.503649,-49194.996933,-49366.631892,-49554.497262,-50015.975052,-49567.720957,282.746263,2
5,1.709981,0.117674,0.033225,0.009597,RandomForestRegressor(),,,10.0,"{'regressor': RandomForestRegressor(), 'regres...",-51886.057765,-51906.425662,-51893.120275,-51726.894357,-51753.808534,-51833.261319,76.61626,3
8,0.075936,0.018656,0.247074,0.042251,KNeighborsRegressor(),,,,{'regressor': KNeighborsRegressor()},-61356.607488,-62284.950131,-60440.419169,-62050.053393,-62896.858137,-61805.777663,842.103456,4
0,0.059404,0.014414,0.012494,0.00132,LinearRegression(),,,,{'regressor': LinearRegression()},-68822.546519,-68175.309287,-68000.954696,-68657.575745,-67951.482921,-68321.573834,353.561352,5
3,0.212449,0.060633,0.021029,0.007315,SGDRegressor(),,1000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-68810.877187,-68152.652751,-68165.17493,-68846.956606,-68206.693142,-68436.470923,321.132572,6
4,0.164062,0.038751,0.017295,0.005503,SGDRegressor(),,2000.0,,"{'regressor': SGDRegressor(), 'regressor__max_...",-68920.124976,-68250.502472,-68023.100572,-68907.474917,-68306.390287,-68481.518645,365.513284,7
1,19.691417,0.965479,2.760801,0.17241,SVR(),linear,,,"{'regressor': SVR(), 'regressor__kernel': 'lin...",-115037.662545,-110966.13523,-112979.932217,-111604.306435,-112350.66032,-112587.739349,1400.634542,8
2,22.971413,1.010081,5.391709,0.567111,SVR(),rbf,,,"{'regressor': SVR(), 'regressor__kernel': 'rbf'}",-120712.213006,-116632.242617,-118635.613113,-117152.118175,-117735.471938,-118173.53177,1433.12079,9


In [33]:
model.best_estimator_