## Define Constants Data

In [1]:
import os
import random

random_seed = 32
random.seed(random_seed)

## Load Data

In [2]:
import pandas as pd

housing = pd.read_csv("../datasets/housing/housing.csv")

## Analyzing the Data

In [3]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
housing["ocean_proximity"].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

## Define Target and Explanatory Variables

In [7]:
target_variable = "median_house_value"
explanatory_variables = [var for var in housing.columns.values if var != "median_house_value"]

print(target_variable)
print(explanatory_variables)

median_house_value
['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity']


## Define Training and Test Sets

In [8]:
X = housing[explanatory_variables]
y = housing[target_variable]

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(16512, 9)
(16512,)
(4128, 9)
(4128,)


## Define Pre-processing Pipeline

In [11]:
categorical_variables = ["ocean_proximity"]
numerical_variables = [var for var in explanatory_variables if var not in categorical_variables]

print(categorical_variables)
print(numerical_variables)

['ocean_proximity']
['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", StandardScaler())
])

In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preprocessing_pipeline = ColumnTransformer([
    ("numerical_pipeline", numerical_pipeline, numerical_variables),
    ("encoder", OneHotEncoder(handle_unknown="ignore"), categorical_variables)
])

In [35]:
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
X_train_transformed

array([[-0.61053778,  1.34571519, -0.44773646, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.82224293, -0.93015644,  0.50551543, ...,  0.        ,
         0.        ,  0.        ],
       [-1.34440107,  1.01791475, -1.48042601, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.78871286,  1.63605272,  0.34664011, ...,  0.        ,
         0.        ,  0.        ],
       [-0.91007382,  0.34826528,  0.82326606, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.842212  , -0.87864494,  0.42607777, ...,  0.        ,
         0.        ,  0.        ]])

In [36]:
X_train_transformed.shape

(16512, 13)

In [37]:
import numpy as np

np.sum(np.isnan(X_train_transformed))

0

## Define Regressor Pipeline

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

regressor_pipeline = Pipeline([
    ("preprocessing", preprocessing_pipeline),
    ("regressor", RandomForestRegressor())
])

## Train the model

In [39]:
regressor_pipeline.fit(X_train, y_train)

In [54]:
print(np.sqrt(mse(regressor_pipeline.predict(X_train), y_train)))

18116.59848539552


## Fine Tuning Parameters

In [79]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

params = {
    "regressor__n_estimators": [10, 50, 100],
    "regressor__max_features": [1, 2, 5]
}

#random_params = {
#    'regressor__n_estimators': randint(4,200),
#    'regressor__max_features': randint(1, 5)
#}

regressor_search_cv = GridSearchCV(estimator=regressor_pipeline,
                                   param_grid=params,
                                   cv=5,
                                   scoring="neg_mean_squared_error")

#regressor_search_cv = RandomizedSearchCV(estimator=regressor_pipeline,
#                                         param_distributions=random_params,
#                                         cv=5,
#                                         scoring="neg_mean_squared_error",
#                                         random_state=random_seed)

In [80]:
regressor_search_cv.fit(X_train, y_train)

In [81]:
regressor_search_cv.cv_results_

{'mean_fit_time': array([0.09124398, 0.40412788, 0.78540912, 0.11003761, 0.53439126,
        1.06235485, 0.21743417, 1.06288447, 2.12137594]),
 'std_fit_time': array([0.01612611, 0.01884083, 0.00552054, 0.00122399, 0.00580056,
        0.01078837, 0.00839962, 0.02346336, 0.05594433]),
 'mean_score_time': array([0.00784168, 0.03128915, 0.06157756, 0.00772824, 0.03130903,
        0.06007066, 0.00788617, 0.03109388, 0.05996904]),
 'std_score_time': array([0.00041351, 0.0002633 , 0.00154834, 0.00013631, 0.00022241,
        0.00026755, 0.00024139, 0.00014589, 0.00058621]),
 'param_regressor__max_features': masked_array(data=[1, 1, 1, 2, 2, 2, 5, 5, 5],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_regressor__n_estimators': masked_array(data=[10, 50, 100, 10, 50, 100, 10, 50, 100],
              mask=[False, False, False, False, False, False, False, False,
                    

In [82]:
regressor_search_cv.best_params_

{'regressor__max_features': 5, 'regressor__n_estimators': 100}

In [83]:
np.sqrt(-1 * regressor_search_cv.best_score_)

49341.00711571483

## Evaluate Best Model

In [84]:
print(np.sqrt(mse(regressor_search_cv.best_estimator_.predict(X_test), y_test)))

49427.55991871731


In [85]:
print(np.sqrt(mse(regressor_search_cv.best_estimator_.predict(X_train), y_train)))

18099.364163735652
