# End-to-End ML Project: Pipeline

We're looking at the California Housing Prices (1990) dataset from the StatLib repository.

In [5]:
import pandas as pd
import numpy as np
import json 

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from download_datasets import fetch_housing_data, load_housing_data

from distance_to_ocean import DistanceToCoast
from custom_attributes import RoomsPerHouse, BedroomsPerHouse, PopulationPerHouse, CorrectColumns

# Preprocessing

In [6]:
# Import the data
housing = load_housing_data()

# Our `y` value (labels) is the column 'median_house_value'. We're going to dropna for that column
housing.dropna(subset=['median_house_value'], inplace=True)

# After we split out train/test, we're going to remove the 'median_house_value' column
# because those will be out labels. We take a look at the indices of the columns after 
# removing this, so we can select the correct columns in our transformers

cols = list(housing.columns)
cols.remove('median_house_value')
print(json.dumps({c: i for i, c in enumerate(cols)}, indent=2))

{
  "longitude": 0,
  "latitude": 1,
  "housing_median_age": 2,
  "total_rooms": 3,
  "total_bedrooms": 4,
  "population": 5,
  "households": 6,
  "median_income": 7,
  "ocean_proximity": 8
}


In [7]:
# We're going to create a preprocessing pipeline to add custom attrbutes, 
# impute missing values and scale the values

# Define these indices based on our inspection 
longitude_index = 0
latitude_index = 1
total_rooms_index = 3
total_bedrooms_index = 4
population_index = 5
households_index = 6

# we want to keep only certain columns (2, 6, 7), plus the 4 new ones we will add
keep_column_indices = (2, 6, 7, 9, 10, 11, 12)
	

preprocess_pipeline = Pipeline([
	("RoomsPerHouse", RoomsPerHouse(total_rooms_index, households_index)),
	("PopulationPerHouse", PopulationPerHouse(population_index, households_index)),
	("BedroomsPerHouse", BedroomsPerHouse(total_bedrooms_index, households_index)),
	("DistanceToCoast", DistanceToCoast(longitude_index, latitude_index)),
	("CorrectColumns", CorrectColumns(keep_column_indices)),
	("Impute", SimpleImputer(strategy="median")),
	("Scaler", StandardScaler())
])

In [8]:
# Using stratified sampling
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5]
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    train = housing.iloc[train_index]
    test = housing.iloc[test_index]
    
# now remove the `income_cat` field
train = train.drop(columns=["income_cat"]).to_numpy()
test = test.drop(columns=["income_cat"]).to_numpy()
housing.drop(columns=["income_cat"], inplace=True)

# now take the 'median_house_value' as the y value
y_indx = list(housing.columns).index('median_house_value')

y_train = train[:, y_indx]
y_test = test[:, y_indx]

# Fit & transform the X values
x_indx = tuple(set(range(len(housing.columns))) - {y_indx})

X_train = preprocess_pipeline.fit_transform(train[:, x_indx])
X_test = preprocess_pipeline.transform(test[:, x_indx])


# Model Selection

In [118]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [119]:
lr = LinearRegression()
lr_scores = cross_val_score(
    lr, 
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10
)

dtr = DecisionTreeRegressor()
dtr_scores = cross_val_score(
    dtr, 
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10
)

rfr = RandomForestRegressor()
rfr_scores = cross_val_score(
    rfr, 
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10
)

svr = SVR()
svr_scores = cross_val_score(
    svr, 
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10
)

In [120]:
pd.DataFrame(
    {
        "LinearRegression": np.sqrt(-lr_scores),
		"DecisionTreeRegressor": np.sqrt(-dtr_scores),
        "RandomForestRegressor": np.sqrt(-rfr_scores),
        "SVR": np.sqrt(-svr_scores)
	}
).describe()

Unnamed: 0,LinearRegression,DecisionTreeRegressor,RandomForestRegressor,SVR
count,10.0,10.0,10.0,10.0
mean,72271.851783,78252.743962,54975.108858,118319.9066
std,2232.091038,2134.678817,2296.59591,2751.892128
min,68812.256571,76094.426846,51591.240688,113665.132037
25%,70498.989956,76754.817516,53242.375865,116830.47851
50%,72829.726907,77278.795321,55489.071004,119065.983101
75%,74032.2663,79089.576905,56651.051712,119977.681927
max,75330.257206,82258.937409,58389.68039,122513.442403


# Model Tuning

In [123]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [131]:
param_grid = [
    {
        "bootstrap": [True],
        "n_estimators": [3, 10, 20, 30, 50],
        "max_features": [2, 4, 6, 8]
    },
    {
        "bootstrap": [False],
        "n_estimators": [3, 10, 20],
        "max_features": [2, 4, 6]
    }
]

rfr = RandomForestRegressor()

grid_search = GridSearchCV(
    rfr,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    return_train_score=True
)

grid_search.fit(X_train, y_train)

In [136]:
grid_search.cv

In [133]:
grid_search.best_params_

{'bootstrap': True, 'max_features': 2, 'n_estimators': 50}

In [151]:
param_grid2 = [
    {
        "bootstrap": [True],
        "n_estimators": [40, 50, 60, 70, 80, 90, 100, 110],
        "max_features": [1, 2, 3]
    }
]

grid_search2 = GridSearchCV(
    rfr,
    param_grid=param_grid2,
    scoring="neg_mean_squared_error",
    return_train_score=True
)

grid_search2.fit(X_train, y_train)

In [152]:
grid_search2.best_params_

{'bootstrap': True, 'max_features': 2, 'n_estimators': 110}

# Testing

In [142]:
from sklearn.metrics import mean_squared_error

In [156]:
rfr = RandomForestRegressor(bootstrap=True, max_features=2, n_estimators=90)
rfr.fit(X_train, y_train)

predictions = rfr.predict(X_test)

print(np.sqrt(mean_squared_error(predictions, y_test)))

53129.870272181135
