In [None]:
import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import (ColumnTransformer, make_column_selector,
                             make_column_transformer)
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (FunctionTransformer, OneHotEncoder,
                                   OrdinalEncoder, RobustScaler,
                                   StandardScaler)

from src.helpers import helper_functions, load_data, visuals

In [None]:
# Global plotting defaults: wide figures, the ggplot theme, and a two-colour line cycle.
plt.rc("figure", figsize=(15, 6))
plt.style.use("ggplot")
# Applied after the style sheet so the custom colour cycle is the one that stays in effect.
plt.rc("axes", prop_cycle=matplotlib.cycler(color=["#1f77b4", "red"]))

In [None]:
# Load the raw housing dataset through the project's `load_data` helper.
# The result is used as a pandas DataFrame below and is kept unmodified
# (later cells work on a copy of it).
df_housing_raw = load_data.load_housing_raw_data()

In [None]:
# Base path of the project; joined with `/` below to build the output
# locations for figures and model artefacts.
project_path = helper_functions.get_project_path()

# Plan
1. splitting train-test
2. exploring data
3. data preparation pipeline (cleaning, imputing, feature engineering)
4. hyperparameter tuning
5. overfitting/underfitting check
6. evaluation on testing data

# Quick EDA to know how to stratify and split the data into train/test

In [None]:
# Number of missing values in each column of the raw data.
df_housing_raw.isnull().sum()

In [None]:
# Correlation plot of the raw data, drawn by the project's `visuals` helper.
# NOTE(review): the helper is defined outside this notebook — confirm which
# columns and correlation method it uses.
visuals.plot_correlation(df_housing_raw)

In [None]:
# Histogram (50 bins) of each column of the raw data, saved to the images folder.
df_housing_raw.hist(bins=50)
plt.savefig(project_path.joinpath("images", "features_histogram.png"))

In [None]:
# Work on an independent deep copy so the raw frame stays untouched.
df_housing = df_housing_raw.copy(deep=True)

In [None]:
# Bar chart of how many rows fall into each `median_income` bracket; the same
# bracket edges are used further down to build the stratification key.
fig, ax = plt.subplots()
(
    df_housing["median_income"]
    .pipe(pd.cut, bins=[0, 1.5, 3, 4.5, 6, np.inf])
    .value_counts()
    .sort_index()
    .plot.bar(ax=ax)
)
ax.set_title("Distribution of bins of 'median_income' feature")
plt.savefig(project_path / "images" / "dist_bin_median_income.png")

# Split into train/test data

In [None]:
# Bucket `median_income` into five labelled brackets (1 = lowest, 5 = highest);
# this column is the stratification key for the train/test split.
df_housing["median_income_bin"] = df_housing["median_income"].pipe(
    pd.cut,
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [None]:
# 80/20 split, stratified on the income bracket so that both parts keep the
# same income mix; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df_housing,
    stratify=df_housing.median_income_bin,
    random_state=42,
    test_size=0.2,
)

# More EDA (training set only)

In [None]:
# fig = px.scatter_geo(df_train, lat='latitude', lon='longitude', color='median_house_value', fitbounds='locations', size='median_house_value')
# fig.write_html(project_path / 'images/median_house_value_geospatial.html')

In [None]:
# Map-like view of the training rows: marker size follows `population`
# (scaled down by 50) and colour follows `median_house_value`.
fig, ax = plt.subplots()
df_train.plot.scatter(
    x="longitude",
    y="latitude",
    s=df_train["population"].div(50),
    c="median_house_value",
    cmap="jet",
    alpha=0.5,
    title="median_house_value geospatial distribution",
    ax=ax,
)
plt.savefig(project_path / "images/median_house_value_geospatial.png")

In [None]:
# Scatter of the target against `median_income` on the training rows.
fig, ax = plt.subplots()
df_train.plot.scatter(
    x="median_income",
    y="median_house_value",
    ax=ax,
    title="median_house_value in relation to median_income",
    alpha=0.5,
)
plt.savefig(project_path / "images/house_value_vs_income.png")

In [None]:
# Category frequencies of `ocean_proximity`, computed on the training split only:
# this section's EDA must not look at the held-out test rows.
fig, ax = plt.subplots()
df_train["ocean_proximity"].value_counts().plot(kind="bar", ax=ax)
ax.set_title("Distribution of 'ocean_proximity' (training set)");

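Before imputing `total_bedrooms`, let's explore the data to see whether its missing values can be filled more smartly than with a single global statistic (for example, from a related feature such as `households`).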

In [None]:
# Missing values per column of the working frame.
df_housing.isnull().sum()

In [None]:
# Bar chart of the number of rows per `households` bracket
# (brackets of width 100 up to 1300, plus one open-ended bracket above).
fig, ax = plt.subplots()
(
    df_housing["households"]
    .pipe(pd.cut, bins=[*range(0, 1400, 100), np.inf])
    .value_counts()
    .sort_index()
    .plot.bar(ax=ax)
)
ax.set_title("Distribution of bins of 'households' feature")

In [None]:
# Bucket `households` into brackets of width 100 (labels 1-10) plus one
# open-ended bracket above 1000 (label 11).
df_housing["households_bin"] = pd.cut(
    df_housing["households"],
    bins=[0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, np.inf],
    labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
)

# Copy of `total_bedrooms` in which every missing value is replaced by the
# median of its households bracket. `observed=True` only skips empty brackets
# (and avoids pandas' FutureWarning for categorical group keys); it does not
# change the filled values.
df_housing["total_bedrooms_fill_nan"] = df_housing.groupby(
    "households_bin", observed=True
)["total_bedrooms"].transform(lambda s: s.fillna(s.median()))


In [None]:

# First 20 rows where `total_bedrooms` is missing, next to the value that the
# grouped fill produced for them.
df_housing[df_housing["total_bedrooms"].isnull()][
    ["total_bedrooms", "households", "total_bedrooms_fill_nan"]
].head(20)

In [None]:
# Mean `total_bedrooms` per households bracket, kept as a lookup Series
# whose index holds the bracket labels.
map_test = df_housing.groupby("households_bin")["total_bedrooms"].agg("mean")
map_test

In [None]:
# Look up the bracket mean for every row through its `households_bin` label,
# then show the rows for which the lookup produced a value.
df_housing["test_map"] = df_housing["households_bin"].map(map_test)
df_housing.dropna(subset=["test_map"])



In [None]:
# Scratch check: pass every `total_bedrooms` VALUE through `map_test`, whose index
# holds the `households_bin` labels, so only values that happen to equal a bin
# label can map to something.
# NOTE(review): this looks like an experiment rather than a pipeline step (the
# previous cell maps `households_bin` instead) — confirm whether it is still needed.
a = df_housing["total_bedrooms"].map(map_test).reset_index()
# Show the rows where the lookup produced a value.
a.loc[a["total_bedrooms"].notnull()]

In [None]:
# Custom scikit-learn imputer that fills the missing values of a column with its median or mean computed per `households_bin` group


In [None]:
# Custom scikit-learn imputer that fills the missing values of a target column
# with the median or mean of that column within each group.
class GroupedImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of one column with a per-group statistic.

    Parameters
    ----------
    groupby : str
        Column of ``X`` whose values define the groups.
    target : str
        Column of ``X`` whose missing values are filled.
    strategy : {"median", "mean"}, default="median"
        Statistic computed for each group during ``fit``.

    Attributes
    ----------
    imputer : pandas.Series
        Per-group statistic learned by ``fit`` (index = group value).
    """

    _SUPPORTED_STRATEGIES = ("median", "mean")

    def __init__(self, groupby, target, strategy="median"):
        self.groupby = groupby
        self.target = target
        self.strategy = strategy

    def fit(self, X, y=None):
        """Learn the per-group statistic of ``target`` from the DataFrame ``X``.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported statistics. Without
            this check an unknown strategy would leave the estimator silently
            unfitted and only fail later inside ``transform``.
        """
        if self.strategy not in self._SUPPORTED_STRATEGIES:
            raise ValueError(
                f"strategy must be one of {self._SUPPORTED_STRATEGIES}, "
                f"got {self.strategy!r}"
            )
        # observed=True: ignore unused categories when the key is categorical
        # (they carry no statistic anyway) and avoid pandas' FutureWarning.
        self.imputer = (
            X.groupby(self.groupby, observed=True)[self.target].agg(self.strategy)
        )
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the missing ``target`` values filled.

        Rows whose group has no learned statistic keep their missing value.
        """
        # Work on a copy so the caller's frame is never modified in place.
        X = X.copy()
        X[self.target] = X[self.target].fillna(
            X[self.groupby].map(self.imputer)
        )
        return X

In [None]:
# Custom scikit-learn imputer that fills the missing values of a column with its
# median or mean computed per group (e.g. per `households_bin`).
# NOTE: this definition replaces any earlier `GroupedImputer` defined in the notebook.
class GroupedImputer(SimpleImputer):
    """Fill missing values of ``target`` with a statistic computed per group.

    Parameters
    ----------
    groupby : str
        Column of ``X`` whose values define the groups.
    target : str
        Column of ``X`` whose missing values are filled.
    strategy : {"median", "mean"}, default="median"
        Statistic computed during ``fit``.

    Attributes
    ----------
    group_statistics_ : pandas.Series
        Statistic of ``target`` for each group seen in ``fit``.
    global_statistic_ : float
        Statistic of ``target`` over all rows seen in ``fit``; used when a row's
        group has no usable statistic (unseen group, or group with only
        missing values).

    Notes
    -----
    ``X`` must be a pandas DataFrame for both ``fit`` and ``transform``.
    NOTE(review): the SimpleImputer base is kept so existing ``isinstance``
    checks keep working; its other inherited methods are not meaningful here.
    """

    def __init__(self, groupby, target, strategy="median"):
        # Initialise the inherited SimpleImputer parameters (this also sets
        # ``self.strategy``) so the base-class machinery finds the attributes
        # it expects.
        super().__init__(strategy=strategy)
        self.groupby = groupby
        self.target = target

    def fit(self, X, y=None):
        """Learn the per-group and the overall statistic of ``target``.

        Raises
        ------
        ValueError
            If ``strategy`` is neither ``"median"`` nor ``"mean"``.
        """
        if self.strategy not in ("median", "mean"):
            raise ValueError(
                f"strategy must be 'median' or 'mean', got {self.strategy!r}"
            )
        target_values = X[self.target]
        # observed=True: skip unused categories of a categorical key.
        self.group_statistics_ = target_values.groupby(
            X[self.groupby], observed=True
        ).agg(self.strategy)
        self.global_statistic_ = target_values.agg(self.strategy)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the missing ``target`` values filled."""
        # Never modify the caller's frame.
        X = X.copy()
        # One fill value per row: the statistic of the row's own group, falling
        # back to the overall statistic when the group has none. The key is cast
        # to object first so a categorical key is looked up like any other.
        per_row_fill = (
            X[self.groupby]
            .astype(object)
            .map(self.group_statistics_.to_dict())
            .astype(float)
            .fillna(self.global_statistic_)
        )
        X[self.target] = X[self.target].fillna(per_row_fill)
        return X
    

In [None]:
# Smoke test of the custom imputer on the exploration frame: learn the median
# `total_bedrooms` per `households_bin` and use it to fill the gaps.
# Requires the `households_bin` column created in an earlier cell.
imputer_test = GroupedImputer(groupby="households_bin", target="total_bedrooms", strategy="median")
# Rebinds `df_housing` to whatever the imputer returns.
df_housing = imputer_test.fit_transform(df_housing)

In [None]:
# Custom scikit-learn transformer that appends two columns derived from a KMeans
# clustering: the assigned cluster label and the distance to the nearest centre.
class KMeansTransformer(BaseEstimator, TransformerMixin):
    """Append KMeans cluster information to a DataFrame.

    Parameters
    ----------
    n_clusters : int, default=5
        Number of clusters passed to ``KMeans``.
    random_state : int, default=42
        Seed passed to ``KMeans``.

    Attributes
    ----------
    kmeans : sklearn.cluster.KMeans
        The clustering model fitted by ``fit``.
    """

    def __init__(self, n_clusters=5, random_state=42):
        self.n_clusters = n_clusters
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit the KMeans model on all columns of ``X``."""
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.kmeans.fit(X)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``kmeans_cluster`` and ``kmeans_distance`` added.

        ``kmeans_cluster`` is the predicted cluster label and
        ``kmeans_distance`` the smallest of the distances to the cluster centres.
        """
        # Both outputs are computed from the untouched input: once a column has
        # been appended, the frame no longer has the feature set KMeans was
        # fitted on and a further call into the model would be rejected.
        cluster_labels = self.kmeans.predict(X)
        nearest_distance = self.kmeans.transform(X).min(axis=1)

        X = X.copy()
        X["kmeans_cluster"] = cluster_labels
        X["kmeans_distance"] = nearest_distance
        return X

In [None]:
# Histogram (50 bins) of the mean `total_bedrooms` observed for each distinct
# `households` value.
fig, ax = plt.subplots()
df_housing.groupby("households")["total_bedrooms"].mean().plot.hist(bins=50, ax=ax)
ax.set_title("total_bedrooms grouped by households")
# plt.savefig(project_path / "images" / "total_bedrooms_grouped_by_households.png")


# Feature engineering

In [None]:
# Separate the predictors from the regression target for the training split.
df_train_X = df_train.drop(columns="median_house_value")
train_y = df_train.loc[:, "median_house_value"]

In [None]:
# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode (categories unseen during fit are encoded as all zeros).
# The step names are spelled out because the search grid addresses them by name.
cat_preprocess = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="most_frequent")),
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Numeric branch: median imputation followed by standardisation.
# The step names are spelled out because the search grid addresses them by name.
num_preprocess = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="median")),
        ("standardscaler", StandardScaler()),
    ]
)

In [None]:
# Route columns by dtype: object columns go through the categorical branch,
# numeric columns through the numeric one. Columns selected by neither branch
# are dropped (the default, spelled out here).
cat_num_preproc = ColumnTransformer(
    transformers=[
        ("cat", cat_preprocess, make_column_selector(dtype_include=object)),
        ("num", num_preprocess, make_column_selector(dtype_include=np.number)),
    ],
    remainder="drop",
)

In [None]:
# End-to-end estimator: preprocessing followed by the regressor.
# The model is seeded so that fitting this pipeline directly gives repeatable results.
full_pipe = Pipeline(
    steps=[
        ("preprocessing", cat_num_preproc),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

In [None]:
# Display the assembled pipeline (preprocessing + model) as the cell output.
full_pipe

In [None]:
# List every parameter of the pipeline and of its nested steps; the
# double-underscore keys shown here are the names a search grid has to use.
full_pipe.get_params()

In [None]:
# Search space: each key replaces a whole pipeline step with one of the listed
# candidates (3 imputers x 2 encoders x 2 scalers x 2 models = 24 combinations).
params_grid = {
    "preprocessing__num__simpleimputer": [
        SimpleImputer(strategy="mean"),
        SimpleImputer(strategy="median"),
        KNNImputer(),
    ],
    # Both encoders must tolerate categories that are absent from a training
    # fold; with their default settings such a category raises during scoring,
    # which turns that fold's score into NaN.
    "preprocessing__cat__onehotencoder": [
        OneHotEncoder(handle_unknown="ignore"),
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    ],
    "preprocessing__num__standardscaler": [StandardScaler(), RobustScaler()],
    # Seeded so that the ranking of the candidates is reproducible.
    "model": [
        RandomForestRegressor(random_state=42),
        GradientBoostingRegressor(random_state=42),
    ],
}

In [None]:
# Exhaustive search over `params_grid`, scored by negated RMSE (higher is better).
grid_search = GridSearchCV(
    estimator=full_pipe,
    param_grid=params_grid,
    scoring="neg_root_mean_squared_error",
)

In [None]:
%%time
# Run the search on the training split; every combination in `params_grid` is
# cross-validated, so this is the expensive cell. `%%time` reports its duration.
grid_search.fit(df_train_X, train_y)

In [None]:
# Parameter combination with the best mean cross-validated score
# (the score is negated RMSE, so "best" means the lowest RMSE).
grid_search.best_params_

In [None]:
# Cross-validation results, best candidate first, trimmed to the tuned
# parameters and the score summary.
results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values("rank_test_score")
    .loc[
        :,
        [
            "param_model",
            "param_preprocessing__cat__onehotencoder",
            "param_preprocessing__num__simpleimputer",
            "param_preprocessing__num__standardscaler",
            "mean_test_score",
            "std_test_score",
            "rank_test_score",
        ],
    ]
)
results

In [None]:
# Pipeline configured with the best parameter combination. The search above
# does not override GridSearchCV's `refit` default, so this estimator has been
# refit on the whole training data passed to `fit`.
final_model = grid_search.best_estimator_

In [None]:
# Separate the predictors from the regression target for the held-out test split.
X_test = df_test.drop(columns='median_house_value')
y_test = df_test.loc[:, 'median_house_value']

In [None]:
# Evaluate the selected pipeline on the held-out test split.
final_predictions = final_model.predict(X_test)
# RMSE as sqrt(MSE): the `squared=False` shortcut of `mean_squared_error` is
# deprecated in recent scikit-learn releases and later removed, whereas this
# form works on every version.
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
final_rmse

In [None]:
# Export the tuning table. The nested output folder is created first, because
# `to_excel` does not create missing directories.
# Assumes `project_path` is a pathlib.Path (it is combined with `/` throughout).
tuning_results_dir = project_path / 'models' / 'tuning_results' / 'first_tuning'
tuning_results_dir.mkdir(parents=True, exist_ok=True)
results.to_excel(tuning_results_dir / 'first_tuning.xlsx', index=False)

In [None]:
# Persist the best pipeline found by the search. `joblib.dump` needs a target
# file as its second argument.
# NOTE(review): the file name and folder below are a choice made here (next to
# the tuning results) — adjust to the project's convention.
# Assumes `project_path` is a pathlib.Path (it is combined with `/` throughout).
best_model_path = (
    project_path / 'models' / 'tuning_results' / 'first_tuning' / 'best_estimator.joblib'
)
best_model_path.parent.mkdir(parents=True, exist_ok=True)
# `joblib.dump` returns the list of file names it wrote; that list is what
# `best_model` holds.
best_model = joblib.dump(grid_search.best_estimator_, best_model_path)