In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from src.helpers import load_data, helper_functions, visuals
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib
import plotly.express as px
import seaborn as sns
from sklearn.compose import (
    make_column_transformer,
    make_column_selector,
    ColumnTransformer,
)
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    FunctionTransformer,
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [None]:
# Global plotting defaults for the whole notebook.
# Order matters: the colour cycle is set after the style sheet is applied, so it
# takes precedence over whatever the style sheet defines for that setting.
plt.rc("figure", figsize=(15, 6))
plt.style.use("ggplot")
plt.rc("axes", prop_cycle=matplotlib.cycler(color=["#1f77b4", "red"]))

In [None]:
# Load the raw housing dataset through the project's loader helper.
df_housing_raw = load_data.load_housing_raw_data()

In [None]:
# Project root; the cells below join it with sub-folders to build output paths
# for figures and tuning results.
project_path = helper_functions.get_project_path()

# Plan
1. Split the data into train and test sets
2. Explore the data
3. Build the data preparation pipeline (cleaning, imputing, feature engineering)
4. Tune hyperparameters
5. Check for overfitting/underfitting
6. Evaluate on the test data

# Quick EDA to decide how to stratify the train/test split

In [None]:
# Number of missing values in each column of the raw data.
df_housing_raw.isna().sum()

In [None]:
# Correlation plot of the raw data, drawn by the project's `visuals` helper
# (the exact rendering is defined in src.helpers.visuals, not in this notebook).
visuals.plot_correlation(df_housing_raw)

In [None]:
# Draw 50-bin histograms of the raw columns and write the resulting figure
# to the project's images folder.
df_housing_raw.hist(bins=50)
plt.gcf().savefig(project_path / "images" / "features_histogram.png")

In [None]:
# Work on a copy so the raw frame stays untouched when a stratification
# column is added to `df_housing` further down.
df_housing = df_housing_raw.copy()

In [None]:
# Bar chart of how many rows fall into each `median_income` bracket; these are
# the same bin edges used for the stratified split.
fig, ax = plt.subplots()
(
    pd.cut(df_housing["median_income"], bins=[0, 1.5, 3, 4.5, 6, np.inf])
    .value_counts()
    .sort_index()
    .plot.bar(ax=ax)
)
ax.set_title("Distribution of bins of 'median_income' feature")
fig.savefig(project_path / "images" / "dist_bin_median_income.png")

# Split into train/test data

In [None]:
# Bucket `median_income` into five labelled brackets (1-5); the resulting
# categorical column is the stratification key for the train/test split.
df_housing = df_housing.assign(
    median_income_bin=pd.cut(
        df_housing["median_income"],
        bins=[0, 1.5, 3, 4.5, 6, np.inf],
        labels=[1, 2, 3, 4, 5],
    )
)

In [None]:
# 80/20 split, stratified on the income bracket so both parts keep the same
# bracket proportions; the fixed seed makes the split repeatable.
# NOTE(review): the whole frame is split, so the `median_income_bin` helper
# column is still present in both `df_train` and `df_test` afterwards.
df_train, df_test = train_test_split(
    df_housing,
    stratify=df_housing["median_income_bin"],
    test_size=0.2,
    random_state=42,
)

# More EDA (training set only)

In [None]:
# Disabled: interactive Plotly version of the geospatial plot (HTML export); kept for reference.
# fig = px.scatter_geo(df_train, lat='latitude', lon='longitude', color='median_house_value', fitbounds='locations', size='median_house_value')
# fig.write_html(project_path / 'images/median_house_value_geospatial.html')

In [None]:
# Longitude/latitude scatter of the training rows: marker size scales with
# population (divided by 50), colour encodes `median_house_value`.
fig, ax = plt.subplots()
df_train.plot.scatter(
    x="longitude",
    y="latitude",
    s=df_train["population"] / 50,
    c="median_house_value",
    cmap="jet",
    alpha=0.5,
    title="median_house_value geospatial distribution",
    ax=ax,
)
fig.savefig(project_path / "images/median_house_value_geospatial.png")

In [None]:
# Scatter of the target against `median_income` on the training rows;
# semi-transparent markers keep dense regions readable.
fig, ax = plt.subplots()
df_train.plot.scatter(
    x="median_income",
    y="median_house_value",
    alpha=0.5,
    ax=ax,
    title="median_house_value in relation to median_income",
)
fig.savefig(project_path / "images/house_value_vs_income.png")

In [None]:
# Category counts for `ocean_proximity`, taken from the training split only:
# this section is training-set EDA, so the test rows must not be inspected here.
fig, ax = plt.subplots()
df_train["ocean_proximity"].value_counts().plot(
    kind="bar", ax=ax, title="ocean_proximity category counts (training set)"
);

# Feature engineering

In [None]:
# Separate the training frame into the target series and the remaining
# feature columns.
train_y = df_train["median_house_value"]
df_train_X = df_train.drop(columns="median_house_value")

In [None]:
# Missing values per feature, measured on the training features only, so that
# imputation choices are not informed by the held-out test rows.
df_train_X.isna().sum()

In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time).
# The step names are the lower-cased class names, i.e. exactly what
# `make_pipeline` would generate; the tuning grid addresses the steps by them.
cat_preprocess = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="most_frequent")),
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Numeric branch: median imputation followed by standardisation.
# The step names are the lower-cased class names, i.e. exactly what
# `make_pipeline` would generate; the tuning grid addresses the steps by them.
num_preprocess = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="median")),
        ("standardscaler", StandardScaler()),
    ]
)

In [None]:
# Route columns to a branch by dtype: object columns go through the
# categorical pipeline, numeric columns through the numeric one.
# No `remainder` is given, so the ColumnTransformer default applies to
# columns matched by neither selector.
cat_num_preproc = ColumnTransformer(
    transformers=[
        ("cat", cat_preprocess, make_column_selector(dtype_include=object)),
        ("num", num_preprocess, make_column_selector(dtype_include=np.number)),
    ]
)

In [None]:
# End-to-end estimator: preprocessing followed by a regressor. The step names
# "preprocessing" and "model" are the prefixes used in the tuning grid keys.
full_pipe = Pipeline(
    steps=[
        ("preprocessing", cat_num_preproc),
        ("model", RandomForestRegressor()),
    ]
)

In [None]:
# Display the assembled pipeline to check its structure.
full_pipe

In [None]:
# List every parameter of the pipeline; the `step__substep` names shown here
# are what the tuning grid uses as keys.
full_pipe.get_params()

In [None]:
# Search space: each key replaces a whole pipeline step with one of the listed
# estimators (3 imputers x 2 encoders x 2 scalers x 2 models = 24 candidates).
#
# - The encoders tolerate categories that are missing from a CV training fold,
#   matching the `handle_unknown="ignore"` used in the base pipeline; with the
#   default settings an unseen category makes `transform` raise and the
#   candidate's fold score is lost.
# - The models are seeded so that the ranking of candidates is reproducible
#   across runs.
params_grid = {
    "preprocessing__num__simpleimputer": [
        SimpleImputer(strategy="mean"),
        SimpleImputer(strategy="median"),
        KNNImputer(),
    ],
    "preprocessing__cat__onehotencoder": [
        OneHotEncoder(handle_unknown="ignore"),
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    ],
    "preprocessing__num__standardscaler": [StandardScaler(), RobustScaler()],
    "model": [
        RandomForestRegressor(random_state=42),
        GradientBoostingRegressor(random_state=42),
    ],
}

In [None]:
# Exhaustive search over `params_grid`, scored by negative RMSE
# (scikit-learn maximises scores, hence the sign).
grid_search = GridSearchCV(
    estimator=full_pipe,
    param_grid=params_grid,
    scoring="neg_root_mean_squared_error",
)

In [None]:
%%time
# Expensive step: cross-validates every candidate in the grid on the training
# data; `%%time` reports how long the cell took.
grid_search.fit(df_train_X, train_y)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Tuning summary: one row per candidate, best rank first, reduced to the tuned
# step choices and the aggregated cross-validation scores.
results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values("rank_test_score")
    .loc[
        :,
        [
            "param_model",
            "param_preprocessing__cat__onehotencoder",
            "param_preprocessing__num__simpleimputer",
            "param_preprocessing__num__standardscaler",
            "mean_test_score",
            "std_test_score",
            "rank_test_score",
        ],
    ]
)
results

In [None]:
# Keep the best pipeline found by the grid search for the final evaluation.
final_model = grid_search.best_estimator_

In [None]:
# Separate the held-out frame into the target series and its feature columns.
y_test = df_test['median_house_value']
X_test = df_test.drop(columns='median_house_value')

In [None]:
# Final check on the held-out test set.
final_predictions = final_model.predict(X_test)
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# `mean_squared_error` is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
final_rmse = float(np.sqrt(mean_squared_error(y_test, final_predictions)))
final_rmse

In [None]:
# Persist the tuning summary. The nested output folder is created first so the
# export also works on a fresh checkout where it does not exist yet.
# (assumes `project_path` is a pathlib.Path, as the `/` joins suggest — TODO confirm)
tuning_dir = project_path / 'models' / 'tuning_results' / 'first_tuning'
tuning_dir.mkdir(parents=True, exist_ok=True)
results.to_excel(tuning_dir / 'first_tuning.xlsx', index=False)

In [None]:
# Serialise the best pipeline to disk. `joblib.dump` requires a target file
# name; calling it with only the estimator raises a TypeError.
# NOTE(review): the file name below was chosen during review — adjust it if the
# project has a naming convention for saved models.
# (assumes `project_path` is a pathlib.Path, as the `/` joins suggest — TODO confirm)
best_model_path = (
    project_path / 'models' / 'tuning_results' / 'first_tuning' / 'first_tuning_best_model.joblib'
)
best_model_path.parent.mkdir(parents=True, exist_ok=True)
# `joblib.dump` returns the list of file names it wrote, not the estimator;
# the variable name is kept so existing references keep working.
best_model = joblib.dump(grid_search.best_estimator_, best_model_path)