In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from src.helpers import load_data, helper_functions, visuals
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib
import plotly.express as px
import seaborn as sns
from sklearn.compose import (
    make_column_transformer,
    make_column_selector,
    ColumnTransformer,
)
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    FunctionTransformer,
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import joblib

In [None]:
# Global plotting defaults for the notebook: wide figures, ggplot theme, and a
# two-colour (blue / red) line cycle applied after the theme so it overrides
# the theme's own cycle.
plt.rc("figure", figsize=(15, 6))
plt.style.use("ggplot")
plt.rc("axes", prop_cycle=matplotlib.cycler(color=["#1f77b4", "red"]))

In [None]:
# Load the raw housing data through the project helper.
# Presumably returns a pandas DataFrame (it is copied and column-indexed below) — confirm.
df_housing_raw = load_data.load_housing_raw_data()

In [None]:
# Base path used further down to build output locations with the `/` operator.
# Presumably a pathlib.Path pointing at the repository root — confirm in helper_functions.
project_path = helper_functions.get_project_path()

In [None]:
# Work on a copy so the raw frame stays untouched when helper columns are added.
df_housing = df_housing_raw.copy()

In [None]:
# Bucket median income into five ordered strata (labels 1-5); the resulting
# column is used to stratify the train/test split.
income_bin_edges = [0, 1.5, 3, 4.5, 6, np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
df_housing["median_income_bin"] = pd.cut(
    df_housing["median_income"], bins=income_bin_edges, labels=income_bin_labels
)

In [None]:
# Hold out 20% of the rows for testing, keeping the income-bin proportions
# the same in both parts; the fixed seed makes the split repeatable.
income_strata = df_housing["median_income_bin"]
df_train, df_test = train_test_split(
    df_housing, test_size=0.2, stratify=income_strata, random_state=42
)

# Feature engineering

In [None]:
# Separate the regression target from the predictors.
# NOTE(review): the `median_income_bin` helper column is still part of
# df_train_X here; only the target is removed.
train_y = df_train["median_house_value"]
df_train_X = df_train.drop(columns=["median_house_value"])

In [None]:
# Missing-value count per column, to decide which columns need imputing.
# NOTE(review): this inspects the full frame (train + test rows), not df_train_X.
df_housing.isna().sum()

In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are encoded as all zeros).
# Step names are spelled out exactly as make_pipeline would generate them,
# because the tuning grid addresses the steps by these names.
cat_preprocess = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="most_frequent")),
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Numeric branch: median imputation followed by standardisation.
# Step names are spelled out exactly as make_pipeline would generate them,
# because the tuning grid addresses the steps by these names.
num_preprocess = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="median")),
        ("standardscaler", StandardScaler()),
    ]
)

In [None]:
# Route columns to a branch by dtype: object columns go to the categorical
# pipeline, numeric columns to the numeric pipeline. Columns matching neither
# selector fall under ColumnTransformer's default remainder handling.
select_object_columns = make_column_selector(dtype_include=object)
select_numeric_columns = make_column_selector(dtype_include=np.number)
cat_num_preproc = ColumnTransformer(
    transformers=[
        ("cat", cat_preprocess, select_object_columns),
        ("num", num_preprocess, select_numeric_columns),
    ]
)

In [None]:
# End-to-end estimator: preprocessing followed by a regressor. The step names
# "preprocessing" and "model" are the prefixes used by the tuning grid keys.
full_pipe = Pipeline(
    steps=[
        ("preprocessing", cat_num_preproc),
        ("model", RandomForestRegressor()),
    ]
)

In [None]:
# Display the assembled pipeline to check its structure.
full_pipe

In [None]:
# List every tunable parameter name; these are the keys a tuning grid can use.
full_pipe.get_params()

In [None]:
# Search space for the grid search. Each entry replaces a whole pipeline step
# (or the final model) with one of the listed estimators; keys follow
# "<pipeline step>__<branch name>__<step name inside the branch>".
params_grid = {
    "preprocessing__num__simpleimputer": [
        SimpleImputer(strategy="mean"),
        SimpleImputer(strategy="median"),
        KNNImputer(),
    ],
    # Both encoders must tolerate categories that are absent from a CV training
    # fold but present in its validation fold; with the default
    # handle_unknown="error" such folds fail instead of producing a score.
    "preprocessing__cat__onehotencoder": [
        OneHotEncoder(handle_unknown="ignore"),
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    ],
    "preprocessing__num__standardscaler": [StandardScaler(), RobustScaler()],
    # Seed the stochastic models so the ranking of candidates is reproducible
    # across runs (same seed as the train/test split).
    "model": [
        RandomForestRegressor(random_state=42),
        GradientBoostingRegressor(random_state=42),
    ],
}

In [None]:
# Exhaustive search over params_grid using the library's default
# cross-validation. Scores are negated RMSE, so values closer to zero are better.
grid_search = GridSearchCV(
    estimator=full_pipe,
    param_grid=params_grid,
    scoring="neg_root_mean_squared_error",
)

In [None]:
%%time
# Run the full grid search on the training data; %%time (which must stay the
# first line of the cell) reports how long the search takes.
grid_search.fit(df_train_X, train_y)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Tabulate the search results, best candidate first.
# cv_results_ exposes one "param_<key>" column per key of params_grid, so only
# those names can be selected; each holds the estimator object that was tried.
result_columns = [
    "param_model",
    "param_preprocessing__cat__onehotencoder",
    "param_preprocessing__num__simpleimputer",
    "param_preprocessing__num__standardscaler",
    "mean_test_score",
    "std_test_score",
    "rank_test_score",
]
results = pd.DataFrame(grid_search.cv_results_).sort_values("rank_test_score")
results = results[result_columns]
results

In [None]:
# Persist the ranked tuning table as a spreadsheet.
tuning_results_path = (
    project_path / "models" / "tuning_results" / "first_tuning" / "first_tuning.xlsx"
)
# Create the output folder on first run; to_excel does not create missing
# directories. Assumes project_path is a pathlib.Path — TODO confirm.
tuning_results_path.parent.mkdir(parents=True, exist_ok=True)
results.to_excel(tuning_results_path)

In [None]:
# Keep a handle on the winning pipeline and save it to disk.
# joblib.dump needs a target filename and returns the list of written files,
# so the estimator itself is bound to `best_model` before dumping.
best_model = grid_search.best_estimator_
best_model_path = (
    project_path
    / "models"
    / "tuning_results"
    / "first_tuning"
    / "first_tuning_best_model.joblib"
)
# Assumes project_path is a pathlib.Path — TODO confirm.
best_model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, best_model_path)