In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from src.helpers import load_data, helper_functions, visuals
from sklearn.model_selection import train_test_split
import matplotlib
import numpy as np

In [None]:
# Exploratory lookup: display the names of all matplotlib rcParams settings
# (useful for finding the keys that are customised in the next cell).
plt.rcParams.keys()

In [None]:
# Global plotting configuration for the whole notebook.
# Apply the style sheet first: style.use() overwrites every rcParam the sheet
# defines, so the explicit overrides must come afterwards to be guaranteed to win.
plt.style.use('ggplot')
# Default figure size (width, height) in inches.
plt.rcParams["figure.figsize"] = (15, 8)
# Replace the style's colour cycle with a two-colour cycle (matplotlib blue, then red).
plt.rcParams['axes.prop_cycle'] = matplotlib.cycler(color=['#1f77b4', 'red'])

In [None]:
# Load the raw housing dataset through the project's loader helper.
# The result is used below as a pandas DataFrame (isna / hist / copy) and is
# kept unmodified; later cells work on a copy.
df_housing_raw = load_data.load_housing_raw_data()

In [None]:
# Base path of the project, obtained from the project helper; used below to
# build the output paths of the saved figures (project_path / "images" / ...).
project_path = helper_functions.get_project_path()

## Plan
1. Split the data into training and test sets
2. Explore the data
3. Build the data preparation pipeline (cleaning, imputing, feature engineering)
4. Tune hyperparameters
5. Check for overfitting/underfitting
6. Evaluate on the test data

In [None]:
# Number of missing values in each column of the raw data.
df_housing_raw.isna().sum()

In [None]:
# Correlation plot of the raw data, drawn by the project's plotting helper.
# NOTE(review): what exactly is plotted (columns, method) is defined in
# src.helpers.visuals.plot_correlation — confirm there.
visuals.plot_correlation(df_housing_raw)

In [None]:
# Histograms (50 bins each) of the raw data's columns, saved as a PNG under
# <project_path>/images.
images_dir = project_path / "images"
# Create the output folder if it is missing so that savefig does not raise
# FileNotFoundError on a fresh checkout (empty folders are not tracked by git).
# NOTE(review): assumes project_path is a pathlib.Path (it is joined with "/") — confirm.
images_dir.mkdir(parents=True, exist_ok=True)
df_housing_raw.hist(bins=50)
plt.savefig(images_dir / "features_histogram.png")

In [None]:
# Work on a copy so that columns added later (e.g. "median_income_bin") do not
# alter the raw frame.
df_housing = df_housing_raw.copy()

In [None]:
# Bar chart of how many rows fall into each 'median_income' bin.
# The bin edges must stay in sync with the ones used to build "median_income_bin".
income_bin_counts = (
    pd.cut(df_housing["median_income"], bins=[0, 1.5, 3, 4.5, 6, np.inf])
    .value_counts()
    .sort_index()  # order the bars by income interval, not by frequency
)

fig, ax = plt.subplots()
# Draw on the axes created above instead of relying on pyplot's implicit
# "current axes" and then rebinding `ax` to whatever pandas returns.
income_bin_counts.plot(kind='bar', ax=ax)
ax.set_title("Distribution of bins of 'median_income' feature")

images_dir = project_path / "images"
# Create the output folder if it is missing so that saving does not fail.
# NOTE(review): assumes project_path is a pathlib.Path (it is joined with "/") — confirm.
images_dir.mkdir(parents=True, exist_ok=True)
# Save through the figure handle so the file always contains this figure.
fig.savefig(images_dir / "dist_bin_median_income.png")

In [None]:
# Add an income-bracket column with labels 1-5, using the same bin edges as the
# bar chart above. With pd.cut's defaults the intervals are right-closed:
# (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6], (6, inf); values outside them become NaN.
# This column is used as the stratification key for the train/test split.
df_housing["median_income_bin"] = pd.cut(df_housing["median_income"], bins=[0, 1.5, 3, 4.5, 6, np.inf], labels=[1, 2, 3, 4, 5])

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on "median_income_bin"
# keeps the share of each income bracket approximately equal in both splits;
# the fixed random_state makes the split reproducible across runs.
df_train, df_test = train_test_split(
    df_housing,
    test_size=0.2,
    random_state=42,
    stratify=df_housing["median_income_bin"],
)