In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from src.helpers import load_data, helper_functions, visuals
from sklearn.model_selection import train_test_split
import matplotlib
import numpy as np
import plotly.express as px
import seaborn as sns

In [None]:
# Notebook-wide plotting defaults.

# Default canvas size for every figure created below.
plt.rcParams["figure.figsize"] = (15, 8)

# Base theme. It has to be applied before the colour override that follows,
# because the later write to 'axes.prop_cycle' replaces the active cycle.
plt.style.use('ggplot')

# Two-colour cycle for plotted series: '#1f77b4' first, then red.
plt.rcParams.update(
    {'axes.prop_cycle': matplotlib.cycler(color=['#1f77b4', 'red'])}
)

In [None]:
# Load the raw housing data through the project helper.
# Presumably a pandas DataFrame: later cells call .isna(), .hist() and .copy() on it.
# This object is only read afterwards; edits are made on a copy (df_housing).
df_housing_raw = load_data.load_housing_raw_data()

In [None]:
# Project root as returned by the project helper. Later cells join it with
# "images/..." to build the output paths for saved figures.
project_path = helper_functions.get_project_path()

# Make sure the figure output directory exists, so the savefig calls in later
# cells do not fail with FileNotFoundError on a fresh checkout.
# Assumes project_path is a pathlib.Path (it is combined with `/` throughout
# this notebook) — TODO confirm against helper_functions.get_project_path.
(project_path / "images").mkdir(parents=True, exist_ok=True)

# Plan
1. Split the data into train and test sets
2. Explore the data
3. Build the data preparation pipeline (cleaning, imputing, feature engineering)
4. Tune hyperparameters
5. Check for overfitting/underfitting
6. Evaluate on the test data

# Quick EDA to decide how to stratify the train/test split

In [None]:
# Number of missing entries in each column of the raw data
# (column-wise sum of the boolean NA mask).
df_housing_raw.isna().sum(axis=0)

In [None]:
# Correlation overview of the raw data, drawn by the project's plotting helper.
# NOTE(review): the exact output (heatmap vs. matrix, which columns are included)
# depends on visuals.plot_correlation, which is not visible here — confirm.
visuals.plot_correlation(df_housing_raw)

In [None]:
# One 50-bin histogram per column that DataFrame.hist can plot.
df_housing_raw.hist(bins=50)

# The histogram grid is the current figure; write it to the images folder.
plt.gcf().savefig(project_path / "images" / "features_histogram.png")

In [None]:
# Work on an independent (deep) copy so the raw frame stays untouched
# when helper columns are added below.
df_housing = df_housing_raw.copy(deep=True)

In [None]:
# Bar chart of how many rows fall into each 'median_income' bracket.
# sort_index() puts the bars in bracket order instead of count order.
fig, ax = plt.subplots()
(
    pd.cut(df_housing["median_income"], bins=[0, 1.5, 3, 4.5, 6, np.inf])
    .value_counts()
    .sort_index()
    .plot.bar(ax=ax)
)
ax.set_title("Distribution of bins of 'median_income' feature")
fig.savefig(project_path / "images" / "dist_bin_median_income.png")

In [None]:
# Attach the income bracket (labels 1-5) to every row; this column is the
# stratification key used by the train/test split.
df_housing["median_income_bin"] = pd.cut(
    x=df_housing["median_income"],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Split into train/test data

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on the income bracket
# keeps the bracket proportions approximately equal in both partitions, and the
# fixed seed makes the split reproducible.
# NOTE(review): the helper column 'median_income_bin' is still present in both
# resulting frames — confirm it is dropped before modelling.
df_train, df_test = train_test_split(
    df_housing,
    stratify=df_housing["median_income_bin"],
    test_size=0.2,
    random_state=42,
)

# More EDA (training set only)

In [None]:
# Disabled: interactive plotly version of the geospatial scatter plot, exported as an HTML file.
# TODO(review): either re-enable this cell or delete it — a static matplotlib version of the plot follows.
# fig = px.scatter_geo(df_train, lat='latitude', lon='longitude', color='median_house_value', fitbounds='locations', size='median_house_value')
# fig.write_html(project_path / 'images/median_house_value_geospatial.html')

In [None]:
# Map-style view of the training rows: marker size scales with population
# (population / 50) and colour encodes median_house_value.
fig, ax = plt.subplots()
df_train.plot.scatter(
    x='longitude',
    y='latitude',
    s=df_train['population'] / 50,
    c='median_house_value',
    cmap='jet',
    alpha=0.5,
    title='median_house_value geospatial distribution',
    ax=ax,
)
fig.savefig(project_path / 'images/median_house_value_geospatial.png')

In [None]:
# Relationship between income and house value on the training rows;
# alpha=0.5 keeps dense regions readable.
fig, ax = plt.subplots()
df_train.plot.scatter(
    x='median_income',
    y='median_house_value',
    alpha=0.5,
    title='median_house_value in relation to median_income',
    ax=ax,
)
fig.savefig(project_path / 'images/house_value_vs_income.png')