In [None]:
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_target_distribution(df: pd.DataFrame, col: str, save_path: Path) -> None:
    """
    Plots the histogram and KDE of the target variable.

    The figure is written to ``save_path`` at 150 dpi and then displayed.

    Args:
        df: Frame containing the column to plot.
        col: Name of the target column in ``df``.
        save_path: Destination image file. Its parent directory must
            already exist.
    """
    # Use an explicit figure/axes pair instead of pyplot's implicit
    # "current figure" so the handle can be released afterwards.
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        sns.histplot(df[col], bins=50, kde=True, ax=ax)
        ax.set_title(f'Target Distribution: {col}')
        # NOTE(review): the unit suffix is hard-coded; it presumably matches
        # the MedHouseVal target this is called with -- confirm before
        # reusing the function for other columns.
        ax.set_xlabel(f'{col} ($100k)')
        fig.tight_layout()
        fig.savefig(save_path, dpi=150)
        plt.show()
    finally:
        # On non-interactive backends plt.show() does not release the figure,
        # so it would stay registered in pyplot. In interactive mode the
        # window is left open for the user.
        if not plt.isinteractive():
            plt.close(fig)

def plot_feature_scatter(df: pd.DataFrame, target: str, features: list, save_dir: Path) -> None:
    """
    Plots scatter diagrams of selected features against the target.

    One figure is produced per feature; each is saved as
    ``02_scatter_<feature>.png`` inside ``save_dir`` at 150 dpi and then
    displayed.

    Args:
        df: Frame containing ``target`` and every column in ``features``.
        target: Column plotted on the y-axis.
        features: Column names plotted on the x-axis, one figure each.
        save_dir: Directory receiving the images; it must already exist.
    """
    for col in features:
        # A fresh, explicitly owned figure per feature, so it can be closed
        # once saved instead of accumulating one open figure per iteration.
        fig, ax = plt.subplots(figsize=(7, 5))
        try:
            sns.scatterplot(data=df, x=col, y=target, alpha=0.3, ax=ax)
            ax.set_title(f'{target} vs {col}')
            fig.tight_layout()
            fig.savefig(save_dir / f'02_scatter_{col}.png', dpi=150)
            plt.show()
        finally:
            # On non-interactive backends plt.show() does not release the
            # figure; in interactive mode the window is left open.
            if not plt.isinteractive():
                plt.close(fig)

def plot_correlation_heatmap(df: pd.DataFrame, save_path: Path) -> None:
    """
    Plots a correlation heatmap for all numeric variables.

    The figure is written to ``save_path`` at 150 dpi and then displayed.

    Args:
        df: Frame whose numeric columns are correlated; non-numeric columns
            are ignored via ``numeric_only=True``.
        save_path: Destination image file. Its parent directory must
            already exist.
    """
    corr = df.corr(numeric_only=True)

    # Explicit figure/axes so the figure can be released after saving.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        # center=0 pins the midpoint of the diverging colormap to zero
        # correlation.
        sns.heatmap(corr, cmap='coolwarm', center=0, annot=False, ax=ax)
        ax.set_title('Correlation Heatmap')
        fig.tight_layout()
        fig.savefig(save_path, dpi=150)
        plt.show()
    finally:
        # On non-interactive backends plt.show() does not release the figure;
        # in interactive mode the window is left open.
        if not plt.isinteractive():
            plt.close(fig)

# --- Execution ---
# Both paths are relative, so they resolve against the current working
# directory at run time.
raw_csv_path = Path('../data/raw/california_housing.csv').resolve()
results_dir = Path('../results').resolve()
results_dir.mkdir(parents=True, exist_ok=True)
df = pd.read_csv(raw_csv_path)

# --- 1. Target Distribution ---
plot_target_distribution(df, 'MedHouseVal', results_dir / '01_target_distribution.png')

# **Analysis of Target Distribution:**
# The distribution of `MedHouseVal` is somewhat right-skewed. However, there is a noticeable spike at the maximum value (5.0 or $500k), indicating that the data was likely capped at this threshold during collection. This "capped" value acts as an anomaly that may affect regression models.

# --- 2. Scatter Plots ---
features = ['MedInc', 'HouseAge', 'AveRooms', 'Population', 'AveOccup']
plot_feature_scatter(df, 'MedHouseVal', features, results_dir)

# **Analysis of Feature Relationships:**
# - **MedInc:** Shows the strongest positive linear relationship with house value.
# - **AveOccup:** Shows a slight negative trend; higher occupancy often correlates with lower house values.
# - **Outliers:** We observe some extreme outliers in `AveRooms` and `Population` which should be handled during preprocessing.

# --- 3. Boxplot Analysis ---
# Discretize Latitude into 6 equal-width bands so it can serve as a
# categorical x-axis. This adds a `LatBand` column to `df` in place.
df['LatBand'] = pd.cut(df['Latitude'], bins=6)
plt.figure(figsize=(10,5))
sns.boxplot(data=df, x='LatBand', y='MedHouseVal')
plt.title('MedHouseVal by Latitude Band')
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig(results_dir / '03_boxplot_latband.png', dpi=150)
plt.show()
# On non-interactive backends plt.show() does not release the figure, so close
# the current one explicitly (a no-op when no figure is open). In interactive
# mode the window is left open.
if not plt.isinteractive():
    plt.close()

# **Analysis of Categorical Patterns:**
# By discretizing Latitude, we can see price variance across geographic bands. Certain latitude bands correspond to high-density areas (like the Bay Area or LA), showing significantly higher median prices and wider interquartile ranges.

# --- 4. Correlation Heatmap ---
# `LatBand` is non-numeric, so the heatmap's numeric_only correlation skips it.
plot_correlation_heatmap(df, results_dir / '04_correlation_heatmap.png')

# **Correlation Insights:**
# The heatmap confirms `MedInc` (0.68) is the most predictive feature. There is also a high correlation between `AveRooms` and `AveBedrms`, suggesting multicollinearity that we reduced via our feature engineering (`BedroomsPerRoom`).

# --- 5. Pairplot ---
# pairplot creates its own figure, which becomes pyplot's current figure; that
# is the figure plt.savefig writes below.
sns.pairplot(df[['MedHouseVal', 'MedInc', 'HouseAge', 'AveRooms']], diag_kind='kde')
plt.savefig(results_dir / '05_pairplot_subset.png', dpi=150)
plt.show()
# Same cleanup as for the boxplot: release the pairplot figure when the
# backend did not already do so.
if not plt.isinteractive():
    plt.close()

print('Saved figures to:', results_dir)