## Feature Engineering EDA ##

In [18]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load the cleaned dataset; the first CSV column is used as the row index.
CLEAN_DATA_DIR = Path('../clean_data/')
cleaned_csv_path = CLEAN_DATA_DIR / 'cleaned_data.csv'
df = pd.read_csv(cleaned_csv_path, index_col=0)

# The three response columns; every other column of df is treated as a
# candidate feature.
targets = [
    'FAIR Plan Units',
    '% FAIR Plan Units',
    'Total Exposure ($)',
]
non_target_columns = df.columns.difference(targets)
features = non_target_columns.tolist()

# Engineered features with descriptive names
def _build_engineered_features(frame):
    """Return a dict of derived feature Series computed from *frame*.

    Keys are human-readable feature names; values are element-wise
    combinations (ratios, products, logs) of columns of *frame*. The
    insertion order of the dict is the order in which features are listed
    below. A missing source column raises ``KeyError``.
    """
    # Columns that feed more than one derived feature.
    losses = frame["Losses (Fire and Smoke) ($)"]
    premium = frame["Earned Premium ($)"]
    renewed = frame["Renewed Policies"]
    nonrenewed = frame["Nonrenewed Policies"]
    units = frame["Total Res Units"]
    income = frame["Avg Median Income"]
    income_growth = frame["% Change Median Income"]
    home_value = frame["Zillow Mean Home Value ($)"]
    loss_growth = frame["% Change - Losses (Fire and Smoke)"]

    # New plus renewed policies, and that sum relative to nonrenewals.
    new_and_renewed = frame["New Policies"] + renewed
    new_and_renewed_to_nonrenewed = new_and_renewed / nonrenewed

    # Sum of the owner- and company-initiated nonrenewal growth columns.
    nonrenewed_growth = (
        frame["% Change - Nonrenewed Policies (by Owner)"]
        + frame["% Change - Nonrenewed Policies (by Company)"]
    )

    return {
        "Losses to Premium Ratio": losses / premium,
        "Renewed to Nonrenewed Ratio": renewed / nonrenewed,
        "New and Renewed to Nonrenewed Ratio": new_and_renewed_to_nonrenewed,
        "New and Renewed to Total Units Ratio": new_and_renewed / units,
        "Log of Average Median Income": np.log(income),
        "Housing Value to Median Income Ratio": home_value / income,
        "Housing Value to Log Median Income Ratio": home_value / np.log(income),
        "Fire Smoke Claims to Total Units Ratio": frame["Claims (Fire and Smoke)"] / units,
        "New and Renewed to Nonrenewed and Income Growth Product": new_and_renewed_to_nonrenewed * income_growth,
        "New and Renewed to Nonrenewed and Income Growth Ratio": new_and_renewed_to_nonrenewed / income_growth,
        "Losses to Company Nonrenewed Ratio": losses / frame["Nonrenewed Policies (by Company)"],
        "Losses to Owner Nonrenewed Ratio": losses / frame["Nonrenewed Policies (by Owner)"],
        "Change in Renewed to Income Growth Ratio": frame["% Change - Renewed Policies"] / income_growth,
        "Change in Nonrenewed to Income Growth Ratio": nonrenewed_growth / income_growth,
        "Growth in Losses to All Disasters 3yr Ratio": loss_growth / frame["All Disasters 3y"],
        "Growth in Losses to All Disasters 5yr Ratio": loss_growth / frame["All Disasters 5y"],
        "Growth in Losses to All Disasters 10yr Ratio": loss_growth / frame["All Disasters 10y"],
        "Growth in Losses to Fire Disasters 3yr Ratio": loss_growth / frame["Fire Disasters 3y"],
        "Growth in Losses to Fire Disasters 5yr Ratio": loss_growth / frame["Fire Disasters 5y"],
        "Growth in Losses to Fire Disasters 10yr Ratio": loss_growth / frame["Fire Disasters 10y"],
        "Premium to Exposure Ratio": premium / units,
        "Nonrenewal Rate": nonrenewed / frame["Expiring Policies"],
        "Loss Burden per Unit": losses / units,
        # NOTE(review): same formula as "Loss Burden per Unit" above (losses
        # divided by "Total Res Units"), so the two features are identical.
        # Confirm whether a different denominator was intended.
        "Loss Burden per Exposure": losses / units,
        "Premium Adequacy": premium / losses,
        "Renewal Resilience": renewed / (renewed + nonrenewed),
    }


independent_vars_engineered = _build_engineered_features(df)

# Combine all independent variables: raw feature columns first, then the
# engineered ones (an engineered entry replaces a raw column of the same name).
independent_vars = {
    **{col: df[col] for col in features},
    **independent_vars_engineered,
}

# Flag engineered features containing NaN or +/-inf values. The infinity test
# only runs when no NaN was found.
for name, series in independent_vars_engineered.items():
    is_clean = not series.isna().any() and not np.isinf(series).any()
    if is_clean:
        continue
    print(f"⚠️ Warning: '{name}' contains missing or infinite values.")

# Correlation analysis
# For every independent variable, compute the Pearson correlation coefficient
# and its p-value against each of the three targets. Each row of `corrs` is
# [feature name, r, p, r, p, r, p] in the order of `targets`.
corrs = []
for name, x in independent_vars.items():
    row = [name]
    try:
        for target in targets:
            # pearsonr returns a (statistic, p-value) pair.
            row.extend(pearsonr(df[target], x))
    except Exception as exc:
        # Best-effort: a feature that cannot be correlated (for example
        # non-numeric or non-finite data) is left out of the table. Report it
        # so its absence from the saved results is not silent.
        print(
            f"⚠️ Skipping '{name}' in correlation analysis: "
            f"{type(exc).__name__}: {exc}"
        )
        continue
    corrs.append(row)

corrs_df = pd.DataFrame(corrs, columns=[
    "Feature", "Corr (FAIR)", "P (FAIR)",
    "Corr (% FAIR)", "P (% FAIR)",
    "Corr (Exposure)", "P (Exposure)"
])

# Feature importance
# Hold out 20% of the rows; only the training split is used to fit the forests
# in this section.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=13348)


def _fit_importance_forest(target):
    """Fit a random forest on the raw feature columns to predict *target*.

    The forest is seeded with the same value as the train/test split so the
    resulting feature importances are reproducible between runs.
    """
    model = RandomForestRegressor(
        n_estimators=500,
        max_depth=5,
        random_state=13348,
    )
    return model.fit(df_train[features], df_train[target])


# One forest per target, each trained on the same raw (non-engineered) features.
forest1 = _fit_importance_forest(targets[0])
forest2 = _fit_importance_forest(targets[1])
forest3 = _fit_importance_forest(targets[2])

importance_df = pd.DataFrame({
    "Feature": features,
    "Importance (FAIR)": forest1.feature_importances_,
    "Importance (% FAIR)": forest2.feature_importances_,
    "Importance (Exposure)": forest3.feature_importances_
})

# Filter features based on substring overlap
def filter_by_dependency(df_sorted):
    """Drop rows whose feature name is contained in an earlier kept name.

    Walks the "Feature" column of *df_sorted* from top to bottom. A feature is
    kept unless its name is a substring of (and different from) a feature that
    has already been kept. Only that direction is checked: a longer name that
    comes after a shorter kept name is still kept.

    Returns the rows of *df_sorted* whose "Feature" value was kept, in their
    original order.
    """
    kept_names = []
    for candidate in df_sorted["Feature"]:
        shadowed = False
        for earlier in kept_names:
            if candidate in earlier and candidate != earlier:
                shadowed = True
                break
        if shadowed:
            continue
        kept_names.append(candidate)

    keep_mask = df_sorted["Feature"].isin(kept_names)
    return df_sorted[keep_mask]

# Apply filtering independently for each target.
# Correlation tables are ordered by ascending p-value before filtering.
filtered_corrs_fair = (
    corrs_df.sort_values(by="P (FAIR)").pipe(filter_by_dependency)
)
filtered_corrs_percent_fair = (
    corrs_df.sort_values(by="P (% FAIR)").pipe(filter_by_dependency)
)
filtered_corrs_exposure = (
    corrs_df.sort_values(by="P (Exposure)").pipe(filter_by_dependency)
)

# Importance tables are ordered by descending importance before filtering.
filtered_importance_fair = (
    importance_df
    .sort_values(by="Importance (FAIR)", ascending=False)
    .pipe(filter_by_dependency)
)
filtered_importance_percent_fair = (
    importance_df
    .sort_values(by="Importance (% FAIR)", ascending=False)
    .pipe(filter_by_dependency)
)
filtered_importance_exposure = (
    importance_df
    .sort_values(by="Importance (Exposure)", ascending=False)
    .pipe(filter_by_dependency)
)

# Save results: each filtered table goes to its own CSV in the current working
# directory, without the index column.
for csv_name, table in (
    ("correlation_results_filtered_fair.csv", filtered_corrs_fair),
    ("correlation_results_filtered_percent_fair.csv", filtered_corrs_percent_fair),
    ("correlation_results_filtered_exposure.csv", filtered_corrs_exposure),
    ("feature_importance_filtered_fair.csv", filtered_importance_fair),
    ("feature_importance_filtered_percent_fair.csv", filtered_importance_percent_fair),
    ("feature_importance_filtered_exposure.csv", filtered_importance_exposure),
):
    table.to_csv(csv_name, index=False)

print("✅ Analysis complete. Filtered results saved to CSV files.")


✅ Analysis complete. Filtered results saved to CSV files.
