In [None]:
# notebooks/hypothesis_testing/postalcode_vs_margin.ipynb
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------
# Setup paths
# ---------------------------
# The project root is taken as two directories above the current working
# directory, so the result depends on where the kernel was started
# (presumably notebooks/hypothesis_testing/ -- confirm when running elsewhere).
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Guard the append so that re-running this cell does not keep stacking
# duplicate copies of the same entry on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
data_path = os.path.join(project_root, "data", "processed", "insurance_data_cleaned.csv")

# ---------------------------
# Load data
# ---------------------------
# Reads the processed CSV located at data_path into a DataFrame.
# NOTE(review): read_csv infers column dtypes; the comparison further down
# passes the postal codes as the strings "81" and "4" -- confirm that the
# PostalCode column dtype (or segment_two_groups) is compatible with that.
df = pd.read_csv(data_path)

# ---------------------------
# Import src functions
# ---------------------------
from src.hypothesis_testing.segmentation import segment_two_groups
from src.hypothesis_testing.metrics import calculate_margin
from src.hypothesis_testing.statistical_tests import t_test_numeric, proportion_test
from src.hypothesis_testing.reporting import interpret_p_value

# ---------------------------
# Split the data on PostalCode into the two groups being compared
# ---------------------------
postal_a, postal_b = segment_two_groups(
    df,
    "PostalCode",
    "81",
    "4",
)

# ---------------------------
# Margin test: one margin result per group, then compare the two
# ---------------------------
margin_a, margin_b = calculate_margin(postal_a), calculate_margin(postal_b)

# t_test_numeric's result is unpacked as (t_stat, p_val); only p_val feeds
# the printed interpretation.
t_stat, p_val = t_test_numeric(margin_a, margin_b)
print(f"Margin (mean difference): {interpret_p_value(p_val)}")

# ---------------------------
# Visualizations
# ---------------------------
# Build one long-format frame: a group label per observation plus its margin.
# pd.concat replaces Series.append, which was removed in pandas 2.0, and
# ignore_index=True gives a fresh 0..n-1 index so the values line up with the
# label list instead of carrying over each group's original row labels.
# Assumes margin_a / margin_b are one-dimensional (Series-like) -- confirm
# against calculate_margin.
margin_values = pd.concat(
    [pd.Series(margin_a), pd.Series(margin_b)],
    ignore_index=True,
)
margin_plot_df = pd.DataFrame({
    "PostalCode": ["81"] * len(margin_a) + ["4"] * len(margin_b),
    "Margin": margin_values,
})

plt.figure(figsize=(8, 5))
# Both axes are referenced by column name so seaborn reads them from `data`.
sns.boxplot(x="PostalCode", y="Margin", data=margin_plot_df)
plt.title("Margin by PostalCode")
plt.show()
