### import necessary libraries

In [1]:
import pandas as pd
import numpy as np

### load data 

In [4]:
df = pd.read_csv(r"C:\Users\Administrator\Downloads\week-3\End-to-End-Insurance-Risk-Analytics-Predictive-Modeling\data\raw\MachineLearningRating_v3.txt", sep="|")

  df = pd.read_csv(r"C:\Users\Administrator\Downloads\week-3\End-to-End-Insurance-Risk-Analytics-Predictive-Modeling\data\raw\MachineLearningRating_v3.txt", sep="|")


###  Basic cleaning and types

In [5]:
df['TotalPremium'] = pd.to_numeric(df['TotalPremium'], errors='coerce')
df['TotalClaims'] = pd.to_numeric(df['TotalClaims'], errors='coerce')

### Create KPI columns

In [6]:
df['claim_occurred'] = np.where(df['TotalClaims'] > 0, 1, 0)
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

### Standardize Zip code column name if necessary

In [8]:
if 'PostalCode' in df.columns:
    df['Zip'] = df['PostalCode'].astype(str)
else:
    df['Zip'] = df.get('Zip', df.index.astype(str))


### Quick summaries

In [9]:
print("Overall sample size:", len(df))
print("Overall claim frequency:", df['claim_occurred'].mean())
print("Overall average claim severity (conditional):",
      df.loc[df['claim_occurred'] == 1, 'TotalClaims'].mean())

Overall sample size: 1000098
Overall claim frequency: 0.002787726802773328
Overall average claim severity (conditional): 23273.387063228372


### Exploratory checks and table of descriptive evidence

In [10]:
# Descriptive statistics for key variables
desc = df[['TotalPremium','TotalClaims','Margin']].describe().T
desc['missing_count'] = df[['TotalPremium','TotalClaims','Margin']].isna().sum().values
print(desc)

# Claim frequency by group examples
freq_by_province = df.groupby('Province')['claim_occurred'].agg(['mean','count']).rename(columns={'mean':'claim_freq'})
freq_by_province['claim_freq_pct'] = freq_by_province['claim_freq'] * 100
print(freq_by_province.sort_values('claim_freq_pct', ascending=False).head(10))

# Claim severity by province (conditional on claim)
sev_by_province = df[df['claim_occurred'] == 1].groupby('Province')['TotalClaims'].agg(['mean','median','count'])
print(sev_by_province.sort_values('mean', ascending=False).head(10))


                  count       mean          std            min  25%       50%  \
TotalPremium  1000098.0  61.905496   230.284513    -782.576754  0.0  2.178333   
TotalClaims   1000098.0  64.861190  2384.074695  -12002.412281  0.0  0.000000   
Margin        1000098.0  -2.955694  2367.136547 -392848.566930  0.0  2.157687   

                    75%            max  missing_count  
TotalPremium  21.929825   65282.603421              0  
TotalClaims    0.000000  393092.105263              0  
Margin        21.929825   65282.603421              0  
               claim_freq   count  claim_freq_pct
Province                                         
Gauteng          0.003356  393865        0.335648
KwaZulu-Natal    0.002845  169781        0.284484
Limpopo          0.002698   24836        0.269770
North West       0.002436  143287        0.243567
Mpumalanga       0.002428   52718        0.242801
Western Cape     0.002166  170796        0.216633
Eastern Cape     0.001648   30336        0.164821
F

### Save these tables as CSV

In [13]:
freq_by_province.to_csv(r"C:\Users\Administrator\Downloads\week-3\End-to-End-Insurance-Risk-Analytics-Predictive-Modeling\data\processed/freq_by_province.csv")
sev_by_province.to_csv(r"C:\Users\Administrator\Downloads\week-3\End-to-End-Insurance-Risk-Analytics-Predictive-Modeling\data\processed/sev_by_province.csv")
desc.to_csv(r"C:\Users\Administrator\Downloads\week-3\End-to-End-Insurance-Risk-Analytics-Predictive-Modeling\data\processed/desc_stats.csv")

### Hypothesis testing

## 1 — H₀: No risk differences across Provinces

### Claim Frequency (multiple groups)

#### Use chi-squared test of independence on contingency table Province x claim_occurred.

#### If counts are low, consider Fisher’s exact (rare with many groups) or combine low-count provinces.

In [14]:
from scipy.stats import chi2_contingency

cont = pd.crosstab(df['Province'], df['claim_occurred'])
chi2, p, dof, expected = chi2_contingency(cont)
print("Chi2:", chi2, "p-value:", p)
# interpret: p < 0.05 => reject H0 (differences in frequency by province)


Chi2: 104.19088107029361 p-value: 5.925510718204678e-19


## Claim Severity (continuous)

#### Use ANOVA to test mean severity differences across provinces (conditional on claim_occurred==1).

#### Use Kruskal-Wallis (non-parametric) if normality/variance assumptions fail.

In [15]:
from scipy.stats import f_oneway, kruskal

# Prepare list of arrays for provinces with enough observations
prov_groups = [group['TotalClaims'].values for name, group in df[df['claim_occurred']==1].groupby('Province') if len(group) >= 30]
prov_names = [name for name, group in df[df['claim_occurred']==1].groupby('Province') if len(group) >= 30]

fstat, p_anova = f_oneway(*prov_groups)
print("ANOVA F:", fstat, "p:", p_anova)

# If non-normal or unequal variances:
h, p_kw = kruskal(*prov_groups)
print("Kruskal-Wallis H:", h, "p:", p_kw)


ANOVA F: 6.207197517410859 p: 1.7443093629927493e-06
Kruskal-Wallis H: 105.49279871323623 p: 1.787923327856467e-20


### Effect size example for ANOVA

In [16]:
# compute eta-squared for ANOVA
ss_between = sum(len(g)*(g.mean()-df[df['claim_occurred']==1]['TotalClaims'].mean())**2 for g in prov_groups)
ss_total = sum(((g - df[df['claim_occurred']==1]['TotalClaims'].mean())**2).sum() for g in prov_groups)
eta2 = ss_between/ss_total
print("Eta-squared:", eta2)


Eta-squared: 0.013304734109893569


## 2 — H₀: No risk differences between Zip codes

### Zip codes → many categories. Strategy:

### Aggregate by Zip and require minimum sample size (e.g., >= 50 policies) to include zip.

### For Claim Frequency: use chi-squared across top-K zip codes or use logistic regression (Zip as factor).

### For Claim Severity: use ANOVA or pairwise t-tests with multiple testing correction.

#### Example: compare top N zip codes by volume:

In [17]:
zip_counts = df['Zip'].value_counts()
top_zips = zip_counts[zip_counts >= 50].index.tolist()

# Claim frequency table for top_zips
cont_zip = pd.crosstab(df.loc[df['Zip'].isin(top_zips),'Zip'], df['claim_occurred'])
chi2_zip, p_zip, _, _ = chi2_contingency(cont_zip)
print("Chi2 zip:", chi2_zip, "p:", p_zip)


Chi2 zip: 1427.6161592846206 p: 6.932017834632974e-36


### If multiple pairwise tests performed, apply Benjamini-Hochberg (FDR) or Bonferroni correction

## 3 — H₀: No significant risk difference between Men and Women
### Claim Frequency (chi-square test)

In [19]:
# Gender vs claim_occurred (frequency)
cont_gender = pd.crosstab(df['Gender'], df['claim_occurred'])
chi2_gender, p_gender, _, _ = chi2_contingency(cont_gender)
print("Chi2 Gender:", chi2_gender, "p:", p_gender)


Chi2 Gender: 7.255926312995721 p: 0.026570248768437145


## 4 — H₀: No significant margin difference between zip codes

### Margin is numerical, so we use a two-sample t-test.

### Strategy:

#### Identify high-risk zip codes and low-risk zip codes

#### Compare mean margin across both

In [20]:
# Compute zip-level claim frequency
zip_risk = df.groupby('Zip')['claim_occurred'].mean()


In [21]:
# Pick top 10 highest-risk and lowest-risk zips
high_risk_zips = zip_risk.sort_values(ascending=False).head(10).index
low_risk_zips  = zip_risk.sort_values(ascending=True).head(10).index


In [22]:
# Two-sample t-test
from scipy.stats import ttest_ind

margin_high = df[df['Zip'].isin(high_risk_zips)]['Margin']
margin_low  = df[df['Zip'].isin(low_risk_zips)]['Margin']

t_margin, p_margin = ttest_ind(margin_high, margin_low, equal_var=False)
print("Margin t-statistic:", t_margin, "p:", p_margin)


Margin t-statistic: -2.9311892344547306 p: 0.00345984417650611


## 5 — Multiple Testing Correction (Benjamini-Hochberg FDR)

### Collect all p-values:

In [23]:
p_values = [p, p_zip, p_gender, p_margin]  # province, zip, gender, margin
labels   = ["Province Risk", "Zip Risk", "Gender Risk", "Margin Difference"]


#### Convert to floats and apply BH correction:

In [24]:
from statsmodels.stats.multitest import multipletests

ps = [float(val) for val in p_values]

rej, p_adj, _, _ = multipletests(ps, alpha=0.05, method='fdr_bh')

print("\n=== Multiple Testing Correction (BH) ===")
for name, raw, adj, decision in zip(labels, p_values, p_adj, rej):
    print(f"{name:20s} | raw p={raw:.4e} | adj p={adj:.4e} | reject={decision}")



=== Multiple Testing Correction (BH) ===
Province Risk        | raw p=5.9255e-19 | adj p=1.1851e-18 | reject=True
Zip Risk             | raw p=6.9320e-36 | adj p=2.7728e-35 | reject=True
Gender Risk          | raw p=2.6570e-02 | adj p=2.6570e-02 | reject=True
Margin Difference    | raw p=3.4598e-03 | adj p=4.6131e-03 | reject=True


### 6 — Business Interpretation (Auto-Generated)

#### Add this at the end of your script for reporting:

In [25]:
print("\n\n=== BUSINESS INTERPRETATION ===")

if rej[0]:
    print("✔ We REJECT the hypothesis that provinces have equal risk.")
    print("  → Risk varies significantly across provinces. Premium adjustments may be needed.\n")
else:
    print("✘ We FAIL to reject the hypothesis for provinces.\n")

if rej[1]:
    print("✔ We REJECT the hypothesis that zip codes have equal risk.")
    print("  → Zip code is a strong geographic risk factor.\n")
else:
    print("✘ No significant evidence of zip-level risk differences.\n")

if rej[2]:
    print("✔ We REJECT the hypothesis that Men and Women have equal risk.")
    print("  → Gender shows statistically significant impact on claim frequency.\n")
else:
    print("✘ No evidence of gender-based risk differences.\n")

if rej[3]:
    print("✔ We REJECT the hypothesis that margin is equal across zip codes.")
    print("  → Profitability differs by location. High-risk zips reduce margins.\n")
else:
    print("✘ No significant margin difference found.\n")




=== BUSINESS INTERPRETATION ===
✔ We REJECT the hypothesis that provinces have equal risk.
  → Risk varies significantly across provinces. Premium adjustments may be needed.

✔ We REJECT the hypothesis that zip codes have equal risk.
  → Zip code is a strong geographic risk factor.

✔ We REJECT the hypothesis that Men and Women have equal risk.
  → Gender shows statistically significant impact on claim frequency.

✔ We REJECT the hypothesis that margin is equal across zip codes.
  → Profitability differs by location. High-risk zips reduce margins.

