## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import sys 
import os
sys.path.append('..')
from src.load import load_csv
import warnings
warnings.filterwarnings('ignore')


## LOAD CLEANED DATA

In [2]:
# Relative path to the cleaned CSV that the rest of the notebook analyses.
PATH = '../data/clean_ML_rating.csv'
data = load_csv(PATH)
# Discard the 'Unnamed: 0' column when present; errors='ignore' makes this a no-op otherwise.
data.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)
data.head()

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,Month
0,145249,12827,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,2015-03
1,145249,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,2015-05
2,145249,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,2015-07
3,145255,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0,2015-05
4,145255,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,2015-07


## CREATE METRICS

In [3]:
# Only derive the counting columns when the loaded frame does not already have them.
if 'ClaimCount' not in data.columns:
    # 1 for rows with a positive claim amount, 0 otherwise.
    data['ClaimCount'] = data['TotalClaims'].gt(0).astype(int)
if 'PolicyCount' not in data.columns:
    # Every row counts as a single policy record.
    data['PolicyCount'] = 1

In [4]:
# Claim frequency: claims per policy record.
data['ClaimFrequency'] = data['ClaimCount'].div(data['PolicyCount'])
# Claim severity: claim amount per claim, forced to 0 on rows without a claim.
data['ClaimSeverity'] = np.where(
    data['ClaimCount'] > 0,
    data['TotalClaims'].div(data['ClaimCount']),
    0,
)
# Margin: premium earned minus claims paid.
data['Margin'] = data['TotalPremium'].sub(data['TotalClaims'])

### Helper function to test KPI differences

In [5]:
def run_anova_kpi(df, group_col, metric_col):
    """Run a one-way ANOVA on a numeric KPI across the levels of a grouping column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``group_col`` and ``metric_col``.
    group_col : str
        Column whose distinct values define the groups to compare.
    metric_col : str
        Numeric column whose group means are tested for equality.

    Returns
    -------
    float or None
        The p-value reported by ``scipy.stats.f_oneway``, or ``None`` when fewer
        than two groups have at least one non-missing observation (there is
        nothing to compare in that case).
    """
    samples = []
    for _, subset in df.groupby(group_col):
        values = subset[metric_col].dropna()
        # A group whose KPI values are all missing would otherwise be passed to
        # f_oneway as an empty sample and turn the whole test into NaN.
        if len(values) > 0:
            samples.append(values)

    if len(samples) < 2:
        return None
    return stats.f_oneway(*samples).pvalue

In [6]:
def report_result(hypothesis, metric, p_value, alpha=0.05):
    """Print the p-value of a hypothesis test together with the decision on H₀.

    Parameters
    ----------
    hypothesis : str
        Label of the hypothesis being tested (used as the line prefix).
    metric : str
        Name of the KPI the test was run on.
    p_value : float or None
        P-value of the test. ``None`` or NaN means no usable p-value exists.
    alpha : float, default 0.05
        Significance level; H₀ is rejected when ``p_value < alpha``.
    """
    # None is what run_anova_kpi returns when it cannot run a test; NaN is the
    # only value that is not equal to itself. Neither supports a decision.
    if p_value is None or p_value != p_value:
        print(f"{hypothesis} — {metric}: p-value = n/a → TEST INCONCLUSIVE (no valid p-value)")
        return

    decision = "REJECT H₀" if p_value < alpha else "FAIL TO REJECT H₀"
    print(f"{hypothesis} — {metric}: p-value = {p_value:.4f} → {decision}")

##  1. H₀: No risk differences across Provinces

In [7]:
print("------ HYPOTHESIS 1: Provinces ------")
# Frequency is tested on every row; severity only on rows that actually have a claim.
p_freq = run_anova_kpi(df=data, group_col='Province', metric_col='ClaimFrequency')
p_sev = run_anova_kpi(
    df=data.loc[data['ClaimCount'] > 0],
    group_col='Province',
    metric_col='ClaimSeverity',
)

report_result(hypothesis="Province Risk Differences", metric="Claim Frequency", p_value=p_freq)
report_result(hypothesis="Province Risk Differences", metric="Claim Severity", p_value=p_sev)

------ HYPOTHESIS 1: Provinces ------
Province Risk Differences — Claim Frequency: p-value = 0.0000 → REJECT H₀
Province Risk Differences — Claim Severity: p-value = 0.0000 → REJECT H₀


## 2. H₀: No risk differences between Postal Codes (Zip Codes)

In [8]:
# Keep only postal codes with more than 50 rows before comparing them.
zip_counts = data['PostalCode'].value_counts()
valid_zips = zip_counts.index[zip_counts > 50]
zip_data = data.loc[data['PostalCode'].isin(valid_zips)]

# Frequency uses all retained rows; severity only the rows with a claim.
p_freq_zip = run_anova_kpi(df=zip_data, group_col='PostalCode', metric_col='ClaimFrequency')
p_sev_zip = run_anova_kpi(
    df=zip_data.loc[zip_data['ClaimCount'] > 0],
    group_col='PostalCode',
    metric_col='ClaimSeverity',
)

print("\n------ HYPOTHESIS 2: Zip Codes ------")
report_result(hypothesis="Zip Code Risk Differences", metric="Claim Frequency", p_value=p_freq_zip)
report_result(hypothesis="Zip Code Risk Differences", metric="Claim Severity", p_value=p_sev_zip)


------ HYPOTHESIS 2: Zip Codes ------
Zip Code Risk Differences — Claim Frequency: p-value = 0.0000 → REJECT H₀
Zip Code Risk Differences — Claim Severity: p-value = 0.0266 → REJECT H₀


### HYPOTHESIS 3: There is no significant margin difference between zip codes

In [9]:

print("------ HYPOTHESIS 3: Zip Codes (Margin) ------")
# Reuses zip_data, the postal-code subset built for the previous hypothesis.
p_margin_zip = run_anova_kpi(df=zip_data, group_col='PostalCode', metric_col='Margin')
report_result(
    hypothesis="Zip Code Margin Differences",
    metric="Margin",
    p_value=p_margin_zip,
)

------ HYPOTHESIS 3: Zip Codes (Margin) ------
Zip Code Margin Differences — Margin: p-value = 0.9300 → FAIL TO REJECT H₀


### HYPOTHESIS 4: There is no significant risk difference between Women and Men

In [10]:
print("\n------ HYPOTHESIS 4: Gender ------")
# Claim-frequency samples for the two genders, missing values removed.
male_freq = data.loc[data["Gender"] == "Male", "ClaimFrequency"].dropna()
female_freq = data.loc[data["Gender"] == "Female", "ClaimFrequency"].dropna()

# equal_var=False selects Welch's t-test (no equal-variance assumption).
gender_ttest = stats.ttest_ind(male_freq, female_freq, equal_var=False)
p_gender = gender_ttest.pvalue
report_result(hypothesis="Gender Risk Differences", metric="Claim Frequency", p_value=p_gender)


------ HYPOTHESIS 4: Gender ------
Gender Risk Differences — Claim Frequency: p-value = 0.8372 → FAIL TO REJECT H₀


## A/B TESTING FRAMEWORK — Feature Impact on Risk Metrics

In [12]:
def ab_test(df, feature, group_a_val, group_b_val, kpi, alpha=0.05):
    """Compare a KPI between two levels of a feature and print a balance check.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature``, ``kpi`` and any other attributes.
    feature : str
        Column that defines the two groups.
    group_a_val, group_b_val : scalar
        Values of ``feature`` selecting group A and group B.
    kpi : str
        Column to compare. Numeric (non-boolean) columns are compared with
        Welch's t-test; everything else with a chi-squared test of independence
        on the feature-by-KPI contingency table of the two groups.
    alpha : float, default 0.05
        Significance level used for the printed verdict.

    Returns
    -------
    float
        P-value of the KPI test, or NaN when either group has no rows.
    """
    group_a = df[df[feature] == group_a_val]
    group_b = df[df[feature] == group_b_val]

    print(f"\n======== A/B TEST ON {feature} ========")
    print(f"Group A ({group_a_val}): {len(group_a)} rows")
    print(f"Group B ({group_b_val}): {len(group_b)} rows\n")

    # Nothing can be compared if one side is empty; say so instead of
    # reporting a NaN p-value as "no significant difference".
    if group_a.empty or group_b.empty:
        print("➡ Cannot run test — at least one group has no rows")
        print("======================================\n")
        return float("nan")

    # ---- 1. KPI test ----
    # The pandas dtype helpers also understand extension dtypes (category,
    # nullable ints), on which np.issubdtype raises. Booleans stay on the
    # chi-squared path, as they did with the np.number check.
    kpi_is_numeric = (
        pd.api.types.is_numeric_dtype(df[kpi])
        and not pd.api.types.is_bool_dtype(df[kpi])
    )
    if kpi_is_numeric:
        # Numeric → t-test
        t_res = stats.ttest_ind(group_a[kpi].dropna(), group_b[kpi].dropna(), equal_var=False)
        p_value = t_res.pvalue
        test_name = "T-Test"
    else:
        # Categorical → Chi-Squared, restricted to the two groups under test
        # (not every level of the feature present in df).
        compared = pd.concat([group_a, group_b])
        table = pd.crosstab(compared[feature], compared[kpi])
        # Drop all-zero rows/columns (e.g. unobserved categorical levels),
        # which chi2_contingency cannot handle.
        table = table.loc[table.sum(axis=1) > 0, table.sum(axis=0) > 0]
        _, p_value, _, _ = stats.chi2_contingency(table)
        test_name = "Chi-Squared"

    print(f"KPI Test ({test_name}) — {kpi}: p-value = {p_value:.4f}")
    if p_value < alpha:
        print("➡ Significant difference — Feature impacts KPI")
    else:
        print("➡ No significant difference — Feature does NOT impact KPI")

    # ---- 2. Balance Check: Are groups similar? ----
    print("**Balance Check on Other Attributes:**")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_cols = [c for c in numeric_cols if c not in [kpi, feature, "ClaimSeverity"]]

    for col in numeric_cols:
        a_vals = group_a[col].dropna()
        b_vals = group_b[col].dropna()

        if len(a_vals) < 10 or len(b_vals) < 10:
            continue  # skip small sample columns

        t = stats.ttest_ind(a_vals, b_vals, equal_var=False)
        print(f"{col}: p-value = {t.pvalue:.4f}")

    print("======================================\n")
    return p_value


#### A/B test check: no significant risk difference between Women and Men

In [13]:
# ab_test prints its own report; the trailing expression echoes whatever it returned.
p_gender = ab_test(
    df=data,
    feature='Gender',
    group_a_val='Male',
    group_b_val='Female',
    kpi='ClaimFrequency',
)
p_gender


Group A (Male): 42817 rows
Group B (Female): 6755 rows

KPI Test (T-Test) — ClaimFrequency: p-value = 0.8372
➡ No significant difference — Feature does NOT impact KPI
**Balance Check on Other Attributes:**
UnderwrittenCoverID: p-value = 0.0000
PolicyID: p-value = 0.0000
PostalCode: p-value = 0.0000
mmcode: p-value = 0.0000
RegistrationYear: p-value = 0.0105
Cylinders: p-value = 0.0000
cubiccapacity: p-value = 0.0000
kilowatts: p-value = 0.0000
NumberOfDoors: p-value = 0.0000
CustomValueEstimate: p-value = 0.2023
SumInsured: p-value = 0.1914
CalculatedPremiumPerTerm: p-value = 0.0916
TotalPremium: p-value = 0.0000
TotalClaims: p-value = 0.7670
ClaimCount: p-value = 0.8372
PolicyCount: p-value = nan
Margin: p-value = 0.8015



#### A/B test check: significant risk difference between provinces (Gauteng vs. Western Cape)

In [14]:
# Compare Gauteng with Western Cape; ab_test prints its own report and the
# trailing expression echoes whatever it returned.
p_region = ab_test(
    df=data,
    feature='Province',
    group_a_val='Gauteng',
    group_b_val='Western Cape',
    kpi='ClaimFrequency',
)
p_region


Group A (Gauteng): 393865 rows
Group B (Western Cape): 170796 rows

KPI Test (T-Test) — ClaimFrequency: p-value = 0.0000
➡ Significant difference — Feature impacts KPI
**Balance Check on Other Attributes:**
UnderwrittenCoverID: p-value = 0.0000
PolicyID: p-value = 0.0000
PostalCode: p-value = 0.0000
mmcode: p-value = 0.0000
RegistrationYear: p-value = 0.0000
Cylinders: p-value = 0.0000
cubiccapacity: p-value = 0.0000
kilowatts: p-value = 0.0000
NumberOfDoors: p-value = 0.0000
CustomValueEstimate: p-value = 0.0004
SumInsured: p-value = 0.0797
CalculatedPremiumPerTerm: p-value = 0.0171
TotalPremium: p-value = 0.0007
TotalClaims: p-value = 0.0622
ClaimCount: p-value = 0.0000
PolicyCount: p-value = nan
Margin: p-value = 0.1636

