## Import Libraries

In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import sys 
import os
sys.path.append('..')
from src.load import load_csv
import warnings
warnings.filterwarnings('ignore')


## LOAD CLEANED DATA

In [3]:
# Relative path of the cleaned ratings CSV handed to the project loader.
PATH = '../data/clean_ML_rating.csv'
data = load_csv(PATH)
# Preview the first five rows (the default for head()).
data.head()

Unnamed: 0.1,Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,...,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,Month
0,0,145249,12827,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,...,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,2015-03
1,1,145249,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,...,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,2015-05
2,2,145249,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,...,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,2015-07
3,3,145255,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,...,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0,2015-05
4,4,145255,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,...,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,2015-07


## CREATE METRICS

In [4]:
# Add the count columns only when the loaded file does not already carry them.
if "ClaimCount" not in data.columns:
    # 1 when the row has a positive claim amount, otherwise 0.
    data["ClaimCount"] = data["TotalClaims"].gt(0).astype(int)
if "PolicyCount" not in data.columns:
    # Every row counts as a single policy record.
    data["PolicyCount"] = 1

In [5]:
# Claims per policy record.
data["ClaimFrequency"] = data["ClaimCount"].div(data["PolicyCount"])
# Claim amount per claim; rows without a claim are assigned 0.
data["ClaimSeverity"] = np.where(
    data["ClaimCount"].gt(0),
    data["TotalClaims"].div(data["ClaimCount"]),
    0,
)
# Premium minus claims for each row.
data["Margin"] = data["TotalPremium"].sub(data["TotalClaims"])

## Keep only rows with at least one claim (ClaimCount > 0) before severity tests

In [6]:
# Independent copy of the rows that have at least one claim; used by the severity tests.
severity_data = data.loc[data["ClaimCount"].gt(0)].copy()

## HELPER FUNCTION FOR RESULTS

In [None]:
def display_test_result(name, p_value, alpha=0.05):
    """Print the decision of a hypothesis test at significance level ``alpha``.

    Parameters
    ----------
    name : str
        Label printed in front of the result.
    p_value : float
        p-value produced by the statistical test.
    alpha : float, default 0.05
        Significance threshold; H₀ is rejected when ``p_value < alpha``.

    Notes
    -----
    A NaN p-value is reported as inconclusive. ``NaN < alpha`` evaluates to
    False, so without this check an undefined test would be printed as
    "FAIL TO REJECT H₀".
    """
    if np.isnan(p_value):
        print(
            f"{name}: p-value = nan → INCONCLUSIVE — "
            "p-value is undefined; check group sizes and variance."
        )
        return
    if p_value < alpha:
        decision = "REJECT H₀ — Significant difference detected."
    else:
        decision = "FAIL TO REJECT H₀ — No significant difference detected."
    print(f"{name}: p-value = {p_value:.4f} → {decision}")
def perform_anova(df, group_col, metric_col):
    """Run a one-way ANOVA of ``metric_col`` across the groups of ``group_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    group_col : str
        Column whose distinct values define the groups.
    metric_col : str
        Numeric column compared between groups; missing values are dropped
        within each group.

    Returns
    -------
    The result object of ``scipy.stats.f_oneway`` (exposes ``statistic`` and
    ``pvalue``).

    Raises
    ------
    ValueError
        If fewer than two groups have at least one non-missing observation.
    """
    samples = []
    for _, subset_group in df.groupby(group_col):
        values = subset_group[metric_col].dropna()
        # A group left empty after dropping NaNs would make the whole test
        # undefined, so it is excluded from the comparison.
        if len(values) > 0:
            samples.append(values)
    if len(samples) < 2:
        raise ValueError(
            f"ANOVA on '{metric_col}' by '{group_col}' needs at least two "
            f"non-empty groups; found {len(samples)}."
        )
    return stats.f_oneway(*samples)

## 1. H₀: No risk differences across Provinces

In [14]:
print("------ HYPOTHESIS 1: Provinces ------")
# Frequency is compared on every row.
anova_freq = perform_anova(df=data, group_col="Province", metric_col="ClaimFrequency")
display_test_result(name="Claim Frequency ANOVA", p_value=anova_freq.pvalue)

# Severity is compared on the claims-only subset.
anova_sev = perform_anova(df=severity_data, group_col="Province", metric_col="ClaimSeverity")
display_test_result(name="Claim Severity ANOVA", p_value=anova_sev.pvalue)
print()

------ HYPOTHESIS 1: Provinces ------
Claim Frequency ANOVA: p-value = 0.0000 → REJECT H₀ — Significant difference detected.
Claim Severity ANOVA: p-value = 0.0000 → REJECT H₀ — Significant difference detected.



## 2. H₀: No risk differences between Postal Codes (Zip Codes)

In [15]:
print("------ HYPOTHESIS 2: Zip Codes ------")
# Only postal codes with more than this many rows in `data` are compared.
MIN_ROWS_PER_ZIP = 50
zip_counts = data['PostalCode'].value_counts()
valid_zips = zip_counts[zip_counts > MIN_ROWS_PER_ZIP].index
zip_data = data[data['PostalCode'].isin(valid_zips)]
# NOTE(review): valid_zips is derived from all rows, so a postal code kept
# here may still contribute very few claim rows to the severity test —
# confirm this is intended.
zip_sev_data = severity_data[severity_data['PostalCode'].isin(valid_zips)]
# Claim Frequency ANOVA
anova_zip_freq = perform_anova(zip_data, "PostalCode", "ClaimFrequency")
display_test_result("Claim Frequency ANOVA", anova_zip_freq.pvalue)

# Claim Severity ANOVA
anova_zip_sev = perform_anova(zip_sev_data, "PostalCode", "ClaimSeverity")
display_test_result("Claim Severity ANOVA", anova_zip_sev.pvalue)
print()

Claim Frequency ANOVA: p-value = 0.0000 → REJECT H₀ — Significant difference detected.
Claim Severity ANOVA: p-value = 0.0266 → REJECT H₀ — Significant difference detected.



### HYPOTHESIS 3: There is no significant margin difference between zip codes

In [16]:

print("------ HYPOTHESIS 3: Zip Codes (Margin) ------")
# Compare Margin across the postal codes retained in zip_data.
anova_zip_margin = perform_anova(df=zip_data, group_col="PostalCode", metric_col="Margin")
display_test_result(name="Margin ANOVA", p_value=anova_zip_margin.pvalue)
print()

------ HYPOTHESIS 3: Zip Codes (Margin) ------
Margin ANOVA: p-value = 0.9300 → FAIL TO REJECT H₀ — No significant difference detected.



### HYPOTHESIS 4: There is no significant risk difference between Women and Men

In [17]:
print("------ HYPOTHESIS 4: Gender ------")
# Claim-frequency samples per gender, with missing values removed.
male_freq = data.loc[data["Gender"].eq("Male"), "ClaimFrequency"].dropna()
female_freq = data.loc[data["Gender"].eq("Female"), "ClaimFrequency"].dropna()

# equal_var=False selects Welch's t-test, which does not assume equal variances.
t_gender = stats.ttest_ind(male_freq, female_freq, equal_var=False)
display_test_result(
    name="Male vs Female Claim Frequency t-test",
    p_value=t_gender.pvalue,
)

------ HYPOTHESIS 4: Gender ------
Male vs Female Claim Frequency t-test: p-value = 0.8372 → FAIL TO REJECT H₀ — No significant difference detected.


                    Hypothesis                  Test Applied  \
0    Province Risk Differences  ANOVA (Frequency + Severity)   
1    Zip Code Risk Differences  ANOVA (Frequency + Severity)   
2  Zip Code Margin Differences                ANOVA (Margin)   
3      Gender Risk Differences            T-test (Frequency)   

                                Conclusion  
0  Reject or Fail based on printed results  
1                           Reject or Fail  
2                           Reject or Fail  
3                           Reject or Fail  
