In [None]:
# import Libraries
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset (pipe-delimited text file).
file_path = "MachineLearningRating_v3.txt"
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. Chunked inference can leave a column holding
# mixed types, and the DtypeWarning that would report it is suppressed by the
# warnings.filterwarnings("ignore") call in the imports cell.
df = pd.read_csv(file_path, sep="|", low_memory=False)

In [None]:
# Fail fast if any column the analysis depends on is absent from the data.
required_cols = ['TotalClaims', 'TotalPremium', 'Province', 'PostalCode', 'Gender']
# Keep the order of required_cols so the error message lists columns predictably.
missing_cols = list(filter(lambda name: name not in df.columns, required_cols))
if len(missing_cols) > 0:
    raise KeyError(f"The following required columns are missing: {missing_cols}")

In [None]:
# Standardize the postal-code column name: 'PostalCode' becomes 'ZipCode'.
# The frame is modified in place; columns not named in the mapping are untouched.
df.rename({'PostalCode': 'ZipCode'}, axis='columns', inplace=True)

In [None]:
# Handle missing or assumed data.
# When the data has no NumberOfClaims column, derive one under the assumption
# that a policy row with TotalClaims > 0 represents exactly one claim.
if 'NumberOfClaims' not in df.columns:
    # Vectorized comparison; NaN compares as False and therefore maps to 0,
    # the same result the per-element `1 if x > 0 else 0` produced.
    df['NumberOfClaims'] = (df['TotalClaims'] > 0).astype('int64')

In [None]:
# Create derived metrics (all computed column-wise rather than row by row).

# Binary flag: 1 when the row has a positive claim amount, else 0.
df['claim_flag'] = (df['TotalClaims'] > 0).astype('int64')

# Average amount per claim. Rows with no claims (NumberOfClaims <= 0 or NaN)
# are set to NaN by the mask, which also discards any inf/NaN produced by
# dividing by zero.
df['claim_severity'] = (df['TotalClaims'] / df['NumberOfClaims']).where(df['NumberOfClaims'] > 0)

# Premium collected minus claims paid.
df['margin'] = df['TotalPremium'] - df['TotalClaims']

In [None]:
# Function to perform chi-squared test for claim frequency
def chi_square_test(group1, group2):
    """Compare claim frequency between two groups with a chi-squared test.

    A 2x2 contingency table is built from the ``claim_flag`` column of each
    group (rows: group1 / group2; columns: rows with a claim / rows without)
    and passed to ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Frames that each contain a ``claim_flag`` column.

    Returns
    -------
    float
        The p-value of the test, or NaN when the test is undefined because a
        row or column total of the table is zero (an empty group, or no rows
        with / without a claim across both groups).
    """
    claim1, n1 = group1['claim_flag'].sum(), len(group1)
    claim2, n2 = group2['claim_flag'].sum(), len(group2)
    contingency = np.array([[claim1, n1 - claim1], [claim2, n2 - claim2]])

    # chi2_contingency raises ValueError when any expected frequency is zero,
    # which happens exactly when some row or column total is zero. Report the
    # undefined result as NaN, the same way scipy's t-test does.
    if (contingency.sum(axis=0) == 0).any() or (contingency.sum(axis=1) == 0).any():
        return np.nan

    _, p, _, _ = stats.chi2_contingency(contingency)
    return p

In [None]:
#  Function to perform t-test for continuous variables
def t_test(group1, group2, column):
    """Return the p-value of a two-sample t-test on ``column``.

    Missing values are dropped from each group's column independently, and
    the test is run with ``equal_var=False`` (unequal variances allowed).
    """
    samples = [frame[column].dropna() for frame in (group1, group2)]
    result = stats.ttest_ind(samples[0], samples[1], equal_var=False)
    return result.pvalue

In [None]:
# Hypothesis 1: Risk differences across provinces (claim_flag)
# Every pair of provinces is compared with a chi-squared test on claim frequency.
# NOTE(review): for k provinces this runs k*(k-1)/2 tests, each judged at the
# 0.05 level with no multiple-comparison correction; consider an omnibus test
# or an adjusted threshold before drawing conclusions from individual pairs.
print("1. Province Differences (Claim Frequency):")
provinces = df['Province'].dropna().unique()

# Slice each province once instead of re-filtering the full frame for every
# pair. Only claim_flag is kept because chi_square_test reads nothing else
# (the column's sum and the number of rows).
claims_by_province = {
    name: df.loc[df['Province'] == name, ['claim_flag']] for name in provinces
}

for i in range(len(provinces)):
    for j in range(i+1, len(provinces)):
        g1 = claims_by_province[provinces[i]]
        g2 = claims_by_province[provinces[j]]
        p_val = chi_square_test(g1, g2)
        print(f"   {provinces[i]} vs {provinces[j]} --> p = {p_val:.4f} --> {'REJECTED' if p_val < 0.05 else 'NOT Rejected'}")

1. Province Differences (Claim Frequency):
   Gauteng vs KwaZulu-Natal --> p = 0.0020 --> REJECTED
   Gauteng vs Mpumalanga --> p = 0.0005 --> REJECTED
   Gauteng vs Eastern Cape --> p = 0.0000 --> REJECTED
   Gauteng vs Western Cape --> p = 0.0000 --> REJECTED
   Gauteng vs Limpopo --> p = 0.0902 --> NOT Rejected
   Gauteng vs North West --> p = 0.0000 --> REJECTED
   Gauteng vs Free State --> p = 0.0027 --> REJECTED
   Gauteng vs Northern Cape --> p = 0.0053 --> REJECTED
   KwaZulu-Natal vs Mpumalanga --> p = 0.1212 --> NOT Rejected
   KwaZulu-Natal vs Eastern Cape --> p = 0.0002 --> REJECTED
   KwaZulu-Natal vs Western Cape --> p = 0.0001 --> REJECTED
   KwaZulu-Natal vs Limpopo --> p = 0.7308 --> NOT Rejected
   KwaZulu-Natal vs North West --> p = 0.0292 --> REJECTED
   KwaZulu-Natal vs Free State --> p = 0.0175 --> REJECTED
   KwaZulu-Natal vs Northern Cape --> p = 0.0247 --> REJECTED
   Mpumalanga vs Eastern Cape --> p = 0.0237 --> REJECTED
   Mpumalanga vs Western Cape --> p = 0

In [None]:
# Hypothesis 2: Risk differences between zip codes (claim_flag, claim_severity)
print("\n2. Zip Code Differences:")

# Compare the two zip codes that occur most often in the data.
zip_counts = df['ZipCode'].value_counts()
top_zips = zip_counts.nlargest(2).index.tolist()
zip1 = top_zips[0]
zip2 = top_zips[1]
g1 = df.loc[df['ZipCode'] == zip1]
g2 = df.loc[df['ZipCode'] == zip2]

# Frequency uses the chi-squared test; severity uses the t-test on claim_severity.
p_claim_freq = chi_square_test(group1=g1, group2=g2)
p_claim_sev = t_test(group1=g1, group2=g2, column='claim_severity')
print(f"   Claim Frequency ({zip1} vs {zip2}): p = {p_claim_freq:.4f} --> {'REJECTED' if p_claim_freq < 0.05 else 'NOT Rejected'}")
print(f"   Claim Severity  ({zip1} vs {zip2}): p = {p_claim_sev:.4f} --> {'REJECTED' if p_claim_sev < 0.05 else 'NOT Rejected'}")


2. Zip Code Differences:
   Claim Frequency (2000 vs 122): p = 0.0579 --> NOT Rejected
   Claim Severity  (2000 vs 122): p = 0.7002 --> NOT Rejected


In [None]:
# Hypothesis 3: Margin difference between zip codes
# Reuses g1 / g2 (the two most frequent zip codes) from the Hypothesis 2 cell,
# so that cell must be run first.
p_margin = t_test(group1=g1, group2=g2, column='margin')
print(f"   Margin Difference ({zip1} vs {zip2}): p = {p_margin:.4f} --> {'REJECTED' if p_margin < 0.05 else 'NOT Rejected'}")

   Margin Difference (2000 vs 122): p = 0.2445 --> NOT Rejected


In [None]:
# Hypothesis 4: Risk difference by gender (claim_flag, claim_severity)
print("\n3. Gender Differences:")

# Only rows labelled exactly 'Male' or 'Female' take part; any other Gender
# value (including missing) is left out of both groups.
g_m = df.loc[df['Gender'].eq('Male')]
g_f = df.loc[df['Gender'].eq('Female')]

# Frequency uses the chi-squared test; severity uses the t-test on claim_severity.
p_gender_freq = chi_square_test(group1=g_m, group2=g_f)
p_gender_sev = t_test(group1=g_m, group2=g_f, column='claim_severity')
print(f"   Claim Frequency (Male vs Female): p = {p_gender_freq:.4f} --> {'REJECTED' if p_gender_freq < 0.05 else 'NOT Rejected'}")
print(f"   Claim Severity  (Male vs Female): p = {p_gender_sev:.4f} --> {'REJECTED' if p_gender_sev < 0.05 else 'NOT Rejected'}")



3. Gender Differences:
   Claim Frequency (Male vs Female): p = 0.9515 --> NOT Rejected
   Claim Severity  (Male vs Female): p = 0.5680 --> NOT Rejected
