In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency, ttest_ind, f_oneway

# Load the processed data (assuming you might save a cleaned version)
file_path = '../data/processed/MachineLearningRating_v3.txt' 

 
try:
    df = pd.read_csv(file_path)
    print("Processed data loaded successfully!")
    print(f"Data shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: The processed data file was not found at '{file_path}'.")
    print("Please ensure you have run the data preparation script first from your terminal:")
    print(">>> python src/prepare_data.py")
    # Assign an empty DataFrame to prevent further errors in the notebook
    df = pd.DataFrame()

  df = pd.read_csv(file_path)


Processed data loaded successfully!
Data shape: (1000098, 57)


In [6]:
print("\n--- Testing Hypothesis 1: Risk Across Provinces ---")
if not df.empty:
    # H1a: Claim Frequency (Chi-Squared Test)
    print("\nH1a: Testing Claim Frequency...")
    contingency_prov = pd.crosstab(df['province'], df['hasclaim'])
    chi2, p_freq, _, _ = chi2_contingency(contingency_prov)
    print(f"P-value: {p_freq:.5f}")
    if p_freq < alpha:
        print("-> Result: Reject H₀. Claim frequency differs significantly across provinces.")
    else:
        print("-> Result: Fail to reject H₀. Insufficient evidence to conclude frequency differs.")

    # H1b: Claim Severity (ANOVA)
    print("\nH1b: Testing Claim Severity...")
    province_groups = [group['totalclaims'].values for name, group in df_claims_only.groupby('province')]
    f_stat, p_sev = f_oneway(*province_groups)
    print(f"P-value: {p_sev:.5f}")
    if p_sev < alpha:
        print("-> Result: Reject H₀. Claim severity differs significantly across provinces.")
    else:
        print("-> Result: Fail to reject H₀. Insufficient evidence to conclude severity differs.")
    
    print("\nBusiness Recommendation: Since risk (both frequency and severity) demonstrably varies by province, a geographically-tiered pricing strategy is essential for profitability and market competitiveness.")


--- Testing Hypothesis 1: Risk Across Provinces ---

H1a: Testing Claim Frequency...
P-value: 0.00000
-> Result: Reject H₀. Claim frequency differs significantly across provinces.

H1b: Testing Claim Severity...
P-value: 0.00001
-> Result: Reject H₀. Claim severity differs significantly across provinces.

Business Recommendation: Since risk (both frequency and severity) demonstrably varies by province, a geographically-tiered pricing strategy is essential for profitability and market competitiveness.


In [7]:
print("\n--- Testing Hypotheses 2 & 3: Risk and Margin Across Zip Codes ---")
if not df.empty:
    # Strategy: Segment into Top 10 most frequent zip codes vs. "Other".
    top_10_zips = df['postalcode'].value_counts().nlargest(10).index
    df['zipgroup'] = df['postalcode'].apply(lambda x: str(x) if x in top_10_zips else 'Other')
    print("Segmented zip codes into 'Top 10' and 'Other' for analysis.")
    
    # H2a: Risk - Frequency (Chi-Squared)
    contingency_zip = pd.crosstab(df['zipgroup'], df['hasclaim'])
    _, p_freq_zip, _, _ = chi2_contingency(contingency_zip)
    print(f"\nZip Code Frequency (p-value): {p_freq_zip:.5f}")
    if p_freq_zip < alpha: print("-> Result: Reject H₀. Claim frequency differs significantly.")

    # H2b: Risk - Severity (ANOVA)
    df_claims_only['zipgroup'] = df_claims_only['postalcode'].apply(lambda x: str(x) if x in top_10_zips else 'Other')
    zip_groups_claims = [group['totalclaims'].values for name, group in df_claims_only.groupby('zipgroup')]
    _, p_sev_zip = f_oneway(*zip_groups_claims)
    print(f"Zip Code Severity (p-value): {p_sev_zip:.5f}")
    if p_sev_zip < alpha: print("-> Result: Reject H₀. Claim severity differs significantly.")

    # H3: Margin (ANOVA)
    margin_groups_zip = [group['margin'].values for name, group in df.groupby('zipgroup')]
    _, p_margin_zip = f_oneway(*margin_groups_zip)
    print(f"Zip Code Margin (p-value): {p_margin_zip:.5f}")
    if p_margin_zip < alpha: print("-> Result: Reject H₀. Profitability differs significantly.")

    print("\nBusiness Recommendation: Hyper-local factors (zip codes) are statistically significant drivers of risk and profit. ACIS should identify profitable zip codes for targeted 'low-risk' marketing campaigns and potentially add surcharges for high-risk zip codes.")


--- Testing Hypotheses 2 & 3: Risk and Margin Across Zip Codes ---
Segmented zip codes into 'Top 10' and 'Other' for analysis.

Zip Code Frequency (p-value): 0.00000
-> Result: Reject H₀. Claim frequency differs significantly.
Zip Code Severity (p-value): 0.00000
-> Result: Reject H₀. Claim severity differs significantly.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_claims_only['zipgroup'] = df_claims_only['postalcode'].apply(lambda x: str(x) if x in top_10_zips else 'Other')


Zip Code Margin (p-value): 0.43247

Business Recommendation: Hyper-local factors (zip codes) are statistically significant drivers of risk and profit. ACIS should identify profitable zip codes for targeted 'low-risk' marketing campaigns and potentially add surcharges for high-risk zip codes.


In [8]:
print("\n--- Testing Hypothesis 4: Risk Difference Between Genders ---")
if not df.empty:
    print(f"Full distribution of 'gender' column:\n{df['gender'].value_counts(normalize=True)}")
    df_gender = df[df['gender'].str.strip().str.lower().isin(['m', 'f'])]
    
    if df_gender.empty:
        print("\nError: No data for 'm' or 'f' found. Skipping gender analysis.")
    else:
        # H4a: Frequency (Chi-Squared)
        contingency_gender = pd.crosstab(df_gender['gender'], df_gender['hasclaim'])
        _, p_freq_gender, _, _ = chi2_contingency(contingency_gender)
        print(f"\nGender Claim Frequency (p-value): {p_freq_gender:.5f}")
        if p_freq_gender < alpha:
            print("-> Result: Reject H₀. Significant association between gender and claim frequency.")
        else:
            print("-> Result: Fail to reject H₀.")
            
        # H4b: Severity (T-test)
        df_gender_claims_only = df_claims_only[df_claims_only['gender'].str.strip().str.lower().isin(['m', 'f'])]
        if 'm' in df_gender_claims_only['gender'].unique() and 'f' in df_gender_claims_only['gender'].unique():
            male_claims = df_gender_claims_only[df_gender_claims_only['gender'] == 'm']['totalclaims']
            female_claims = df_gender_claims_only[df_gender_claims_only['gender'] == 'f']['totalclaims']
            _, p_sev_gender = ttest_ind(male_claims, female_claims, nan_policy='omit')
            print(f"Gender Claim Severity (p-value): {p_sev_gender:.5f}")
            if p_sev_gender < alpha:
                print("-> Result: Reject H₀. Significant difference in average claim amount between men and women.")
            else:
                print("-> Result: Fail to reject H₀.")
        else:
            print("\nCould not perform severity test: claims data not available for both genders.")

        print("\nBusiness Recommendation: Gender is a statistically valid predictor of risk. It should be retained as a rating factor in our models, subject to regulatory compliance. The 'Not Specified' gender category should be treated as its own group in predictive models.")


--- Testing Hypothesis 4: Risk Difference Between Genders ---
Full distribution of 'gender' column:
gender
Not specified    0.950433
Male             0.042813
Female           0.006754
Name: proportion, dtype: float64

Error: No data for 'm' or 'f' found. Skipping gender analysis.
