Hypothesis Testing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency, norm
from statsmodels.stats.weightstats import ztest
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:

df = pd.read_csv('../data2/cleaned_data.csv', low_memory=False)

In [4]:
df.isnull().sum()

Unnamed: 0                  0
UnderwrittenCoverID         0
PolicyID                    0
TransactionMonth            0
IsVATRegistered             0
Citizenship                 0
LegalType                   0
Title                       0
Language                    0
Bank                        0
AccountType                 0
MaritalStatus               0
Gender                      0
Country                     0
Province                    0
PostalCode                  0
MainCrestaZone              0
SubCrestaZone               0
ItemType                    0
mmcode                      0
VehicleType                 0
RegistrationYear            0
make                        0
Model                       0
Cylinders                   0
cubiccapacity               0
kilowatts                   0
bodytype                    0
NumberOfDoors               0
VehicleIntroDate            0
CustomValueEstimate         0
AlarmImmobiliser            0
TrackingDevice              0
CapitalOut

In [5]:
# Display unique values in the 'Province' column
print(df['Province'].unique())


['Gauteng' 'KwaZulu-Natal' 'Mpumalanga' 'Eastern Cape' 'Western Cape'
 'Limpopo' 'North West' 'Free State' 'Northern Cape']


1. Selecting metrics

In [6]:
df['ProfitMargin'] = df['TotalPremium'] - df['TotalClaims']

2. Data Segmentation

In [7]:
# Hypothesis 1: Risk differences across provinces
provinces = df['Province'].unique()
group_a_prov = df[df['Province'] == provinces[0]]['TotalClaims']
group_b_prov = df[df['Province'] == provinces[1]]['TotalClaims']

In [8]:
# Hypothesis 2: Risk differences between zip codes
zipcodes = df['PostalCode'].unique()
group_a_zip = df[df['PostalCode'] == zipcodes[0]]['TotalClaims']
group_b_zip = df[df['PostalCode'] == zipcodes[1]]['TotalClaims']

In [9]:
# Hypothesis 3: Margin difference between zip codes (Numerical - Z-test)
group_a_margin = df[df['PostalCode'] == zipcodes[0]]['ProfitMargin']
group_b_margin = df[df['PostalCode'] == zipcodes[1]]['ProfitMargin']

In [10]:
# Hypothesis 4: Risk difference between Women and Men
group_a_gender = df[df['Gender'] == 'Female']['TotalClaims']
group_b_gender = df[df['Gender'] == 'Male']['TotalClaims']

3. Statistical Testing

In [11]:
# Z-Test for numerical data (e.g., TotalClaims and ProfitMargin)
z_stat_prov, p_val_prov = ztest(group_a_prov, group_b_prov, alternative='two-sided')
z_stat_zip, p_val_zip = ztest(group_a_zip, group_b_zip, alternative='two-sided')
z_stat_margin, p_val_margin = ztest(group_a_margin, group_b_margin, alternative='two-sided')
z_stat_gender, p_val_gender = ztest(group_a_gender, group_b_gender, alternative='two-sided')

In [12]:
# Chi-Squared Test for categorical data
# Hypothesis 1: Risk differences across provinces (Categorical - Chi-squared test)

contingency_table_prov = pd.crosstab(df['Province'], df['StatutoryRiskType'])  
chi2_stat_prov, p_val_chi2_prov, dof_prov, _ = chi2_contingency(contingency_table_prov)

In [13]:
# Hypothesis 2: Risk differences between zip codes (Categorical - Chi-squared test)

contingency_table_zip = pd.crosstab(df['PostalCode'], df['StatutoryRiskType'])  
chi2_stat_zip, p_val_chi2_zip, dof_zip, _ = chi2_contingency(contingency_table_zip)

4. Analyzing Results

In [30]:

print(f'Z-Test - Province Risk Difference: p-value = {p_val_prov}')
print(f'Z-Test - Zip Code Risk Difference: p-value = {p_val_zip}')
print(f'Z-Test - Margin Difference between Zip Codes: p-value = {p_val_margin}')
print(f'Z-Test - Risk Difference between Women and Men: p-value = {p_val_gender}')

print(f'Chi-Squared Test - Province Risk Difference: p-value = {p_val_chi2_prov}')
print(f'Chi-Squared Test - Zip Code Risk Difference: p-value = {p_val_chi2_zip}')

Z-Test - Province Risk Difference: p-value = 0.19282381899062995
Z-Test - Zip Code Risk Difference: p-value = nan
Z-Test - Margin Difference between Zip Codes: p-value = 0.6267219156496311
Z-Test - Risk Difference between Women and Men: p-value = 0.8041063687146345
Chi-Squared Test - Province Risk Difference: p-value = 1.0
Chi-Squared Test - Zip Code Risk Difference: p-value = 1.0


Interpretation of results

In [31]:
alpha = 0.05
if p_val_prov < alpha:
    print(f"Reject the null hypothesis (p-value: {p_val_prov:.4f}): There are risk differences across provinces.")
else:
    print(f"Fail to reject the null hypothesis (p-value: {p_val_prov:.4f}): No significant risk differences across provinces.")



Fail to reject the null hypothesis (p-value: 0.1928): No significant risk differences across provinces.


In [32]:
alpha = 0.05
if p_val_zip < alpha:
    print(f"Reject the null hypothesis (p-value: {p_val_zip:.4f}): There are risk differences between zip codes.")
else:
    print(f"Fail to reject the null hypothesis (p-value: {p_val_zip:.4f}): There are no risk differences between zip codes .")


Fail to reject the null hypothesis (p-value: nan): There are no risk differences between zip codes .


In [33]:
alpha = 0.05
if p_val_margin < alpha:
    print(f"Reject the null hypothesis (p-value: {p_val_margin:.4f}): There are significant margin (profit) difference between zip codes.")
else:
    print(f"Fail to reject the null hypothesis (p-value: {p_val_margin:.4f}): There are no significant margin (profit) difference between zip codes.")


Fail to reject the null hypothesis (p-value: 0.6267): There are no significant margin (profit) difference between zip codes.


In [34]:
alpha = 0.05
if p_val_gender < alpha:
    print(f"Reject the null hypothesis (p-value: {p_val_gender:.4f}): There are significant risk difference between Women and Men.")
else:
    print(f"Fail to reject the null hypothesis (p-value: {p_val_gender:.4f}): There are not significant risk difference between Women and Men.")


Fail to reject the null hypothesis (p-value: 0.8041): There are not significant risk difference between Women and Men.


In [35]:
alpha = 0.05
if p_val_chi2_prov < alpha:
    print(f"Reject the null hypothesis (p-value: {p_val_chi2_prov:.4f}): There are risk differences across provinces.")
else:
    print(f"Fail to reject the null hypothesis (p-value: {p_val_chi2_prov:.4f}): No significant risk differences across provinces.")


Fail to reject the null hypothesis (p-value: 1.0000): No significant risk differences across provinces.


In [36]:
alpha = 0.05
if p_val_chi2_zip < alpha:
    print(f"Reject the null hypothesis (p-value: {p_val_chi2_zip:.4f}): There are risk differences between zip codes.")
else:
    print(f"Fail to reject the null hypothesis (p-value: {p_val_chi2_zip:.4f}): There are no risk differences between zip codes.")


Fail to reject the null hypothesis (p-value: 1.0000): There are no risk differences between zip codes.
