# Task-3
Statistically validate or reject key hypotheses about risk drivers, which will form the basis of our new segmentation strategy.

In [1]:
import pandas as pd
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import os 
import sys
sys.path.append(os.path.abspath("../scripts"))
import warnings
warnings.filterwarnings('ignore')

In [2]:
# %%
os.chdir("..")  # Go up a directory
print(os.getcwd())

c:\Users\Belay\End-to-End-Insurance-Risk-Analytics-Predictive-Modeling


In [3]:
# Set chunk size (number of rows to read at a time)
chunk_size = 100000  

# Set the separator used in your data.txt file (update if needed)
sep = '|'  # for example, '|' or '\t' or ','

# Create an empty list to collect processed chunks (optional)
chunks = []

# Iterate over chunks of the file
for chunk in pd.read_csv("Data/MachineLearningRating_v3.txt", sep=sep, chunksize=chunk_size, engine='python', low_memory=True):
    # Example: you can do filtering or processing on each chunk here
    # For example, convert date column to datetime
    chunk['TransactionMonth'] = pd.to_datetime(chunk['TransactionMonth'], errors='coerce')
    
    # Example: filter rows if you want, e.g., only claims > 0
    # chunk = chunk[chunk['TotalClaims'].astype(float) > 0]
    
    # Store or process chunk - here we just append to list
    chunks.append(chunk)

# Combine all chunks (only if you have enough RAM)
df = pd.concat(chunks, ignore_index=True)

In [5]:
print(df.columns.tolist())

['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims']


In [6]:
import numpy as np
# Binary flag: Did the policy have any claim?
df['ClaimFlag'] = np.where(df['TotalClaims'] > 0, 1, 0)

# Claim Severity (avoid division by zero)
df['ClaimSeverity'] = df.apply(lambda row: row['TotalClaims'] / row['ClaimFlag'] if row['ClaimFlag'] > 0 else 0, axis=1)

# Margin
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

A/B Hypothesis Testing

In [7]:
provinces = df['Province'].unique()
print(provinces)

['Gauteng' 'KwaZulu-Natal' 'Mpumalanga' 'Eastern Cape' 'Western Cape'
 'Limpopo' 'North West' 'Free State' 'Northern Cape']


In [8]:
group_a = df[df['Province'] == 'Gauteng']
group_b = df[df['Province'] == 'Limpopo']

Statistical Testing

In [9]:
# a) Claim Frequency (categorical proportion)
# Use Chi-square test or two-proportion z-test:
from statsmodels.stats.proportion import proportions_ztest

count = np.array([group_a['ClaimFlag'].sum(), group_b['ClaimFlag'].sum()])
nobs = np.array([len(group_a), len(group_b)])

stat, pval = proportions_ztest(count, nobs)
print(f'Claim Frequency test p-value: {pval}')
if pval < 0.05:
    print("Reject null hypothesis → significant difference")
else:
    print("Accept null hypothesis → there is no significant difference")

Claim Frequency test p-value: 0.07991724614845505
Accept null hypothesis → there is no significant difference


In [10]:
# b) Claim Severity (continuous data, given claim occurred)
# Use t-test to compare means (only for policies with claims):
from scipy.stats import ttest_ind

# Filter policies where claims occurred
claims_df = df[df['TotalClaims'] > 0]

# Calculate Claim Severity = average claim amount for those policies
claim_severity = claims_df['TotalClaims'].mean()

# print("Claim Severity:", claim_severity)


severity_a = group_a[group_a['ClaimFlag'] == 1]['ClaimSeverity']
severity_b = group_b[group_b['ClaimFlag'] == 1]['ClaimSeverity']

stat, pval = ttest_ind(severity_a, severity_b, equal_var=False, nan_policy='omit')
print(f'Claim Severity test p-value: {pval}')

Claim Severity test p-value: 0.047887459586952126


In [11]:
# c) Margin (continuous variable)
# Use t-test to compare margins:

argin_a = group_a['Margin']
margin_b = group_b['Margin']

stat, pval = ttest_ind(argin_a, margin_b, equal_var=False, nan_policy='omit')
print(f'Margin test p-value: {pval}')

Margin test p-value: 0.0017399147061173148


For Claim Frequency: use Chi-square test of independence on contingency table.

In [12]:
import scipy.stats as stats

contingency_table = pd.crosstab(df['Province'], df['ClaimFlag'])
chi2, pval, dof, expected = stats.chi2_contingency(contingency_table)
print(f'Chi-square test for claim frequency across provinces p-value: {pval}')

Chi-square test for claim frequency across provinces p-value: 5.925510718204678e-19


In [13]:
# For continuous variables (Claim Severity, Margin): use ANOVA
from scipy.stats import f_oneway

groups = [df[df['Province'] == prov]['ClaimSeverity'].dropna() for prov in provinces]
f_stat, pval = f_oneway(*groups)
print(f'ANOVA for Claim Severity across provinces p-value: {pval}')

ANOVA for Claim Severity across provinces p-value: 1.6569173744506565e-07


Hypothesis Testing Summary : Interpretation & Business Recommendation

In [14]:
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import ttest_ind, chi2_contingency, f_oneway

# Assume df prepared with ClaimFlag, ClaimSeverity, Margin columns

# Chi-square test for claim frequency across all provinces
contingency_table = pd.crosstab(df['Province'], df['ClaimFlag'])
chi2, pval, dof, expected = chi2_contingency(contingency_table)
print(f'Claim Frequency by Province: p-value = {pval}')

# ANOVA for claim severity across provinces
provinces = df['Province'].unique()
groups = [df[df['Province'] == prov]['ClaimSeverity'].dropna() for prov in provinces]
f_stat, pval = f_oneway(*groups)
print(f'Claim Severity by Province: p-value = {pval}')

# ANOVA for margin across provinces
groups_margin = [df[df['Province'] == prov]['Margin'].dropna() for prov in provinces]
f_stat, pval = f_oneway(*groups_margin)
print(f'Margin by Province: p-value = {pval}')

Claim Frequency by Province: p-value = 5.925510718204678e-19
Claim Severity by Province: p-value = 1.6569173744506565e-07
Margin by Province: p-value = 0.0011450081247589
