In [1]:
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Load data
df = pd.read_csv(r"C:\10x AIMastery\Insurance-risk-analytics\SM\data\insurance.csv")


In [3]:
# Data Segmentation
# Check group equivalence (e.g., age distribution by region and sex)
print("Age Distribution by Region:\n", df.groupby('region')['age'].describe())
print("\nAge Distribution by Sex:\n", df.groupby('sex')['age'].describe())

Age Distribution by Region:
            count       mean        std   min    25%   50%   75%   max
region                                                               
northeast  324.0  39.268519  14.069007  18.0  27.00  39.5  51.0  64.0
northwest  325.0  39.196923  14.051646  19.0  26.00  39.0  51.0  64.0
southeast  364.0  38.939560  14.164585  18.0  26.75  39.0  51.0  64.0
southwest  325.0  39.455385  13.959886  19.0  27.00  39.0  51.0  64.0

Age Distribution by Sex:
         count       mean        std   min   25%   50%    75%   max
sex                                                               
female  662.0  39.503021  14.054223  18.0  27.0  40.0  51.75  64.0
male    676.0  38.917160  14.050141  18.0  26.0  39.0  51.00  64.0


In [4]:
# Hypothesis 1: No difference in mean charges across regions
# ANOVA test
regions = df['region'].unique()
charges_by_region = [df[df['region'] == r]['charges'] for r in regions]
f_stat, p_value_region = stats.f_oneway(*charges_by_region)
print(f"\nANOVA for Charges by Region: F={f_stat:.2f}, p-value={p_value_region:.4f}")
if p_value_region < 0.05:
    print("Reject H₀: Significant difference in charges across regions")
else:
    print("Fail to reject H₀: No significant difference in charges across regions")


ANOVA for Charges by Region: F=2.97, p-value=0.0309
Reject H₀: Significant difference in charges across regions


In [5]:
# Pairwise t-tests (for insight)
from itertools import combinations
for r1, r2 in combinations(regions, 2):
    t_stat, p_val = stats.ttest_ind(df[df['region'] == r1]['charges'], df[df['region'] == r2]['charges'])
    print(f"t-test {r1} vs {r2}: t={t_stat:.2f}, p-value={p_val:.4f}")

t-test southwest vs southeast: t=-2.43, p-value=0.0154
t-test southwest vs northwest: t=-0.08, p-value=0.9366
t-test southwest vs northeast: t=-1.18, p-value=0.2373
t-test southeast vs northwest: t=2.39, p-value=0.0169
t-test southeast vs northeast: t=1.36, p-value=0.1733
t-test northwest vs northeast: t=-1.13, p-value=0.2597


In [6]:
# Hypothesis 2: No difference in mean charges between sex
male_charges = df[df['sex'] == 'male']['charges']
female_charges = df[df['sex'] == 'female']['charges']
t_stat, p_value_sex = stats.ttest_ind(male_charges, female_charges)
print(f"\nt-test for Charges by Sex: t={t_stat:.2f}, p-value={p_value_sex:.4f}")
if p_value_sex < 0.05:
    print("Reject H₀: Significant difference in charges between male and female")
else:
    print("Fail to reject H₀: No significant difference in charges between male and female")


t-test for Charges by Sex: t=2.10, p-value=0.0361
Reject H₀: Significant difference in charges between male and female


In [10]:
# Visualization 1: Charges by Region (Box Plot)
plt.figure(figsize=(10, 6))
sns.boxplot(x='region', y='charges', data=df, palette='viridis')
plt.title('Charges Distribution by Region')
plt.xlabel('Region')
plt.ylabel('Charges')
plt.savefig(r'C:\10x AIMastery\Insurance-risk-analytics\figures\ab_test_region_charges.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='region', y='charges', data=df, palette='viridis')


In [12]:

# Visualization 2: Charges by Sex (Box Plot)
plt.figure(figsize=(10, 6))
sns.boxplot(x='sex', y='charges', data=df, palette='Set2')
plt.title('Charges Distribution by Sex')
plt.xlabel('Sex')
plt.ylabel('Charges')
plt.savefig(r'C:\10x AIMastery\Insurance-risk-analytics\figures\ab_test_sex_charges.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='sex', y='charges', data=df, palette='Set2')


In [14]:
# Visualization 3: Charges by Region and Smoker Status (Grouped Bar Plot)
plt.figure(figsize=(10, 6))
sns.barplot(x='region', y='charges', hue='smoker', data=df, palette='coolwarm')
plt.title('Average Charges by Region and Smoker Status')
plt.xlabel('Region')
plt.ylabel('Average Charges')
plt.savefig(r'C:\10x AIMastery\Insurance-risk-analytics\figures\ab_test_region_smoker.png')
plt.close()