In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv(r"C:\10x AIMastery\Insurance-risk-analytics\SM\data\insurance.csv")

print(df.head())
print(df.info())

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


In [4]:
# --- Data Summarization ---
# Descriptive Statistics: Calculate mean, median, std, min, max, and quartiles for numerical features
numerical_cols = ['age', 'bmi', 'children', 'charges']
desc_stats = df[numerical_cols].describe()
print("Descriptive Statistics:\n", desc_stats)

# Calculate variability (coefficient of variation = std/mean)
variability = df[numerical_cols].std() / df[numerical_cols].mean()
print("Coefficient of Variation:\n", variability)

# Data Structure: Verify data types
print("\nData Types:\n", df.dtypes)

Descriptive Statistics:
                age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010
Coefficient of Variation:
 age         0.358353
bmi         0.198875
children    1.100989
charges     0.912557
dtype: float64

Data Types:
 age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object


In [5]:
# --- Data Quality Assessment ---
# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing Values:\n", missing_values)
missing_percentage = (df.isnull().sum() / len(df)) * 100
print("Missing Percentage:\n", missing_percentage)


Missing Values:
 age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
Missing Percentage:
 age         0.0
sex         0.0
bmi         0.0
children    0.0
smoker      0.0
region      0.0
charges     0.0
dtype: float64


In [6]:
# Handle missing values
df['charges'] = df['charges'].fillna(df['charges'].median())
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])
df['smoker'] = df['smoker'].fillna(df['smoker'].mode()[0])
df['region'] = df['region'].fillna(df['region'].mode()[0])

# Check for duplicates
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")
if duplicates > 0:
    df = df.drop_duplicates()



Number of duplicate rows: 1


In [8]:
import os

# Create figures directory if it doesn't exist
os.makedirs('figures', exist_ok=True)

# Univariate Analysis
# Numerical Columns: Histograms with KDE
plt.figure(figsize=(12, 8))
for i, col in enumerate(numerical_cols, 1):
    plt.subplot(2, 2, i)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.savefig('figures/univariate_numerical.png')
plt.close()

# Categorical Columns: Bar charts
categorical_cols = ['sex', 'smoker', 'region']
plt.figure(figsize=(12, 8))
for i, col in enumerate(categorical_cols, 1):
    plt.subplot(2, 2, i)
    sns.countplot(data=df, x=col)
    plt.title(f'Count of {col}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('figures/univariate_categorical.png')
plt.close()

In [9]:
# Bivariate/Multivariate Analysis
# Average charges by region, sex, and smoker status
charges_by_region = df.groupby('region')['charges'].mean()
charges_by_sex = df.groupby('sex')['charges'].mean()
charges_by_smoker = df.groupby('smoker')['charges'].mean()
print("\nAverage Charges by Region:\n", charges_by_region)
print("Average Charges by Sex:\n", charges_by_sex)
print("Average Charges by Smoker:\n", charges_by_smoker)

# Correlation matrix for numerical variables
correlation_matrix = df[numerical_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.savefig('figures/correlation_matrix.png')
plt.close()

# Scatter plot for charges vs. bmi by region
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='bmi', y='charges', hue='region', alpha=0.6)
plt.title('Charges vs BMI by Region')
plt.savefig('figures/scatter_bmi_charges.png')
plt.close()


Average Charges by Region:
 region
northeast    13406.384516
northwest    12450.840844
southeast    14735.411438
southwest    12346.937377
Name: charges, dtype: float64
Average Charges by Sex:
 sex
female    12569.578844
male      13974.998864
Name: charges, dtype: float64
Average Charges by Smoker:
 smoker
no      8440.660307
yes    32050.231832
Name: charges, dtype: float64


In [10]:
# Outlier Detection
# Box plots for charges and bmi
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.boxplot(y=df['charges'])
plt.title('Box Plot of Charges')
plt.subplot(1, 2, 2)
sns.boxplot(y=df['bmi'])
plt.title('Box Plot of BMI')
plt.tight_layout()
plt.savefig('figures/outlier_boxplots.png')
plt.close()

# Remove outliers in charges using IQR method
Q1 = df['charges'].quantile(0.25)
Q3 = df['charges'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['charges'] < (Q1 - 1.5 * IQR)) | (df['charges'] > (Q3 + 1.5 * IQR)))]

In [11]:
# Data Comparison (Trends Over Geography)
# Average charges by region
premium_by_region = df.groupby('region')['charges'].mean()
plt.figure(figsize=(10, 6))
premium_by_region.plot(kind='bar')
plt.title('Average Charges by Region')
plt.ylabel('Average Charges')
plt.xticks(rotation=45)
plt.savefig('figures/charges_by_region_bar.png')
plt.close()

In [12]:
# Creative Visualizations
# Visualization 1: Average Charges by Region (Bar Chart)
plt.figure(figsize=(10, 6))
sns.barplot(x=charges_by_region.index, y=charges_by_region.values, palette='viridis')
plt.title('Average Charges by Region')
plt.xlabel('Region')
plt.ylabel('Average Charges')
plt.xticks(rotation=45)
plt.savefig('figures/creative_viz1_charges_by_region.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=charges_by_region.index, y=charges_by_region.values, palette='viridis')


In [13]:
# Visualization 2: Charges Distribution by Smoker Status (Box Plot)
plt.figure(figsize=(10, 6))
sns.boxplot(x='smoker', y='charges', data=df, palette='Set2')
plt.title('Charges Distribution by Smoker Status')
plt.xlabel('Smoker')
plt.ylabel('Charges')
plt.savefig('figures/creative_viz2_charges_by_smoker.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='smoker', y='charges', data=df, palette='Set2')


In [14]:
# Visualization 3: Charges vs Age by Smoker Status (Scatter Plot)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='age', y='charges', hue='smoker', style='smoker', size='bmi', sizes=(20, 200))
plt.title('Charges vs Age by Smoker Status')
plt.xlabel('Age')
plt.ylabel('Charges')
plt.legend(title='Smoker')
plt.savefig('figures/creative_viz3_charges_vs_age.png')
plt.close()