In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# ---------------------------------------------------------------------------
# Notebook-wide configuration (run once, right after the imports)
# ---------------------------------------------------------------------------

# pandas display: never truncate columns, show floats with 2 decimal places
pd.options.display.max_columns = None
pd.options.display.precision = 2

# Plot appearance: dark-grid style sheet first, then the "husl" colour cycle
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Seed NumPy's global random generator so any sampling is reproducible
np.random.seed(42)

# Default figure size; set after the style sheet so the style cannot override it
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Load dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/insurance.csv')

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")

# Preview the first rows.
# Jupyter only renders the *last* expression of a cell, so a bare `df.head(10)`
# in the middle of the cell is evaluated and silently discarded; print it.
print(df.head(10))

# Basic info (df.info() writes its report to stdout by itself)
df.info()

# Statistical summary -- printed explicitly for the same reason as the preview
print(df.describe())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Check data types
print("\nData Types:")
print(df.dtypes)

In [None]:

# ============================================================================
# TARGET VARIABLE: CHARGES
# ============================================================================
print("=== TARGET VARIABLE: CHARGES ===\n")

# Work on the target column through one handle instead of re-indexing df
charges = df['charges']

# Summary statistics
print(f"Mean: ${charges.mean():,.2f}")
print(f"Median: ${charges.median():,.2f}")
print(f"Std Dev: ${charges.std():,.2f}")
print(f"Min: ${charges.min():,.2f}")
print(f"Max: ${charges.max():,.2f}")

# One row of three panels: histogram | box plot | normal Q-Q plot
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
hist_ax, box_ax, qq_ax = axes

# Histogram with mean / median reference lines
hist_ax.hist(charges, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set(
    xlabel='Charges ($)',
    ylabel='Frequency',
    title='Distribution of Charges',
)
hist_ax.axvline(charges.mean(), color='red', linestyle='--', label='Mean')
hist_ax.axvline(charges.median(), color='green', linestyle='--', label='Median')
hist_ax.legend()

# Box plot
box_ax.boxplot(charges)
box_ax.set(ylabel='Charges ($)', title='Box Plot of Charges')

# Q-Q plot against a normal distribution (points off the line => non-normal)
stats.probplot(charges, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot (Normality Check)')

plt.tight_layout()
plt.show()

# Skewness of the target
print(f"\nSkewness: {charges.skew():.2f}")
print("(Right-skewed if > 0, Left-skewed if < 0)")


In [None]:

# ============================================================================
# 5. NUMERIC FEATURES ANALYSIS
# ============================================================================

print("\n=== NUMERIC FEATURES ANALYSIS ===\n")

# Numeric predictors; later cells reuse this list
numeric_cols = ['age', 'bmi', 'children']

# 2 x 3 grid: top row holds the histograms, bottom row the matching box plots
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

for col, hist_ax, box_ax in zip(numeric_cols, axes[0], axes[1]):
    label = col.upper()
    values = df[col]

    # Histogram
    hist_ax.hist(values, bins=30, edgecolor='black', alpha=0.7)
    hist_ax.set(
        xlabel=label,
        ylabel='Frequency',
        title=f'Distribution of {label}',
    )

    # Box plot
    box_ax.boxplot(values)
    box_ax.set(ylabel=label, title=f'Box Plot of {label}')

plt.tight_layout()
plt.show()


In [None]:

# ============================================================================
# CATEGORICAL FEATURES ANALYSIS
# ============================================================================
print("\n=== CATEGORICAL FEATURES ANALYSIS ===\n")

categorical_cols = ['sex', 'smoker', 'region']

# Frequency table (absolute counts, then percentages) for every category
for col in categorical_cols:
    print(f"\n{col.upper()} - Value Counts:")
    print(df[col].value_counts())
    print("\nPercentages:")
    print(df[col].value_counts(normalize=True) * 100)

# Visualizations: top row = category counts, bottom row = charges per category
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

for idx, col in enumerate(categorical_cols):
    count_ax = axes[0, idx]
    box_ax = axes[1, idx]

    # Count plot
    df[col].value_counts().plot(kind='bar', ax=count_ax, color='skyblue', edgecolor='black')
    count_ax.set_xlabel(col.upper())
    count_ax.set_ylabel('Count')
    count_ax.set_title(f'Distribution of {col.upper()}')
    count_ax.tick_params(axis='x', rotation=45)

    # Box plot: categorical vs charges
    df.boxplot(column='charges', by=col, ax=box_ax)
    box_ax.set_xlabel(col.upper())
    box_ax.set_ylabel('Charges ($)')
    box_ax.set_title(f'Charges by {col.upper()}')
    # Rotate via the Axes itself (same call as the top row) rather than
    # switching pyplot's "current axes" with plt.sca()/plt.xticks().
    box_ax.tick_params(axis='x', rotation=45)

# DataFrame.boxplot(by=...) writes a figure-level suptitle
# ("Boxplot grouped by <col>") on every call. On this shared 2x3 figure the
# last call would label the whole figure "Boxplot grouped by region", which is
# wrong for five of the six panels -- clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()


In [None]:

# ============================================================================
# CORRELATION ANALYSIS
# ============================================================================
print("\n=== CORRELATION ANALYSIS ===\n")

# Numeric predictors plus the target, selected once and reused below
corr_frame = df[numeric_cols + ['charges']]

# Pairwise correlation matrix (DataFrame.corr default method)
corr_matrix = corr_frame.corr()
print(corr_matrix)

# Annotated heatmap, colour scale centred on zero correlation
heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=heat_ax,
)
heat_ax.set_title('Correlation Matrix')
plt.show()

# Pairplot: scatter for every pair of variables, KDE curves on the diagonal
sns.pairplot(corr_frame, diag_kind='kde', plot_kws={'alpha': 0.6})
# pairplot leaves its own figure current, so the suptitle lands on it
plt.suptitle('Pairplot of Numeric Features', y=1.02)
plt.show()


In [None]:

# ============================================================================
# KEY FEATURE INTERACTIONS
# ============================================================================
print("\n=== KEY FEATURE INTERACTIONS ===\n")

# AGE vs CHARGES and BMI vs CHARGES, one figure each, points coloured by
# smoker status. Both figures share the same recipe, so they are drawn in one
# loop instead of two copy-pasted blocks.
for feature, axis_label in [('age', 'Age'), ('bmi', 'BMI')]:
    plt.figure(figsize=(12, 6))
    # unique() keeps order of first appearance, which fixes the colour/legend order
    for smoker_status in df['smoker'].unique():
        subset = df[df['smoker'] == smoker_status]
        plt.scatter(subset[feature], subset['charges'], alpha=0.6, label=f'Smoker: {smoker_status}')
    plt.xlabel(axis_label)
    plt.ylabel('Charges ($)')
    plt.title(f'{axis_label} vs Charges (colored by Smoker Status)')
    plt.legend()
    plt.show()

# Statistical test: Smoker vs Non-smoker charges
# (.loc selects rows and column in one step instead of chained indexing)
smoker_charges = df.loc[df['smoker'] == 'yes', 'charges']
non_smoker_charges = df.loc[df['smoker'] == 'no', 'charges']

print(f"Smoker mean charges: ${smoker_charges.mean():,.2f}")
print(f"Non-smoker mean charges: ${non_smoker_charges.mean():,.2f}")
print(f"Difference: ${smoker_charges.mean() - non_smoker_charges.mean():,.2f}")

# T-test
# Welch's t-test (equal_var=False): scipy's default pools the two variances,
# which is only valid when both groups have similar spread. Smokers and
# non-smokers are independent groups of unequal size, and nothing in this
# notebook establishes equal variances, so the pooled test is not justified.
t_stat, p_value = stats.ttest_ind(smoker_charges, non_smoker_charges, equal_var=False)
print(f"\nT-test p-value: {p_value:.2e}")
print("Statistically significant!" if p_value < 0.05 else "Not significant")


In [None]:

# ============================================================================
# OUTLIER DETECTION (IQR rule)
# ============================================================================
print("\n=== OUTLIER DETECTION ===\n")

def detect_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[column]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the numeric column to inspect.
    factor : float, default 1.5
        Multiple of the IQR used to place the fences. The default reproduces
        the previous hard-coded behaviour; 3.0 is a common choice for flagging
        only extreme outliers.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)``: the subset of ``data``
        strictly below the lower fence or strictly above the upper fence,
        followed by the two fence values.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the fences would fall inside the IQR box).

    Notes
    -----
    Values lying exactly on a fence are not flagged. Missing values are never
    flagged, because both comparisons evaluate to False for NaN.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = data[column]
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Strict inequalities: a value equal to a fence is still an inlier.
    outside = (values < lower_bound) | (values > upper_bound)
    return data[outside], lower_bound, upper_bound

# Print the IQR fences and the outlier count/share for each numeric feature
# and for the target column.
total_rows = len(df)
for col in numeric_cols + ['charges']:
    outliers, lower, upper = detect_outliers_iqr(df, col)
    n_outliers = len(outliers)
    report = (
        f"\n{col.upper()}:",
        f"  Lower bound: {lower:.2f}",
        f"  Upper bound: {upper:.2f}",
        f"  Number of outliers: {n_outliers} ({n_outliers/total_rows*100:.1f}%)",
    )
    # One line per entry, exactly as four separate print() calls would emit
    print(*report, sep="\n")


In [None]:

# NOTE: duplicate cell removed.
# This cell was a verbatim copy of the "KEY FEATURE INTERACTIONS" cell that
# runs before "OUTLIER DETECTION". That earlier cell already draws the
# Age-vs-Charges and BMI-vs-Charges scatter plots, runs the smoker vs.
# non-smoker t-test, and defines smoker_charges, non_smoker_charges, t_stat
# and p_value. Running the copy only re-rendered the same two figures and
# re-printed the same numbers, so its code was dropped. This now-empty cell
# can be deleted from the notebook.
