In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
# Load the insurance dataset from a CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='insurance.csv')

# Preview the first ten records.
df.head(n=10)




# Checking and cleaning data

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Number of fully duplicated rows.
print(df.duplicated().sum())

# Distinct values of selected columns, one printed array per column.
for column in ('sex', 'age', 'children', 'smoker', 'region'):
    print(df[column].unique())

# Number of distinct values in every column (shown as the cell output).
df.nunique()

In [None]:
# Print the observed range of the two continuous columns.
range_report = (
    ("Minimum charges: $", 'charges', 'min'),
    ("Maximum charges: $", 'charges', 'max'),
    ("Minimum BMI:", 'bmi', 'min'),
    ("Maximum BMI:", 'bmi', 'max'),
)
for label, column, stat in range_report:
    # getattr(...)() calls Series.min() / Series.max() on the chosen column.
    print(label, getattr(df[column], stat)())

In [None]:
# How many rows are exact copies of an earlier row?
duplicate_count = df.duplicated().sum()
print(duplicate_count)

In [None]:
# Show the rows flagged as duplicates (second and later occurrences only).
is_duplicate = df.duplicated()
df.loc[is_duplicate]

In [None]:
# Show every copy of each duplicated row (first occurrence included).
# keep=False flags all members of a duplicate set, so no hand-copied values or
# float equality comparisons are needed to find the matching original row.
df[df.duplicated(keep=False)]

In [None]:
# Keep the first occurrence of each row, then confirm no duplicates remain.
df = df.drop_duplicates()
remaining_duplicates = df.duplicated().sum()
print(remaining_duplicates)

# Categorizing data

In [None]:
# Bucket age and BMI with pd.cut instead of nested lambdas.
# The edges are the same ones the AGE_BINS / BMI_BINS constants use in the
# "Data Preprocessing" section of this notebook, so both sections classify
# every row identically. The previous lambda cut "normal" at 24.9 while the
# later binning cuts at 25, which moved rows with 24.9 < BMI <= 25 between
# groups. Intervals are right-closed: (0, 25] normal, (25, 30] overweight,
# (30, inf) obese; ages (0, 34] -> '<35', (34, 44] -> '35-44', and so on.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 34, 44, 54, 64, np.inf],
    labels=['<35', '35-44', '45-54', '55-64', '65+'],
    include_lowest=True,
)
df['bmi_group'] = pd.cut(
    df['bmi'],
    bins=[0, 25, 30, np.inf],
    labels=['normal', 'overweight', 'obese'],
    include_lowest=True,
)
# Summarise charges for every observed sex / age group / BMI group / smoker
# combination: group size plus median, minimum and maximum charges.
groups = (
    df.groupby(['sex', 'age_group', 'bmi_group', 'smoker'], observed=True)
      .agg(
          n=('charges', 'size'),
          median=('charges', 'median'),
          min=('charges', 'min'),
          max=('charges', 'max'),
      )
      .reset_index()
)
cols_to_round = ['median', 'min', 'max']
groups[cols_to_round] = groups[cols_to_round].round(2)
groups['n'] = groups['n'].astype(int)

# Show the largest groups first. Sort BEFORE taking head(): sorting after
# head() would only reorder the first five rows of the unsorted table, and a
# sorted preview that is not the last expression of the cell is never displayed.
groups.sort_values('n', ascending=False).head()

In [None]:
# Bar chart of the number of records per sex; the totals are repeated in the
# x-axis label.
sex_counts = df['sex'].value_counts()
n_female = sex_counts.get('female', 0)
n_male = sex_counts.get('male', 0)

ax = sns.countplot(
    x='sex',
    data=df,
    order=['female', 'male'],
    palette=['pink', 'lightblue'],  # female -> pink, male -> lightblue
)
ax.set(ylabel='count', xlabel=f"female {n_female} | male {n_male}")
plt.show()

In [None]:

# Work on a copy whose smoker column carries display-friendly labels.
smoker_labels = {'yes': 'Smoker', 'no': 'Non-smoker'}
df_plot = df.assign(smoker=df['smoker'].replace(smoker_labels))

smoker_counts = df_plot['smoker'].value_counts()
n_smokers = smoker_counts.get('Smoker', 0)
n_non_smokers = smoker_counts.get('Non-smoker', 0)

# Colours follow the bar order below: Smoker -> pastel red, Non-smoker -> pastel green.
bar_colors = ['#FF9999', '#77DD77']

ax = sns.countplot(
    x='smoker',
    data=df_plot,
    order=['Smoker', 'Non-smoker'],
    palette=bar_colors,
)
ax.set(ylabel='Count', xlabel=f"Smoker {n_smokers} | Non-smoker {n_non_smokers}")

# Write the count above each bar.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=3)

plt.show()


In [None]:
# Display order of the categories in the pie charts.
bmi_order = ['normal', 'overweight', 'obese']
age_order = ['<35', '35-44', '45-54', '55-64', '65+']

# Copy of the data with both grouping columns as ordered categoricals.
df_plot = df.assign(
    bmi_group=pd.Categorical(df['bmi_group'], categories=bmi_order, ordered=True),
    age_group=pd.Categorical(df['age_group'], categories=age_order, ordered=True),
)

# Rows per category in display order; categories without rows count as 0.
bmi_raw_counts = df_plot['bmi_group'].value_counts()
bmi_counts = bmi_raw_counts.reindex(bmi_order).fillna(0).astype(int)

age_raw_counts = df_plot['age_group'].value_counts()
age_counts = age_raw_counts.reindex(age_order).fillna(0).astype(int)


def make_autopct(total):
    """Build a wedge-label formatter for ``Axes.pie(autopct=...)``.

    The returned callable receives a wedge's percentage and renders it as
    the percentage with one decimal, a newline, and the absolute count in
    parentheses. The count is recovered by rounding ``pct * total / 100``.
    """
    def _format_wedge(pct):
        count = int(round(pct * total / 100))
        return f"{pct:.1f}%\n({count})"

    return _format_wedge

def _draw_group_pie(ax, counts, title):
    """Draw one pie chart of group sizes on ``ax``.

    ``counts`` is a Series of row counts indexed by group label. Groups with
    no rows are left out of the pie: a zero-width wedge still gets a label and
    a "0.0%" annotation, which overlap the neighbouring wedge's text. The
    total shown in the title and used for the per-wedge counts is unaffected.
    """
    total = int(counts.sum())
    shown = counts[counts > 0]
    ax.pie(
        shown.values,
        labels=shown.index.tolist(),
        autopct=make_autopct(total),
        startangle=90,
        colors=plt.get_cmap('Pastel1').colors[:len(shown)],
        pctdistance=0.8,
    )
    ax.axis('equal')  # keep the pie circular
    ax.set_title(f"{title} (n={total})")


fig, axes = plt.subplots(1, 2, figsize=(12, 5))
_draw_group_pie(axes[0], bmi_counts, "BMI groups")
_draw_group_pie(axes[1], age_counts, "Age groups")

plt.suptitle("Distribution of BMI and Age groups")
plt.tight_layout()
plt.show()



# Determining which factors impact the charges


In [None]:
# Use seaborn's white-grid style for the plots.
sns.set_style("whitegrid")

# Scatter plot: charges against age.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='age', y='charges', ax=ax)
ax.set_title('1. Dependence of Medical Charges on Age', fontsize=16)
ax.set_xlabel('Age')
ax.set_ylabel('Charges')
plt.show()

In [None]:
# Scatter plot: charges against BMI.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='bmi', y='charges', ax=ax)
ax.set_title('2. Dependence of Medical Charges on BMI', fontsize=16)
ax.set_xlabel('bmi')
ax.set_ylabel('Charges')
plt.show()

In [None]:
# Create a box plot for charges vs. smoker
plt.figure(figsize=(10, 6))
sns.boxplot(x='smoker', y='charges', data=df, palette='coolwarm')
plt.title('4. Dependence of Medical Charges on Smoking Status', fontsize=16)
plt.xlabel('Smoker')
plt.ylabel('Medical Charges')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='sex', y='charges', data=df, palette='coolwarm')
plt.title('4. Dependence of Medical Charges on gender', fontsize=16)
plt.xlabel('Gender')
plt.ylabel('Medical Charges')
palette=['pink', 'lightblue']
plt.show()

In [None]:

plt.figure(figsize=(10, 6))
sns.boxplot(x='children', y='charges', data=df, palette='pastel')
plt.title('3. Dependence of Medical Charges on Number of Children', fontsize=16)
plt.xlabel('Number of Children')
plt.ylabel('Charges')
plt.show()

In [None]:
# Create a box plot for charges vs. region
plt.figure(figsize=(10, 6))
sns.boxplot(x='region', y='charges', data=df, palette='viridis')
plt.title('5. Dependence of Medical Charges on Region', fontsize=16)
plt.xlabel('Region')
plt.ylabel('Medical Charges')
plt.show()

In [None]:
# Charges against BMI, with smokers and non-smokers distinguished by both
# colour and marker style.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='bmi',
    y='charges',
    hue='smoker',
    style='smoker',
    palette='viridis',
    ax=ax,
)
ax.set_title('Medical expenses vs. BMI, by smoking status')
ax.set_xlabel('BMI')
ax.set_ylabel('charges')
plt.show()
print("Conclusion: The visualization shows that smokers have significantly higher costs, regardless of their BMI.")


In [None]:
# 1) Normalize smoker values and coerce numeric columns
df = df.copy()
df["smoker"] = df["smoker"].astype(str).str.strip().str.lower()

num_cols = ["age", "bmi", "children", "charges"]
for c in num_cols:
    # Values that cannot be parsed as numbers become NaN.
    df[c] = pd.to_numeric(df[c], errors="coerce")

# One heatmap panel per smoker level. Named `smoker_levels` rather than
# `groups` so that the grouped-charges summary DataFrame stored in `groups`
# by the categorizing section is not overwritten with a plain list.
smoker_levels = ["yes", "no"]

fig, axes = plt.subplots(1, 2, figsize=(10, 4), constrained_layout=True)

for ax, level in zip(axes, smoker_levels):
    sub = df[df["smoker"] == level]
    n = len(sub)
    ax.set_title(f"Spearman ρ — smoker={level} (n={n})")

    if n < 3:
        ax.text(0.5, 0.5, "Not enough rows", ha="center", va="center")
        ax.axis("off")
        continue

    # 2) Keep only columns with at least two distinct non-NaN values in this
    #    subset (Series.nunique ignores NaN by default).
    usable = [c for c in num_cols if sub[c].nunique() >= 2]
    if len(usable) < 2:
        ax.text(0.5, 0.5, "No varying numeric columns", ha="center", va="center")
        ax.axis("off")
        continue

    # 3) Let pandas compute the Spearman correlation matrix
    mat = sub[usable].corr(method="spearman")

    # 4) Plot on a fixed [-1, 1] colour scale so both panels are comparable
    sns.heatmap(mat, annot=True, vmin=-1, vmax=1, cmap="YlOrRd", ax=ax, square=True)

plt.show()

# Finding how different factors impact charges

## Data Preprocessing

In [None]:
# Category labels, listed in their intended display order.
AGE_GROUP = [
    '<35',
    '35-44',
    '45-54',
    '55-64',
    '65+',
]
BMI_GROUP = ['normal', 'overweight', 'obese']
SEX_GROUP = ['female', 'male']

# Bin edges; each list has one more edge than the matching label list.
AGE_BINS = [0, 34, 44, 54, 64, np.inf]
BMI_BINS = [0, 25, 30, np.inf]

In [None]:
def categorize_feature(df, feature_col, group_list, bin_list, right=True, include_lowest=True):
    """Bin one numeric column of ``df`` into labelled categories via ``pd.cut``.

    Parameters
    ----------
    df : DataFrame holding the column to bin.
    feature_col : name of the column in ``df``.
    group_list : labels for the bins, passed to ``pd.cut`` as ``labels``.
    bin_list : bin edges, passed to ``pd.cut`` as ``bins``.
    right, include_lowest : forwarded unchanged to ``pd.cut``.

    Returns
    -------
    The result of ``pd.cut`` for that column.
    """
    cut_options = {
        "bins": bin_list,
        "labels": group_list,
        "right": right,
        "include_lowest": include_lowest,
    }
    column_values = df[feature_col]
    return pd.cut(column_values, **cut_options)

In [None]:
# Assign the age bands from the shared bin definitions, then spot-check the
# last rows against the raw ages.
age_bands = categorize_feature(df, 'age', group_list=AGE_GROUP, bin_list=AGE_BINS)
df['age_group'] = age_bands
df.loc[:, ['age', 'age_group']].tail()

In [None]:
# Assign the BMI classes from the shared bin definitions, then spot-check the
# last rows against the raw BMI values.
bmi_classes = categorize_feature(df, 'bmi', group_list=BMI_GROUP, bin_list=BMI_BINS)
df['bmi_group'] = bmi_classes
df.loc[:, ['bmi', 'bmi_group']].tail()

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

## Smoking & BMI vs Charges



The “smoker premium” is the extra cost smokers pay, compared with non-smokers, at each BMI level.

### Create Median ratios

In [None]:
# Median charges per BMI group, one column per smoker value.
med = (
    df.groupby(["bmi_group", "smoker"], observed=False)["charges"]
      .median()
      .unstack("smoker")
)

# Smoker premium: absolute difference and ratio of the two medians.
smoker_median, non_smoker_median = med["yes"], med["no"]
prem_abs = (smoker_median - non_smoker_median).rename("smoker_premium_$")
prem_rel = (smoker_median / non_smoker_median).rename("smoker_premium_ratio")

smoker_premium = pd.concat(
    [med.round(0), prem_abs.round(0), prem_rel.round(2)],
    axis=1,
)

# Turn the index into a regular column that is always called 'bmi_group',
# including the case where the index carries no name.
index_label = smoker_premium.index.name or 'index'
sp = smoker_premium.reset_index().rename(columns={index_label: 'bmi_group'})
sp

In [None]:

# Reshape the two median columns into long form (one row per BMI group and
# smoker value) and drop combinations without a median.
med_long = pd.melt(
    sp,
    id_vars="bmi_group",
    value_vars=["no", "yes"],
    var_name="smoker",
    value_name="median_charges",
)
med_long = med_long.dropna()
med_long

## Plot 1 - Median Charges by BMI group and smoker

In [None]:
# Grouped bars: median charges per BMI group, one bar colour per smoker value.
pastel_colors = {"no": "#BFFCC6", "yes": "#A3C4FF"}

fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(
    x="bmi_group",
    y="median_charges",
    hue="smoker",
    data=med_long,
    palette=pastel_colors,
    ax=ax,
)
ax.set_title("Median charges by BMI group and smoker", fontweight="bold")
ax.set_xlabel("BMI group")
ax.set_ylabel("Median charges ($)")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


## Plot 2 - Smoker Premium in $ and ratio

In [None]:
# Two panels side by side: absolute premium on the left, ratio on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4), constrained_layout=True)
ax_abs, ax_ratio = axes
soft_blue = "#A3C4FF"

# Left panel: premium in dollars, with a dashed reference line at 0.
sns.barplot(x="bmi_group", y="smoker_premium_$", data=sp, color=soft_blue, ax=ax_abs)
ax_abs.axhline(0, ls="--", lw=1)
ax_abs.set_title("Smoker premium ($) by BMI group", fontweight="bold")
ax_abs.set_xlabel("BMI group")
ax_abs.set_ylabel("Median difference ($)")
ax_abs.tick_params(axis="x", rotation=0)

# Right panel: premium as a ratio, with a dashed reference line at 1.
sns.pointplot(
    x="bmi_group",
    y="smoker_premium_ratio",
    data=sp,
    markers="o",
    color=soft_blue,
    errorbar=None,
    ax=ax_ratio,
)
ax_ratio.axhline(1, ls="--", lw=1)
ax_ratio.set_title("Smoker premium (x) by BMI group", fontweight="bold")
ax_ratio.set_xlabel("BMI group")
ax_ratio.set_ylabel("Smokers / Non-smokers")
ax_ratio.tick_params(axis="x", rotation=0)

plt.show()

Smoker premium (absolute $):

- Smokers consistently pay a large extra cost.

- The premium increases sharply with obesity, reaching over $30k difference.

Smoker premium (relative ×):

- For normal and overweight individuals, smokers pay about 3× more than non-smokers.

- For obese individuals, the multiplier jumps above 5×, showing a strong combined effect of obesity and smoking.

**Conclusion: Smoking greatly increases medical costs, and the effect is amplified when combined with obesity**

## Plot 3 - Focus on Obese Only: Median Charges by Age

### Feature Engineering

In [None]:
# Restrict the analysis to the obese BMI group.
is_obese = df["bmi_group"] == "obese"
obese = df.loc[is_obese].copy()

# Median charges for each observed (age group, smoker) pair, in long form.
obese_medians = obese.groupby(["age_group", "smoker"], observed=True)["charges"].median()
med_sm = obese_medians.rename("median_charges").reset_index()
med_sm.head()


In [None]:
# Wide table: one row per age group, one column per smoker value.
pv = med_sm.pivot(index="age_group", columns="smoker", values="median_charges")
# Put the rows in AGE_GROUP order; labels missing from the pivot become NaN rows.
pv = pv.reindex(AGE_GROUP)
pv.head()


### Plot

In [None]:
# One line per smoker value across the age groups.
ax = sns.pointplot(
    x="age_group",
    y="median_charges",
    hue="smoker",
    data=med_sm,
    order=AGE_GROUP,
    dodge=True,
    markers="o",
    linestyles="-",
    errorbar=None,
)

# For every age group with both medians available, draw a double-headed arrow
# between them and label it with the dollar difference.
for x_pos, age_label in enumerate(AGE_GROUP):
    if age_label not in pv.index:
        continue
    pair = pv.loc[age_label, ["no", "yes"]]
    if pair.isna().any():
        continue

    non_smoker_median = pair["no"]
    smoker_median = pair["yes"]
    ax.annotate(
        "",
        xy=(x_pos, smoker_median),
        xytext=(x_pos, non_smoker_median),
        arrowprops={"arrowstyle": "<->", "lw": 1.4, "color": "#666",
                    "shrinkA": 3, "shrinkB": 3},
    )
    # The label sits at the midpoint of the two medians.
    ax.text(
        x_pos,
        pair.mean(),
        f"+${(smoker_median - non_smoker_median):,.0f}",
        ha="center",
        va="center",
        fontsize=9,
        bbox={"facecolor": "white", "edgecolor": "none", "alpha": 0.7, "pad": 1.5},
    )

ax.set_title("Obese only — Median charges by age group (smoker split)", fontweight="bold")
ax.set(xlabel="age_group", ylabel="median_charges")
plt.tight_layout()
plt.show()


**Conclusion: For obese individuals, smoking adds a large and consistent cost premium of about $33–34K across all age groups.**

## Plot 4 - Smoker Premium (ratio) by Age, for different BMIs

### Feature Engineering

In [None]:
# Median charges per (age group, smoker) row, one column per BMI group.
in_known_bmi_group = df["bmi_group"].isin(['normal', 'overweight', 'obese'])
median_by_cell = (
    df[in_known_bmi_group]
    .groupby(["age_group", "smoker", "bmi_group"], observed=False)["charges"]
    .median()
    .dropna()       # combinations without rows have no median
    .astype(int)
)
med2 = median_by_cell.unstack("bmi_group")

# Obesity penalty: obese vs. normal medians, as a difference and as a ratio.
med2["obesity_penalty_$"] = med2["obese"] - med2["normal"]
med2["obesity_penalty_ratio"] = (med2["obese"] / med2["normal"]).round(2)
med2

In [None]:
# Median charges per (age_group, bmi_group) row, one column per smoker value.
med = pd.pivot_table(
    df,
    index=["age_group", "bmi_group"],
    columns="smoker",
    values="charges",
    aggfunc="median",
    observed=True,
)
# Fix the column order to no / yes.
med = med.reindex(columns=["no", "yes"])
med.head()


In [None]:
# Keep only the combinations where both medians exist, then compute the
# smoker premium as the ratio of the two medians.
work = (
    med.dropna()
       .reset_index()
       .assign(premium_ratio=lambda table: table["yes"] / table["no"])
)
work.head()


### Plot

In [None]:
# Premium ratio across the age groups, one line per BMI group.
ax = sns.pointplot(
    x="age_group",
    y="premium_ratio",
    hue="bmi_group",
    hue_order=BMI_GROUP,
    data=work,
    dodge=0.3,
    markers="o",
    errorbar=None,
)
ax.axhline(1, ls="--", lw=1)  # dashed reference line at a ratio of 1
ax.set_title("Smoker premium (×) by age — Normal vs Overweight vs Obese")
ax.set_xlabel("Age group")
ax.set_ylabel("Smokers / Non-smokers (median)")
plt.tight_layout()
plt.show()

# The same ratios as a table, ordered by BMI group and then age group.
ratio_table = work[["age_group", "bmi_group", "premium_ratio"]].round(2)
ratio_table.sort_values(["bmi_group", "age_group"])


**Conclusion:**
- The smoker premium is highest at younger ages and especially extreme for obese individuals (those under 35 pay over 10× more than non-smokers).
- As age increases, the smoker premium decreases across all BMI groups, but obese smokers consistently face the largest relative cost penalty at every age.

## Plot 5 - Focus on Non-Smokers

### Feature Engineering

In [None]:
# Independent copy of the rows whose smoker value is "no".
is_non_smoker = df["smoker"].eq("no")
non_smoker = df.loc[is_non_smoker].copy()
non_smoker.head()

In [None]:
# Median charges of non-smokers for each observed (age group, BMI group) pair.
grouped_charges = non_smoker.groupby(["age_group", "bmi_group"], observed=True)["charges"]
med_age_bmi = grouped_charges.median().reset_index()
med_age_bmi.head()

### Plot

In [None]:
# Median charges of non-smokers across age groups, one line per BMI group.
fig, ax = plt.subplots(figsize=(7, 4))
sns.pointplot(
    x="age_group",
    y="charges",
    hue="bmi_group",
    data=med_age_bmi,
    dodge=0.3,
    markers="o",
    errorbar=None,
    ax=ax,
)
ax.set_title("Non-smokers — Median charges by age band and BMI")
ax.set_xlabel("Age Group")
ax.set_ylabel("Median charges ($)")
plt.tight_layout()
plt.show()

# The same medians as an age-group x BMI-group table, rounded to whole dollars.
median_table = med_age_bmi.pivot(index="age_group", columns="bmi_group", values="charges")
print(median_table.round(0))

- For non-smokers, medical charges rise steadily with age, but BMI has little effect.
- Normal, overweight, and obese non-smokers follow almost the same trend, with only minor differences.

**Conclusion: Age is the main driver of costs for non-smokers - not BMI**

# Summary



1.   Smoking greatly increases medical costs, and the effect is amplified when combined with obesity
2.   For obese individuals, smoking adds a large and consistent cost premium of about $33–34K across all age groups.
3.   The smoker premium is highest at younger ages and especially extreme for obese individuals (those under 35 pay over 10× more than non-smokers).
4.   Obesity by itself is a modest cost factor, much smaller than the effect of smoking. However, when combined with smoking, it becomes an aggravating factor.
5. Age is the main driver of costs for non-smokers - not BMI