# Medical Insurance — EDA and Model
This notebook explores the medical insurance dataset and trains a baseline gradient-boosting regression model to predict `charges`.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
# Locate the directory that holds insurance.csv: prefer the current working
# directory and fall back to its parent only when the file is not found here.
# (The previous condition was inverted: it switched to the parent directory
# precisely when the file WAS present in the working directory.)
ROOT = Path.cwd()
if not (ROOT / 'insurance.csv').exists():
    # `.parent` is used instead of `.parents[0]` so a filesystem-root cwd
    # does not raise IndexError.
    ROOT = ROOT.parent
df = pd.read_csv(ROOT / 'insurance.csv')
# Preview the first rows as the cell's displayed output.
df.head()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
# (include='all'); the result is the cell's displayed output.
df.describe(include='all')

In [None]:
# Draw one histogram with a KDE overlay per column, side by side in a
# single row of three panels.
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for ax, column in zip(axes, ('age', 'bmi', 'charges')):
    sns.histplot(df[column], ax=ax, kde=True)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Separate the features from the regression target.
target = 'charges'
y = df[target]
X = df.drop(columns=[target])

# Numeric columns are standardised; all remaining columns are one-hot encoded.
num = X.select_dtypes(include=[np.number]).columns
cat = X.select_dtypes(exclude=[np.number]).columns
pre = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('sc', StandardScaler())]), num),
        # handle_unknown='ignore': categories not seen during fit do not raise
        # at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat),
    ]
)

# Preprocessing and the gradient-boosting regressor are chained so they are
# fitted and applied together as a single estimator.
model = GradientBoostingRegressor(random_state=42)
pipe = Pipeline(steps=[('pre', pre), ('model', model)])

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
pipe.fit(Xtr, ytr)

# Score the held-out split; the trailing tuple is the cell's displayed output.
pred = pipe.predict(Xte)
mae = mean_absolute_error(yte, pred)
r2 = r2_score(yte, pred)
mae, r2
