In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

In [2]:
# Load the raw medical-insurance dataset.
# A raw string keeps the Windows backslashes literal. The non-raw form relied on
# invalid escape sequences (\P, \A, \d, \m), which Python reports with a warning
# and intends to reject in a future version. The resulting path is unchanged.
df = pd.read_csv(r'E:\Projects\AI_playground\datasets\medical_insurance.csv')
df.head()

  df = pd.read_csv('E:\Projects\AI_playground\datasets\medical_insurance.csv')


Unnamed: 0,person_id,age,sex,region,urban_rural,income,education,marital_status,employment_status,household_size,...,liver_disease,arthritis,mental_health,proc_imaging_count,proc_surgery_count,proc_physio_count,proc_consult_count,proc_lab_count,is_high_risk,had_major_procedure
0,75722,52,Female,North,Suburban,22700.0,Doctorate,Married,Retired,3,...,0,1,0,1,0,2,0,1,0,0
1,80185,79,Female,North,Urban,12800.0,No HS,Married,Employed,3,...,0,1,1,0,0,1,0,1,1,0
2,19865,68,Male,North,Rural,40700.0,HS,Married,Retired,5,...,0,0,1,1,0,2,1,0,1,0
3,76700,15,Male,North,Suburban,15600.0,Some College,Married,Self-employed,5,...,0,0,0,1,0,0,1,0,0,0
4,92992,53,Male,Central,Suburban,89600.0,Doctorate,Married,Self-employed,2,...,0,1,0,2,0,1,1,0,1,0


In [None]:
# Drop Leakage Columns
# Columns this notebook labels as leakage (plus the row identifier) are removed
# before any features or the target are derived.
leakage_cols = [
    "person_id",
    "risk_score",
    "claims_count",
    "avg_claim_amount",
    "total_claims_paid",
    "annual_premium",
    "monthly_premium",
    "is_high_risk",
    "had_major_procedure",
]

# Drop only the columns that are still present and rebind `df` rather than
# mutating in place: re-running this cell is then a no-op instead of raising
# KeyError for columns that were already removed on the first run.
present_leakage_cols = [col for col in leakage_cols if col in df.columns]
df = df.drop(columns=present_leakage_cols)

In [4]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 45 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   age                          100000 non-null  int64  
 1   sex                          100000 non-null  object 
 2   region                       100000 non-null  object 
 3   urban_rural                  100000 non-null  object 
 4   income                       100000 non-null  float64
 5   education                    100000 non-null  object 
 6   marital_status               100000 non-null  object 
 7   employment_status            100000 non-null  object 
 8   household_size               100000 non-null  int64  
 9   dependents                   100000 non-null  int64  
 10  bmi                          100000 non-null  float64
 11  smoker                       100000 non-null  object 
 12  alcohol_freq                 69917 non-null   object 
 13  

In [7]:
# Feature Engineering (INSURANCE-SPECIFIC)

# Additive load: chronic conditions + medications + visits in the last year.
df["risk_load"] = (
    df["chronic_count"]
    .add(df["medication_count"])
    .add(df["visits_last_year"])
)

# Pairwise interaction terms (element-wise products of two existing columns).
df["metabolic_risk"] = df["bmi"].mul(df["hba1c"])
df["age_chronic"] = df["age"].mul(df["chronic_count"])

In [8]:
# Target Log Transformation
# Predictors are every column except the cost; the target is log(1 + cost).
target_name = "annual_medical_cost"
x = df.drop(columns=[target_name])
y = np.log1p(df[target_name])

In [9]:
# Feature Types
# Columns explicitly declared as categorical (these get one-hot encoded).
categorical_cols = [
    'smoker',
    'alcohol_freq',
    'sex',
    'region',
    'urban_rural',
    'marital_status',
    'education',
    'employment_status',
    'plan_type',
    'network_tier',
]

# Every remaining predictor column, in frame order, is handled as numeric.
numerical_cols = list(
    filter(lambda name: name not in categorical_cols, x.columns)
)

In [10]:
# preprocessing Pipeline
# Numeric columns are standardised; categorical columns are one-hot encoded
# with handle_unknown="ignore" so unseen categories do not raise at transform.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_cols,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [11]:
# Base Models
# Three base learners for the stack: a linear model and two boosted-tree models.
ridge = Ridge(alpha=1.0)

gbr_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 4,
    "random_state": 42,
}
gbr = GradientBoostingRegressor(**gbr_params)

xgb_params = {
    "n_estimators": 600,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
}
xgb = XGBRegressor(**xgb_params)

In [12]:
# Stacking Regressor
# The base learners feed a Ridge meta-learner; passthrough=False means the
# original features are not forwarded to the final estimator.
base_estimators = [("ridge", ridge), ("gbr", gbr), ("xgb", xgb)]
meta_learner = Ridge(alpha=0.5)

stack_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=meta_learner,
    passthrough=False,
)

In [None]:
# Full Pipeline
# Chain preprocessing and the stacked model into a single estimator.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", stack_model),
])

In [14]:
# Cross-Validation Evaluation
# Shuffled 5-fold CV with a fixed seed; R² is scored on y as passed in.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y, cv=cv, scoring="r2")

mean_r2, std_r2 = scores.mean(), scores.std()
print("Mean CV R² (log scale):", mean_r2)
print("Std CV R²:", std_r2)

Mean CV R² (log scale): 0.2184991212281727
Std CV R²: 0.004860389620463872


In [15]:
# Final Train-Test Evaluation
# Hold out 20% of the rows (fixed seed), fit on the rest, score on the hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted pipeline itself, so predict() can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

test_r2 = r2_score(y_test, y_pred)
print("Test R² (log scale):", test_r2)

Test R² (log scale): 0.21746128302939227
