In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, fname) for fname in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [23]:
# Core Libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: seaborn grid theme plus default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

# Shared five-colour palette (hex codes)
COLORS = [
    '#2E86AB',
    '#A23B72',
    '#F18F01',
    '#C73E1D',
    '#6A994E',
]

# Machine Learning - Regression
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GroupKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score, 
                             mean_absolute_percentage_error)

# Statistical Analysis
from scipy import stats
from scipy.stats import pearsonr

# Confirm the environment is ready and record the library versions in the output.
# The emoji literals were mis-encoded (UTF-8 bytes read as cp1252); restored here.
print("✅ All libraries imported successfully!")
print(f"📦 Pandas version: {pd.__version__}")
print(f"📦 NumPy version: {np.__version__}")
# Static banner: these figures are hard-coded text, not computed from the data
# (nothing has been loaded at this point in the notebook).
print("📊 Dataset: 100,000 Insurance Records | 54+ Features")



✅ All libraries imported successfully!
📦 Pandas version: 3.0.0
📦 NumPy version: 2.4.2
📊 Dataset: 100,000 Insurance Records | 54+ Features


In [24]:
# Raw records file; the bare filename is resolved against the working directory.
RAW_CSV_PATH = "medical_insurance.csv"
df = pd.read_csv(RAW_CSV_PATH)

In [25]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
data = df.copy(deep=True)

In [26]:
# Remove the leakage-prone columns; names absent from the frame are skipped silently.
drop_cols = [
    "claims_count",
    "avg_claim_amount",
    "total_claims_paid",
    "is_high_risk",
    "had_major_procedure",
]
data = data.drop(columns=drop_cols, errors="ignore")

In [27]:
# 3.1 --- LEAKAGE AUDIT ---
target = 'annual_medical_cost'
# Absolute correlation with the target above which a numeric column is flagged.
LEAKAGE_CORR_THRESHOLD = 0.85

# |correlation| of every numeric column with the target, strongest first.
corr = data.corr(numeric_only=True)[target].abs().sort_values(ascending=False)

# Flag suspicious correlations > 0.85. The target correlates perfectly with itself,
# so it always shows up in this list; it is excluded again before dropping.
suspect_features = corr[corr > LEAKAGE_CORR_THRESHOLD].index.tolist()
print("🚨 Possible Leakage Features:", suspect_features)

# Every flagged feature except the target is dropped unconditionally.
# NOTE(review): confirm with domain knowledge that these columns are really post-t0.
leaky_cols = [f for f in suspect_features if f != target]
data = data.drop(columns=leaky_cols, errors='ignore')

# 3.2 --- PROVENANCE TAGGING ---
# Exact column names treated as static attributes.
# NOTE(review): the *_num names presumably refer to encoded columns — verify they exist.
STATIC_COLS = {'person_id', 'sex_num', 'education_num', 'region_num'}
# Substrings that mark a column as a pre-t0 dynamic measurement.
PRE_T0_KEYWORDS = ('visit', 'hosp', 'medication', 'premium', 'deductible', 'copay', 'risk_proxy')

provenance = {}
for col in data.columns:
    if col in STATIC_COLS:
        provenance[col] = 'static'
    elif any(k in col for k in PRE_T0_KEYWORDS):
        provenance[col] = 'pre_t0_dynamic'
    else:
        # Anything matching neither rule still needs manual review.
        provenance[col] = 'uncertain'

# One row per column, with a single 'provenance' label column.
provenance_data = pd.DataFrame.from_dict(provenance, orient='index', columns=['provenance'])

# 3.3 --- DATA SPLITTING STRATEGY ---

# If person_id repeats across rows -> group split; else simple train/test.
# The membership check avoids a KeyError when the id column is not present.
has_repeated_ids = 'person_id' in data.columns and data['person_id'].duplicated().any()

if has_repeated_ids:
    # Keep all rows of one person in the same fold.
    splitter = GroupKFold(n_splits=5)
    fold_iter = splitter.split(data, data[target], groups=data['person_id'])
    for fold, (train_idx, val_idx) in enumerate(fold_iter):
        print(f"Fold {fold}: Train={len(train_idx)}, Val={len(val_idx)}")
        break  # preview one fold
else:
    # Single occurrence per person -> random 80/10/10 split:
    # hold out 20%, then halve the holdout into validation and test.
    train_data, holdout_data = train_test_split(data, test_size=0.2, random_state=42)
    val_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)
    print(f"Train={len(train_data)}, Val={len(val_data)}, Test={len(test_data)}")

# 3.4 --- PERMUTATION SANITY CHECK (optional quick)
# Baseline performance with shuffled target to ensure no leakage signals
# (to be done after model is built)

🚨 Possible Leakage Features: ['annual_medical_cost', 'monthly_premium', 'annual_premium']
Train=80000, Val=10000, Test=10000


In [28]:
# Show the column labels currently present in `data`.
data.columns

Index(['person_id', 'age', 'sex', 'region', 'urban_rural', 'income',
       'education', 'marital_status', 'employment_status', 'household_size',
       'dependents', 'bmi', 'smoker', 'alcohol_freq', 'visits_last_year',
       'hospitalizations_last_3yrs', 'days_hospitalized_last_3yrs',
       'medication_count', 'systolic_bp', 'diastolic_bp', 'ldl', 'hba1c',
       'plan_type', 'network_tier', 'deductible', 'copay', 'policy_term_years',
       'policy_changes_last_2yrs', 'provider_quality', 'risk_score',
       'annual_medical_cost', 'chronic_count', 'hypertension', 'diabetes',
       'asthma', 'copd', 'cardiovascular_disease', 'cancer_history',
       'kidney_disease', 'liver_disease', 'arthritis', 'mental_health',
       'proc_imaging_count', 'proc_surgery_count', 'proc_physio_count',
       'proc_consult_count', 'proc_lab_count'],
      dtype='str')

In [29]:
# --- PHASE 4: MODEL CREATION (Baseline Models + Preprocessing Pipeline) ---

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Copy the engineered dataset so the modelling steps never mutate `data`
data_model = data.copy()

# One seed for both splits and every stochastic estimator, so the metrics table
# is reproducible from run to run.
RANDOM_STATE = 42

# Drop the redundant premium column (a no-op when it is already gone) and name the target
data_model = data_model.drop(columns=['monthly_premium'], errors='ignore')
target = 'annual_medical_cost'

# Define features and target; the identifier column is not used as a predictor
X = data_model.drop(columns=[target, 'person_id'], errors='ignore')
y = data_model[target]

# Split into train, validation, test (80/10/10): hold out 20%, then halve the holdout
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE
)
assert len(X_train) + len(X_val) + len(X_test) == len(X), "split lost or duplicated rows"

# Identify numeric and categorical features.
# 'number' matches every numeric dtype (not only int64/float64), so narrower
# integer/float columns are not silently routed to one-hot encoding.
numeric_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = [col for col in X_train.columns if col not in numeric_features]

# --- PREPROCESSING PIPELINE ---
# Numeric columns: median imputation only. No scaling step is applied.
# NOTE(review): Ridge/Lasso penalties therefore act on raw feature scales — confirm intended.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill gaps with the mode, then one-hot encode; categories
# unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# --- BASELINE MODELS ---
# All hyperparameters are library defaults apart from the seed (and n_jobs, which
# only parallelises the forest and does not affect its predictions).
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
    "RandomForestRegressor": RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)
}

results = []

for name, model in models.items():
    # The same `preprocessor` object is shared by every pipeline and refit on
    # X_train each iteration; only the final estimator changes.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)

    # Score on the validation split only; the test split stays untouched here.
    preds = pipeline.predict(X_val)

    mae = mean_absolute_error(y_val, preds)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    r2 = r2_score(y_val, preds)

    results.append({"Model": name, "MAE": mae, "RMSE": rmse, "R2": r2})

# Convert to DataFrame for display (one row per model)
results_df = pd.DataFrame(results)

In [30]:
# Display the metrics table (Model, MAE, RMSE, R2), one row per baseline model.
results_df

Unnamed: 0,Model,MAE,RMSE,R2
0,LinearRegression,1756.881537,2773.010482,0.179087
1,Ridge,1756.886623,2773.019412,0.179082
2,Lasso,1757.087883,2773.373759,0.178872
3,XGBRegressor,1806.650335,2875.574222,0.117239
4,RandomForestRegressor,1865.15787,2861.928268,0.125597
