In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

import pandas as pd
import joblib

df = pd.read_csv(
    "C:/Projects/FYP/ML_SVNIT/medical_insurance - medical_insurance.csv"
)

bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)

# -----------------------------
# Features & target
# -----------------------------
X = df.drop('charges', axis=1)
y = df['charges']

num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

# -----------------------------
# Pipelines
# -----------------------------
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# -----------------------------
# ColumnTransformer
# -----------------------------
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# -----------------------------
# Full model pipeline
# -----------------------------
model = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ))
])

# -----------------------------
# Train / Test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("R2 Score :", r2_score(y_test, y_pred))
print("RMSE     :", mean_squared_error(y_test, y_pred))


joblib.dump(model, "model.pkl")
print("Model saved successfully as model.pkl")


R2 Score : 0.9501297140260642
RMSE     : 7654139.225924211
Model saved successfully as model.pkl
