In [3]:
# Standard library
from pathlib import Path  # for creating the model output directory

# Import core Python libraries
import pandas as pd  # for data manipulation
import numpy as np   # for numerical computations

# Visualization libraries (optional if you've done EDA in Excel)
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn for model building and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# For saving the trained model
import joblib

In [5]:
# Read the combined workbook (demographic + health check data) into a DataFrame
df = pd.read_excel(io="../data/Machine_Learning_Dataset.xlsx")
# Preview the top five rows as a quick sanity check of the load
df.head(5)

Unnamed: 0,Policy no,age,sex,bmi,charges in INR,Children,Smoker,Region,BMI Category,Age Group
0,PLC156898,19,female,27.9,16884.924,0,yes,southwest,Overweight,18-25
1,PLC156907,18,male,33.77,1725.5523,1,no,southeast,Obesity,18-25
2,PLC156916,28,male,33.0,4449.462,3,no,southeast,Obesity,26-35
3,PLC156925,33,male,22.705,21984.47061,0,no,northwest,Normal Weight,26-35
4,PLC156934,32,male,28.88,3866.8552,0,no,northwest,Overweight,26-35


In [None]:
# Separate the independent features (X) and the target variable (y).
# 'charges in INR' is the target, so it must not remain among the features.
# The policy number is just an identifier and not useful for prediction.
# NOTE: the column label in the loaded sheet is 'Policy no' (no trailing
# period); asking drop() for 'Policy no.' raises KeyError because no such
# column exists.
X = df.drop(columns=["charges in INR", "Policy no"])

# Defining the target variable (y) as the medical charges
y = df["charges in INR"]

In [27]:
# Numeric feature names: columns stored as 64-bit integers or floats
num_features = list(X.select_dtypes(include=["int64", "float64"]).columns)

# Categorical feature names: columns with dtype 'object' (plain strings)
cat_features = list(X.select_dtypes(include=["object"]).columns)

print("Numerical Features:", num_features)
print("Categorical Features:", cat_features)

Numerical Features: ['age', 'bmi', 'Children']
Categorical Features: ['sex', 'Smoker', 'Region', 'BMI Category', 'Age Group']


In [28]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# shuffled split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [29]:
# Column-wise preprocessing:
#   - 'num': standardize the numeric features
#   - 'cat': one-hot encode the categorical features, dropping the first
#            level of each feature
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(drop='first'), cat_features),
    ]
)

# Full ML pipeline: preprocessing followed by a seeded random forest regressor.
# The step names are referenced by the hyperparameter grid ('model__...').
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=42)),
    ]
)

In [30]:
# Hyperparameter candidates for the pipeline's 'model' step
param_grid = dict(
    model__n_estimators=[100, 200],        # number of trees
    model__max_depth=[10, 20, None],       # tree depth (None = unlimited)
    model__min_samples_split=[2, 5],       # minimum samples to split a node
)

# Exhaustive search over the grid with 5-fold cross-validation,
# scored by negative root mean squared error
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'model__max_depth': 10, 'model__min_samples_split': 5, 'model__n_estimators': 100}


In [32]:
# Generate predictions for the held-out test rows
y_pred = grid_search.predict(X_test)

# Score the predictions against the true charges
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = pow(mean_squared_error(y_true=y_test, y_pred=y_pred), 0.5)  # RMSE = MSE ** 0.5
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the metrics
print("Model Performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R² Score:", r2)

Model Performance:
MAE: 2530.8891474849834
RMSE: 4501.178420164309
R² Score: 0.8694958279177537


In [33]:
# Save best model to reuse later without retraining.
# joblib.dump does not create missing directories, so make sure ../models
# exists first; otherwise a fresh checkout fails here only after the whole
# grid search has already been run.
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(grid_search.best_estimator_, "../models/best_model.pkl")

['../models/best_model.pkl']