In [121]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

import optuna
import torch


In [122]:
df = pd.read_csv('insurance.csv')
df

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium
4,69,62.2,1.60,3.94000,True,Indore,retired,High
...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low


In [123]:
df.isnull().sum()

age                           0
weight                        0
height                        0
income_lpa                    0
smoker                        0
city                          0
occupation                    0
insurance_premium_category    0
dtype: int64

In [124]:
df.describe()

Unnamed: 0,age,weight,height,income_lpa
count,100.0,100.0,100.0,100.0
mean,47.18,83.894,1.7132,18.4006
std,16.649312,21.020278,0.110205,16.067465
min,18.0,51.1,1.5,0.53
25%,34.75,63.65,1.61,2.8975
50%,47.0,82.3,1.73,14.122583
75%,61.0,101.3,1.81,30.1625
max,75.0,119.8,1.9,50.0


In [125]:
df_feat = df.copy()
df_feat

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium
4,69,62.2,1.60,3.94000,True,Indore,retired,High
...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low


**Feature Engineering**

In [126]:
# Feature 1: BMI 
df_feat['BMI'] = df_feat['weight'] / (df_feat['height'] ** 2)
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,BMI
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875


In [127]:
# Feature 2: Age Group 
def age_group(age):
    if age <= 18:
        return "Child"
    elif age > 18 and age <= 40:
        return "Young Adult"
    elif age > 40 and age <= 60:
        return "Middle Aged Adult"
    else:
        return "Senior Citizen"
    

df_feat['Age Group'] = df_feat['age'].apply(age_group)
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,BMI,Age Group
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,Senior Citizen
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,Young Adult
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,Young Adult
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,Young Adult
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,Senior Citizen


In [128]:
# Feature 3: Lifestyle Risk - Combination of Smoker and BMI
def lifestyle_risk(row):
    if row['smoker'] and row['BMI'] > 30:
        return "High"
    elif row['smoker'] and row['BMI'] > 25:
        return "Medium"
    else:
        return "Low"


df_feat["Lifestyle Risk"] = df_feat.apply(lifestyle_risk, axis=1)
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,BMI,Age Group,Lifestyle Risk
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,Senior Citizen,Low
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,Young Adult,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,Young Adult,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,Young Adult,High
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,Senior Citizen,Low


In [129]:
# Feature 4: City Tier
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"]

def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3


df_feat["City Tier"] = df_feat["city"].apply(city_tier)
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,BMI,Age Group,Lifestyle Risk,City Tier
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,Senior Citizen,Low,2
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,Young Adult,Low,1
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,Young Adult,Low,2
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,Young Adult,High,1
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,Senior Citizen,Low,2


In [130]:
df_feat = df_feat[['age', 'income_lpa', 'BMI', 'smoker', 
                    'occupation', 'Age Group', 
                    'City Tier', 'Lifestyle Risk', 
                    'insurance_premium_category']]

df_feat.head()

Unnamed: 0,age,income_lpa,BMI,smoker,occupation,Age Group,City Tier,Lifestyle Risk,insurance_premium_category
0,67,2.92,49.227482,False,retired,Senior Citizen,2,Low,High
1,36,34.28,30.189017,False,freelancer,Young Adult,1,Low,Low
2,39,36.64,21.118382,False,freelancer,Young Adult,2,Low,Low
3,22,3.34,45.5359,True,student,Young Adult,1,High,Medium
4,69,3.94,24.296875,True,retired,Senior Citizen,2,Low,High


In [131]:
X = df_feat.drop(columns=['insurance_premium_category'])
y = df_feat['insurance_premium_category']

print(X.head())
print(y.head())

   age  income_lpa        BMI  ...       Age Group City Tier Lifestyle Risk
0   67        2.92  49.227482  ...  Senior Citizen         2            Low
1   36       34.28  30.189017  ...     Young Adult         1            Low
2   39       36.64  21.118382  ...     Young Adult         2            Low
3   22        3.34  45.535900  ...     Young Adult         1           High
4   69        3.94  24.296875  ...  Senior Citizen         2            Low

[5 rows x 8 columns]
0      High
1       Low
2       Low
3    Medium
4      High
Name: insurance_premium_category, dtype: object


In [132]:
df_feat.head()

Unnamed: 0,age,income_lpa,BMI,smoker,occupation,Age Group,City Tier,Lifestyle Risk,insurance_premium_category
0,67,2.92,49.227482,False,retired,Senior Citizen,2,Low,High
1,36,34.28,30.189017,False,freelancer,Young Adult,1,Low,Low
2,39,36.64,21.118382,False,freelancer,Young Adult,2,Low,Low
3,22,3.34,45.5359,True,student,Young Adult,1,High,Medium
4,69,3.94,24.296875,True,retired,Senior Citizen,2,Low,High


In [133]:
# Define categorical and numerical features

categorical_features = ['occupation', 'Age Group', 'Lifestyle Risk', 'City Tier', 'smoker']
numerical_features = ["income_lpa", 'BMI', 'age']

In [134]:
# Create a ColumnTransformer to handle categorical and numerical features

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), categorical_features),
    ('num', StandardScaler(), numerical_features)
])

In [135]:
# Create a pipeline with preprocessor and RandomForestClassifier

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [136]:
# Spliting the data and training the model

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

In [137]:
# Prediction and evaluation
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.75

**Hyper Parameter Tuning using Optuna**

In [138]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 150, 500, step=10)  # Number of trees in the forest
    max_depth = trial.suggest_int('max_depth', 5, 10)  # Maximum depth of the trees
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)  # Minimum number of samples required to split an internal node
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)  # Minimum number of samples required to be at a leaf node
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2']) # Number of features to consider when looking for the best split

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1 # Use all available CPU cores for training
    )

    pipeline_optuna = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    score = cross_val_score(pipeline_optuna, X_train, y_train, cv=5, scoring='accuracy').mean()
    return score

In [139]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)


[I 2025-12-05 18:33:33,981] A new study created in memory with name: no-name-21390e4b-b781-42a5-910d-742549deb45c
Best trial: 0. Best value: 0.725:   2%|▏         | 1/50 [00:04<03:57,  4.85s/it]

[I 2025-12-05 18:33:38,852] Trial 0 finished with value: 0.725 and parameters: {'n_estimators': 480, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.725.


Best trial: 1. Best value: 0.7625:   4%|▍         | 2/50 [00:09<03:45,  4.69s/it]

[I 2025-12-05 18:33:43,434] Trial 1 finished with value: 0.7625 and parameters: {'n_estimators': 470, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:   6%|▌         | 3/50 [00:13<03:21,  4.29s/it]

[I 2025-12-05 18:33:47,254] Trial 2 finished with value: 0.7125 and parameters: {'n_estimators': 380, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:   8%|▊         | 4/50 [00:16<02:57,  3.85s/it]

[I 2025-12-05 18:33:50,426] Trial 3 finished with value: 0.6875 and parameters: {'n_estimators': 310, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:  10%|█         | 5/50 [00:20<03:01,  4.03s/it]

[I 2025-12-05 18:33:54,765] Trial 4 finished with value: 0.7375 and parameters: {'n_estimators': 430, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:  12%|█▏        | 6/50 [00:23<02:39,  3.62s/it]

[I 2025-12-05 18:33:57,600] Trial 5 finished with value: 0.7375 and parameters: {'n_estimators': 280, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:  14%|█▍        | 7/50 [00:25<02:17,  3.21s/it]

[I 2025-12-05 18:33:59,955] Trial 6 finished with value: 0.7 and parameters: {'n_estimators': 230, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:  16%|█▌        | 8/50 [00:29<02:13,  3.17s/it]

[I 2025-12-05 18:34:03,036] Trial 7 finished with value: 0.75 and parameters: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:  18%|█▊        | 9/50 [00:33<02:20,  3.42s/it]

[I 2025-12-05 18:34:07,022] Trial 8 finished with value: 0.725 and parameters: {'n_estimators': 390, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:  20%|██        | 10/50 [00:35<02:08,  3.22s/it]

[I 2025-12-05 18:34:09,772] Trial 9 finished with value: 0.75 and parameters: {'n_estimators': 270, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:  22%|██▏       | 11/50 [00:40<02:24,  3.71s/it]

[I 2025-12-05 18:34:14,589] Trial 10 finished with value: 0.7625 and parameters: {'n_estimators': 500, 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 1 with value: 0.7625.


Best trial: 1. Best value: 0.7625:  24%|██▍       | 12/50 [00:45<02:39,  4.20s/it]

[I 2025-12-05 18:34:19,904] Trial 11 finished with value: 0.7625 and parameters: {'n_estimators': 490, 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 1 with value: 0.7625.


Best trial: 12. Best value: 0.775:  26%|██▌       | 13/50 [00:50<02:36,  4.22s/it]

[I 2025-12-05 18:34:24,176] Trial 12 finished with value: 0.775 and parameters: {'n_estimators': 440, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 12 with value: 0.775.


Best trial: 12. Best value: 0.775:  28%|██▊       | 14/50 [00:54<02:28,  4.14s/it]

[I 2025-12-05 18:34:28,127] Trial 13 finished with value: 0.7625 and parameters: {'n_estimators': 420, 'max_depth': 5, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 12 with value: 0.775.


Best trial: 12. Best value: 0.775:  30%|███       | 15/50 [00:56<02:01,  3.48s/it]

[I 2025-12-05 18:34:30,086] Trial 14 finished with value: 0.75 and parameters: {'n_estimators': 150, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 12 with value: 0.775.


Best trial: 12. Best value: 0.775:  32%|███▏      | 16/50 [01:00<02:06,  3.73s/it]

[I 2025-12-05 18:34:34,384] Trial 15 finished with value: 0.7375 and parameters: {'n_estimators': 360, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 12 with value: 0.775.


Best trial: 12. Best value: 0.775:  34%|███▍      | 17/50 [01:05<02:12,  4.01s/it]

[I 2025-12-05 18:34:39,047] Trial 16 finished with value: 0.7625 and parameters: {'n_estimators': 440, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 12 with value: 0.775.


Best trial: 17. Best value: 0.7875:  36%|███▌      | 18/50 [01:09<02:16,  4.28s/it]

[I 2025-12-05 18:34:43,950] Trial 17 finished with value: 0.7875 and parameters: {'n_estimators': 450, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  38%|███▊      | 19/50 [01:13<02:05,  4.06s/it]

[I 2025-12-05 18:34:47,500] Trial 18 finished with value: 0.7375 and parameters: {'n_estimators': 340, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  40%|████      | 20/50 [01:17<02:04,  4.16s/it]

[I 2025-12-05 18:34:51,891] Trial 19 finished with value: 0.7375 and parameters: {'n_estimators': 420, 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  42%|████▏     | 21/50 [01:21<01:59,  4.13s/it]

[I 2025-12-05 18:34:55,946] Trial 20 finished with value: 0.7625 and parameters: {'n_estimators': 400, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  44%|████▍     | 22/50 [01:26<02:00,  4.31s/it]

[I 2025-12-05 18:35:00,668] Trial 21 finished with value: 0.7625 and parameters: {'n_estimators': 460, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  46%|████▌     | 23/50 [01:31<01:59,  4.43s/it]

[I 2025-12-05 18:35:05,392] Trial 22 finished with value: 0.775 and parameters: {'n_estimators': 460, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  48%|████▊     | 24/50 [01:36<01:57,  4.52s/it]

[I 2025-12-05 18:35:10,132] Trial 23 finished with value: 0.75 and parameters: {'n_estimators': 450, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  50%|█████     | 25/50 [01:41<01:57,  4.72s/it]

[I 2025-12-05 18:35:15,300] Trial 24 finished with value: 0.75 and parameters: {'n_estimators': 500, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  52%|█████▏    | 26/50 [01:45<01:46,  4.43s/it]

[I 2025-12-05 18:35:19,073] Trial 25 finished with value: 0.775 and parameters: {'n_estimators': 360, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  54%|█████▍    | 27/50 [01:49<01:40,  4.36s/it]

[I 2025-12-05 18:35:23,273] Trial 26 finished with value: 0.75 and parameters: {'n_estimators': 400, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  56%|█████▌    | 28/50 [01:54<01:39,  4.50s/it]

[I 2025-12-05 18:35:28,096] Trial 27 finished with value: 0.7625 and parameters: {'n_estimators': 460, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  58%|█████▊    | 29/50 [01:58<01:34,  4.48s/it]

[I 2025-12-05 18:35:32,523] Trial 28 finished with value: 0.7625 and parameters: {'n_estimators': 430, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  60%|██████    | 30/50 [02:03<01:32,  4.63s/it]

[I 2025-12-05 18:35:37,507] Trial 29 finished with value: 0.75 and parameters: {'n_estimators': 480, 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  62%|██████▏   | 31/50 [02:06<01:21,  4.28s/it]

[I 2025-12-05 18:35:40,970] Trial 30 finished with value: 0.7625 and parameters: {'n_estimators': 320, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  64%|██████▍   | 32/50 [02:10<01:14,  4.12s/it]

[I 2025-12-05 18:35:44,720] Trial 31 finished with value: 0.775 and parameters: {'n_estimators': 360, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  66%|██████▌   | 33/50 [02:15<01:13,  4.32s/it]

[I 2025-12-05 18:35:49,488] Trial 32 finished with value: 0.775 and parameters: {'n_estimators': 470, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  68%|██████▊   | 34/50 [02:19<01:06,  4.14s/it]

[I 2025-12-05 18:35:53,217] Trial 33 finished with value: 0.75 and parameters: {'n_estimators': 360, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  70%|███████   | 35/50 [02:23<01:02,  4.16s/it]

[I 2025-12-05 18:35:57,444] Trial 34 finished with value: 0.7625 and parameters: {'n_estimators': 410, 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  72%|███████▏  | 36/50 [02:28<01:00,  4.33s/it]

[I 2025-12-05 18:36:02,152] Trial 35 finished with value: 0.75 and parameters: {'n_estimators': 450, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  74%|███████▍  | 37/50 [02:31<00:51,  3.99s/it]

[I 2025-12-05 18:36:05,368] Trial 36 finished with value: 0.775 and parameters: {'n_estimators': 300, 'max_depth': 9, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  76%|███████▌  | 38/50 [02:35<00:47,  3.99s/it]

[I 2025-12-05 18:36:09,337] Trial 37 finished with value: 0.7375 and parameters: {'n_estimators': 380, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  78%|███████▊  | 39/50 [02:40<00:46,  4.27s/it]

[I 2025-12-05 18:36:14,259] Trial 38 finished with value: 0.775 and parameters: {'n_estimators': 480, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  80%|████████  | 40/50 [02:43<00:40,  4.02s/it]

[I 2025-12-05 18:36:17,687] Trial 39 finished with value: 0.75 and parameters: {'n_estimators': 340, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  82%|████████▏ | 41/50 [02:48<00:37,  4.16s/it]

[I 2025-12-05 18:36:22,184] Trial 40 finished with value: 0.75 and parameters: {'n_estimators': 440, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  84%|████████▍ | 42/50 [02:51<00:32,  4.03s/it]

[I 2025-12-05 18:36:25,919] Trial 41 finished with value: 0.775 and parameters: {'n_estimators': 360, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  86%|████████▌ | 43/50 [02:55<00:27,  3.93s/it]

[I 2025-12-05 18:36:29,600] Trial 42 finished with value: 0.775 and parameters: {'n_estimators': 380, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  88%|████████▊ | 44/50 [02:59<00:23,  3.84s/it]

[I 2025-12-05 18:36:33,247] Trial 43 finished with value: 0.7625 and parameters: {'n_estimators': 340, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  90%|█████████ | 45/50 [03:02<00:17,  3.56s/it]

[I 2025-12-05 18:36:36,154] Trial 44 finished with value: 0.75 and parameters: {'n_estimators': 270, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  92%|█████████▏| 46/50 [03:04<00:12,  3.19s/it]

[I 2025-12-05 18:36:38,471] Trial 45 finished with value: 0.7625 and parameters: {'n_estimators': 230, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  94%|█████████▍| 47/50 [03:08<00:09,  3.32s/it]

[I 2025-12-05 18:36:42,104] Trial 46 finished with value: 0.7625 and parameters: {'n_estimators': 370, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  96%|█████████▌| 48/50 [03:10<00:06,  3.18s/it]

[I 2025-12-05 18:36:44,957] Trial 47 finished with value: 0.725 and parameters: {'n_estimators': 290, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875:  98%|█████████▊| 49/50 [03:14<00:03,  3.44s/it]

[I 2025-12-05 18:36:48,995] Trial 48 finished with value: 0.7625 and parameters: {'n_estimators': 410, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 17 with value: 0.7875.


Best trial: 17. Best value: 0.7875: 100%|██████████| 50/50 [03:19<00:00,  3.98s/it]

[I 2025-12-05 18:36:53,142] Trial 49 finished with value: 0.7375 and parameters: {'n_estimators': 430, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 17 with value: 0.7875.





In [140]:
print(study.best_params)
print(study.best_value)

{'n_estimators': 450, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2'}
0.7875


In [141]:
updated_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(**study.best_params, random_state=42))
])

In [142]:
updated_pipeline.fit(X_train, y_train)
y_pred = updated_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.65

In [143]:
import pickle 
pickle_model_path = 'insurance_model.pkl'
with open(pickle_model_path, 'wb') as f:
    pickle.dump(pipeline, f)

In [147]:
df_feat.head()

Unnamed: 0,age,income_lpa,BMI,smoker,occupation,Age Group,City Tier,Lifestyle Risk,insurance_premium_category
0,67,2.92,49.227482,False,retired,Senior Citizen,2,Low,High
1,36,34.28,30.189017,False,freelancer,Young Adult,1,Low,Low
2,39,36.64,21.118382,False,freelancer,Young Adult,2,Low,Low
3,22,3.34,45.5359,True,student,Young Adult,1,High,Medium
4,69,3.94,24.296875,True,retired,Senior Citizen,2,Low,High
