In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


#### here feature engineering is done and some columns are transformed and new columns are made for building a better model

##### like age_group is made from age, city_tier is made from city, bmi is made from height and weight, and lifestyle risk is made from bmi and smoker columns

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

In [4]:
# making a copy of dataframe so that changes made won't be reflected in actual df
df_feature = df.copy()

Calculating BMI

In [5]:
df_feature["bmi"] = round(df_feature["weight"] / (df_feature["height"] ** 2), 2)

Calculating age group from age column

In [6]:
def age_group(age):
    if age < 25:
        return "Young"
    elif age < 45:
        return "Adult"
    elif age < 60:
        return "Middle Age"
    return "senior"

In [7]:
# Applies the age_group() function to each value in the 'age' column and assigns the results to a new 'age_group' column
df_feature["age_group"] = df_feature["age"].apply(age_group)

In [8]:
df_feature.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.23,senior
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.19,Adult
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.12,Adult
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.54,Young
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.3,senior


In [9]:
df_feature.columns

Index(['age', 'weight', 'height', 'income_lpa', 'smoker', 'city', 'occupation',
       'insurance_premium_category', 'bmi', 'age_group'],
      dtype='object')

In [10]:
df_feature.drop(columns="age")

Unnamed: 0,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group
0,119.8,1.56,2.92000,False,Jaipur,retired,High,49.23,senior
1,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.19,Adult
2,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.12,Adult
3,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.54,Young
4,62.2,1.60,3.94000,True,Indore,retired,High,24.30,senior
...,...,...,...,...,...,...,...,...,...
95,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.42,Adult
96,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.98,Adult
97,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.77,Middle Age
98,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.52,Adult


### Creating lifestyle risk column from bmi and smoker columns

When you use .apply() on a DataFrame with axis=1, each row passed into the function is a pandas Series representing one row of the DataFrame. This row behaves like a dictionary where keys are column names (strings).

So, row["smoker"] accesses the value in the "smoker" column for that particular row. If "smoker" column contains boolean values (True or False), 

then: row["smoker"] will be True or False depending on that person’s smoking status.

In [11]:
# 'row' is a Series representing a single DataFrame row; row["smoker"] accesses the 'smoker' column value in that row
# Since 'smoker' contains True/False, row["smoker"] evaluates as a boolean in conditional checks
def lifestyle_risk(row):
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    elif row["smoker"] or row["bmi"] > 27:
        return "medium"
    else:
        return "low"

1. For age_group you had:

df_feature["age_group"] = df_feature["age"].apply(age_group)
Here, .apply() is called on a single column (df_feature["age"]), which is a Series. The age_group function takes a single value (each age) as input.
So, apply() applies the function element-wise to each value in the "age" column.

2. For lifestyle_risk you have:

df_feature["lifestyle_risk"] = df_feature.apply(lifestyle_risk, axis=1)
Here, .apply() is called on the whole DataFrame (df_feature). axis=1 means the function is applied row-wise. The lifestyle_risk function expects an entire row (a Series representing that row), so it can access multiple columns (like "smoker" and "bmi"). So, apply() passes each row to lifestyle_risk.

In [12]:
df_feature["lifestyle_risk"] = df_feature.apply(lifestyle_risk, axis=1)

Creating city tier from given column of city

In [13]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [14]:
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3

In [15]:
df_feature["city_tier"] = df_feature["city"].apply(city_tier)

In [16]:
df_feature.columns

Index(['age', 'weight', 'height', 'income_lpa', 'smoker', 'city', 'occupation',
       'insurance_premium_category', 'bmi', 'age_group', 'lifestyle_risk',
       'city_tier'],
      dtype='object')

In [17]:
df_feature.drop(columns=["weight", "height", "city", "smoker"], axis=1, inplace=True)

In [18]:
df_feature.drop(columns="age", axis=1, inplace=True)

In [19]:
df_feature.head()

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,2.92,retired,High,49.23,senior,medium,2
1,34.28,freelancer,Low,30.19,Adult,medium,1
2,36.64,freelancer,Low,21.12,Adult,low,2
3,3.34,student,Medium,45.54,Young,high,1
4,3.94,retired,High,24.3,senior,medium,2


Separating features and target

In [20]:
X = df_feature[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
Y = df_feature["insurance_premium_category"]

Separating categorical features and numerical features

In [21]:
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
numerical_features = ["bmi", "income_lpa"]

Creating column transformer for One Hot Encoding

In [22]:
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features), # one hot encoding is applied on all categorical features
    ("num", "passthrough", numerical_features)
])

Creating a pipeline with processing and Random Forest Classifier

In [23]:
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

split into train and test data

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
pipeline.fit(X_train, Y_train)

In [25]:
Y_pred = pipeline.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.85

In [26]:
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
51,38.83,Middle Age,high,2,28.95,private_job
84,28.8,senior,medium,2,0.62,retired
81,31.87,Adult,high,2,22.19,freelancer
10,22.95,Adult,medium,1,32.78,business_owner
32,31.5,Middle Age,medium,2,50.0,private_job


In [27]:
import pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)