In [24]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [25]:
# Load the raw insurance records from the CSV file in the working directory.
df = pd.read_csv("insurance.csv")

In [26]:
# Eyeball five randomly drawn rows to sanity-check the load.
df.sample(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
81,41,82.6,1.61,22.19,True,Mysore,freelancer,High
17,65,90.1,1.7,2.23,False,Delhi,retired,Medium
48,36,94.8,1.66,32.69,True,Chennai,unemployed,Medium
60,41,101.3,1.81,49.94,True,Jalandhar,unemployed,High
26,33,79.0,1.61,23.61,False,Jaipur,freelancer,Medium


In [27]:
# Work on an independent deep copy so the engineered columns added below
# never modify the raw frame.
df_feat = df.copy(deep=True)

# Feature engineering

In [28]:
# feature 1: body-mass index = weight / height^2.
# Both operands are read from df_feat (not the raw df) so the division is
# self-contained and stays index-aligned even if df_feat is later filtered
# or re-indexed independently of df.
# NOTE(review): the sampled rows suggest weight is in kg and height in metres — confirm.
df_feat["BMI"] = df_feat["weight"] / (df_feat["height"] ** 2)

In [29]:
# feature 2:
def age_group(age):
  """Map a numeric age onto a coarse age-bracket label.

  Brackets (upper bounds are exclusive):
    age < 25  -> "young"
    age < 45  -> "adult"
    age < 60  -> "middle_aged"
    otherwise -> "senior"
  """
  brackets = ((25, "young"), (45, "adult"), (60, "middle_aged"))
  for upper_bound, label in brackets:
    if age < upper_bound:
      return label
  # Nothing matched, so the age is at or above the highest threshold.
  return "senior"

In [30]:
# Label every row with its age bracket (element-wise over the age column).
df_feat["age_group"] = df_feat["age"].map(age_group)

# Creating the lifestyle risk feature:

In [31]:
# feature 3: Lifestyle risk
def lifestyle_risk(row):
  """Classify one record's lifestyle risk from its smoking flag and BMI.

  `row` is any mapping-like object exposing "smoker" and "BMI" keys.
  Returns "high" for smokers with BMI > 30, "medium" for smokers with
  BMI > 27, and "low" for everyone else (including all non-smokers).
  """
  # Non-smokers are always "low"; BMI is not consulted for them.
  if not row["smoker"]:
    return "low"
  bmi = row["BMI"]
  if bmi > 30:
    return "high"
  if bmi > 27:
    return "medium"
  return "low"

In [32]:
# Evaluate the risk rule once per row; each row is handed to the function as a Series.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis="columns")

In [33]:
# City-name lookup lists used to assign a city tier.
# Membership is tested by exact string match, so spelling and capitalisation
# must match the values in the dataset's "city" column.
# Any city that appears in neither list is treated as tier 3.
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [34]:
# Feature 4: City Tier
def city_tier(city):
    """Return the tier number (1, 2 or 3) for a city name.

    The name is looked up in the module-level `tier_1_cities` and
    `tier_2_cities` lists, in that order; a city found in neither is tier 3.
    """
    for tier, members in ((1, tier_1_cities), (2, tier_2_cities)):
        if city in members:
            return tier
    return 3


In [35]:
# Translate each city name into its tier number (element-wise over the city column).
df_feat["city_tier"] = df_feat["city"].map(city_tier)

In [36]:
# Preview five random rows of the engineered feature set plus the target.
# Selecting the wanted columns directly already leaves out the raw inputs
# (age, weight, height, smoker, city), so no separate drop step is needed.
df_feat[
    [
        "income_lpa",
        "occupation",
        "BMI",
        "age_group",
        "lifestyle_risk",
        "city_tier",
        "insurance_premium_category",
    ]
].sample(5)

Unnamed: 0,income_lpa,occupation,BMI,age_group,lifestyle_risk,city_tier,insurance_premium_category
10,32.78,business_owner,22.949982,adult,low,1,Medium
43,1.56,retired,29.308163,senior,low,1,Medium
19,2.79,student,43.4375,young,high,2,High
55,24.93,unemployed,25.293194,middle_aged,low,1,Low
31,11.77,private_job,15.258742,adult,low,2,Medium


In [37]:
# Split the engineered frame into the model inputs (X) and the label to predict (y).
X = df_feat.loc[
    :,
    [
        "BMI",
        "age_group",
        "lifestyle_risk",
        "city_tier",
        "income_lpa",
        "occupation",
    ],
]
y = df_feat.loc[:, "insurance_premium_category"]

In [38]:
# Display the feature matrix selected above as the cell's output.
X

Unnamed: 0,BMI,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,low,2,2.92000,retired
1,30.189017,adult,low,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,low,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,low,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,low,1,28.30000,business_owner


In [39]:
# Display the target column selected above as the cell's output.
y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Low
3,Medium
4,High
...,...
95,Low
96,Low
97,Low
98,Low


In [40]:
# Column groups consumed by the preprocessing step:
#   - categorical_features are one-hot encoded (city_tier is an integer code,
#     but it is handled as a category here rather than as a magnitude);
#   - numeric_features are handed to the model unchanged.
categorical_features = [
    "age_group",
    "lifestyle_risk",
    "city_tier",
    "occupation",
]
numeric_features = [
    "BMI",
    "income_lpa",
]

In [41]:
# Column transformer: one-hot encode the categorical columns and pass the
# numeric columns through untouched.
# handle_unknown="ignore" makes a category that was never seen during fit
# (e.g. a new occupation at inference time) encode as an all-zero block
# instead of raising, so the saved pipeline stays usable on new data.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [42]:
# Chain the preprocessing step and a random forest classifier into a single
# estimator; the fixed random_state keeps the forest reproducible across runs.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [43]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split),
# then fit the whole pipeline on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
pipeline.fit(X_train, y_train)

In [44]:
# Predict on the held-out rows and evaluate: fraction of exact label matches.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.85

# Exporting the trained model as a pickle file:


In [45]:
# Save the fitted pipeline (preprocessing + classifier) to disk so it can be
# reloaded for inference without retraining. `pickle` is imported with the
# other dependencies in the notebook's import cell.
# NOTE: only unpickle this file from a trusted source — pickle.load can
# execute arbitrary code.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as model_file:
  pickle.dump(pipeline, model_file)