In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Load the raw insurance records into a DataFrame.
# NOTE(review): absolute path (looks like a Colab `/content` upload) — adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/insurance.csv')

In [None]:
# Peek at five randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
32,47,113.7,1.9,50.0,False,Jalandhar,private_job,Medium
60,41,101.3,1.81,49.94,True,Jalandhar,unemployed,High
22,57,106.4,1.83,30.0,False,Chandigarh,government_job,Low
56,24,101.9,1.55,2.86,True,Kolkata,student,Medium
94,50,105.4,1.78,10.542289,False,Bangalore,government_job,Low


In [None]:
# Work on an independent deep copy so the engineered columns added to
# df_fead never touch the raw frame `df`.
df_fead = df.copy(deep=True)
# Show the first five rows of the working copy.
df_fead.head(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [None]:
# List the distinct occupation labels present in the data.
# NOTE(review): the saved output of this cell is a NameError for `df_fead`,
# i.e. it was last executed on a kernel where the copy cell had not run yet —
# re-run the notebook top to bottom to refresh it.
df_fead.loc[:, 'occupation'].unique()

NameError: name 'df_fead' is not defined

In [None]:
# Feature 1: body-mass index = weight divided by height squared.
# NOTE(review): assumes weight is in kg and height in metres — confirm against the data source.
df_fead['bmi'] = df_fead['weight'] / df_fead['height'] ** 2

In [None]:
# Feature 2: age group
def age_group(age):
  """Bucket a numeric age into a coarse life-stage label.

  Returns "young" for ages below 25, "adult" for ages below 45,
  "middle_aged" for ages below 60, and "senior" for everything else.
  """
  # Exclusive upper bounds, checked in ascending order; the first match wins.
  brackets = ((25, "young"), (45, "adult"), (60, "middle_aged"))
  for upper_bound, label in brackets:
    if age < upper_bound:
      return label
  return "senior"


In [None]:
# Label every row with its age bracket by running age_group on each age value.
df_fead['age_group'] = df_fead['age'].map(age_group)

In [None]:
# Feature 3: lifestyle risk
def lifestyle_risk(row):
  """Grade lifestyle risk from a row's smoking flag and BMI.

  Reads row['smoker'] and row['bmi']. Smokers with a BMI above 30 are
  'high' risk, smokers with a BMI above 27 are 'medium', and every other
  row (including all non-smokers) is "low".
  """
  # Non-smokers are always rated low, whatever their BMI.
  if not row['smoker']:
    return "low"
  bmi = row['bmi']
  if bmi > 30:
    return 'high'
  if bmi > 27:
    return 'medium'
  return "low"


In [None]:
# Evaluate the risk rule once per row (axis='columns' is the named form of axis=1).
df_fead['lifestyle_risk'] = df_fead.apply(lifestyle_risk, axis='columns')

In [None]:
# List the distinct city names found in the data.
df_fead.loc[:, 'city'].unique()

array(['Jaipur', 'Chennai', 'Indore', 'Mumbai', 'Kota', 'Hyderabad',
       'Delhi', 'Chandigarh', 'Pune', 'Kolkata', 'Lucknow', 'Gaya',
       'Jalandhar', 'Mysore', 'Bangalore'], dtype=object)

In [None]:
# Cities that city_tier() reports as tier 1.
tier_1_cities = [
    "Mumbai",
    "Delhi",
    "Bangalore",
    "Chennai",
    "Kolkata",
    "Hyderabad",
    "Pune",
]

# Cities that city_tier() reports as tier 2; any city in neither list is tier 3.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi",
    "Visakhapatnam", "Coimbatore", "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi", "Agra",
    "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum",
    "Salem", "Vijayawada", "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal", "Kolhapur",
    "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri",
]

In [None]:
# Feature 4: city tier
def city_tier(city):
  """Map a city name to its tier number.

  Returns 1 when the name is in tier_1_cities, 2 when it is in
  tier_2_cities, and 3 for any other value. Matching is an exact,
  case-sensitive membership test against those two lists.
  """
  if city in tier_1_cities:
    return 1
  return 2 if city in tier_2_cities else 3


In [None]:
# Translate each city name into its tier number (1, 2 or 3).
df_fead['city_tier'] = df_fead['city'].map(city_tier)

In [None]:
# Preview five random rows of the engineered feature set next to the target.
# Selecting the wanted columns already leaves out the raw inputs
# (age, weight, height, smoker, city), so no separate drop() is needed.
df_fead[[
    'income_lpa', 'occupation', 'bmi', 'age_group',
    'lifestyle_risk', 'city_tier', 'insurance_premium_category',
]].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
52,2.96,student,47.34472,young,low,2,Medium
90,21.07,business_owner,21.09375,middle_aged,low,1,Low
51,28.95,private_job,38.827923,middle_aged,high,2,High
81,22.19,freelancer,31.866055,adult,high,2,High
26,23.61,freelancer,30.477219,adult,low,2,Medium


In [None]:
# Model inputs: the engineered features plus income and occupation.
X = df_fead.loc[:, ['bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'income_lpa', 'occupation']]
# Target: the insurance premium category label.
y = df_fead.loc[:, 'insurance_premium_category']

In [None]:
# Columns routed to the one-hot encoder (city_tier holds integer codes but
# is listed here, so it is encoded as a category as well).
categorical_cols = [
    'age_group',
    'lifestyle_risk',
    'occupation',
    'city_tier',
]
# Columns handed to the model unchanged.
numerical_cols = [
    'bmi',
    'income_lpa',
]

In [None]:
# Column-wise preprocessing: numeric features pass through untouched and
# categorical features are one-hot encoded.
# handle_unknown='ignore' encodes a category that was absent from the rows
# used for fitting as all zeros instead of raising at predict time. That can
# happen with a random train/test split on a small dataset, and with new
# inputs once the fitted pipeline is reused.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [None]:
# Chain preprocessing and the classifier so both are fitted and applied as one object.
# The fixed random_state makes the forest reproducible between runs.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)
# Fit the encoder and the forest on the training portion only.
pipeline.fit(x_train, y_train)

In [None]:
# Predict the held-out rows, then report the fraction classified correctly.
y_pred = pipeline.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.85

In [None]:
# Peek at five randomly chosen rows of the held-out feature set.
x_test.sample(n=5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
31,15.258742,adult,low,2,11.77,private_job
92,18.319942,adult,low,2,30.0,government_job
32,31.495845,middle_aged,low,2,50.0,private_job
39,35.643424,middle_aged,high,1,11.99,unemployed
78,27.932798,middle_aged,medium,2,14.74,freelancer



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
# Persist the fitted pipeline (preprocessing + classifier) so it can be reused
# without retraining. `pickle` is imported in the notebook's top import cell.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
pickle_model_path = "model.pkl"
with open(pickle_model_path, 'wb') as model_file:
    pickle.dump(pipeline, model_file)