In [84]:
# Importing necessary libraries

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report,accuracy_score

In [85]:
# Load the insurance dataset from CSV into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks environment-specific
# (presumably Colab) — consider a configurable data directory.
df = pd.read_csv('/content/insurance.csv')

In [86]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
40,44,57.0,1.53,40.19,True,Pune,unemployed,Medium
44,59,77.0,1.6,50.0,True,Lucknow,private_job,Medium
84,75,86.2,1.73,0.62,True,Jaipur,retired,High
85,33,51.4,1.86,34.66,False,Chennai,private_job,Low
20,34,58.2,1.85,30.65,True,Gaya,business_owner,Medium


In [87]:
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [88]:
df['smoker'].value_counts()

Unnamed: 0_level_0,count
smoker,Unnamed: 1_level_1
False,57
True,43


In [89]:
df['insurance_premium_category'].value_counts()

Unnamed: 0_level_0,count
insurance_premium_category,Unnamed: 1_level_1
Low,34
High,33
Medium,33


In [90]:
# Do the feature engineering on a copy so the loaded frame `df` is not modified.
df_feature = df.copy()

In [91]:
# Creating new features from existing features

# feature1: BMI (body mass index) = weight / height squared
# NOTE(review): sample rows show weights like 57.0 and heights like 1.53, so this
# is presumably kg / m^2 — confirm the units of the source columns.

df_feature['bmi'] = df_feature['weight'] / (df_feature['height']** 2)

In [92]:
# feature2: Age Group

def age_group(age):
  """Map a numeric age to its age-bucket label.

  Buckets are half-open on the upper bound:
    age < 25  -> 'young'
    age < 45  -> 'adult'
    age < 60  -> 'middled_aged'
    otherwise -> 'senior'

  Any value that is not below one of the bounds (including NaN, for which
  every comparison is False) falls through to 'senior'.
  """
  # (exclusive upper bound, label) pairs, checked in ascending order.
  brackets = (
    (25, 'young'),
    (45, 'adult'),
    (60, 'middled_aged'),
  )
  for upper_bound, label in brackets:
    if age < upper_bound:
      return label
  return 'senior'

# Label every row with its age bucket by applying age_group to each 'age' value.
df_feature['age_group'] = df_feature['age'].apply(age_group)

In [93]:
# feature3: Lifestyle Risk

def lifestyle_risk(row):
  """Classify one record's lifestyle risk from its 'smoker' and 'bmi' fields.

  `row` is anything indexable by those two keys (e.g. a DataFrame row).

  Returns:
    'high'   - smoker with bmi above 30
    'medium' - any other smoker, or a non-smoker with bmi above 27
    'low'    - non-smoker with bmi of 27 or less
  """
  smokes = row['smoker']
  body_mass = row['bmi']

  if smokes:
    # A smoker is never 'low'; the bmi only decides between high and medium.
    return 'high' if body_mass > 30 else 'medium'
  return 'medium' if body_mass > 27 else 'low'

In [94]:
# axis=1 hands each row to lifestyle_risk, which reads that row's 'smoker' and 'bmi'.
df_feature['lifestyle_risk'] = df_feature.apply(lifestyle_risk,axis=1)

In [95]:
# City lookup lists consumed by city_tier(): a city found in neither list is
# labelled 'tier_3'. Membership is an exact string match.
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [96]:
#  feature4: City Tier

def city_tier(city):
  """Return the tier label for a city name.

  The name is looked up (exact match) first in tier_1_cities, then in
  tier_2_cities; a city present in neither list is reported as 'tier_3'.
  """
  known_tiers = (
    ('tier_1', tier_1_cities),
    ('tier_2', tier_2_cities),
  )
  for tier_label, members in known_tiers:
    if city in members:
      return tier_label
  return 'tier_3'

In [97]:
# Replace the raw city name with its tier label, computed per value by city_tier.
df_feature['city_tier'] = df_feature['city'].apply(city_tier)

In [98]:
# Keep only the model inputs plus the target, in a fixed column order.
# Selecting the columns explicitly already leaves out the raw fields
# (age, weight, height, smoker, city), so no separate drop() is needed. This also
# keeps the cell safe to re-run: a drop() of columns that are already gone
# raises KeyError.
df_feature = df_feature[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']]
df_feature

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
0,2.92000,retired,49.227482,senior,medium,tier_2,High
1,34.28000,freelancer,30.189017,adult,medium,tier_1,Low
2,36.64000,freelancer,21.118382,adult,low,tier_2,Low
3,3.34000,student,45.535900,young,high,tier_1,Medium
4,3.94000,retired,24.296875,senior,medium,tier_2,High
...,...,...,...,...,...,...,...
95,19.64000,business_owner,21.420747,adult,low,tier_2,Low
96,34.01000,private_job,47.984483,adult,medium,tier_1,Low
97,44.86000,freelancer,18.765432,middled_aged,low,tier_1,Low
98,28.30000,business_owner,30.521676,adult,medium,tier_1,Low


In [99]:
df_feature[df_feature['bmi']>30]

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
0,2.92,retired,49.227482,senior,medium,tier_2,High
1,34.28,freelancer,30.189017,adult,medium,tier_1,Low
3,3.34,student,45.5359,young,high,tier_1,Medium
7,10.865821,government_job,33.360687,adult,high,tier_1,Medium
11,10.899387,government_job,31.722551,adult,high,tier_1,Low
12,17.58,freelancer,30.046711,adult,high,tier_2,High
14,13.505166,government_job,32.800735,middled_aged,medium,tier_3,Medium
17,2.23,retired,31.176471,senior,medium,tier_1,Medium
19,2.79,student,43.4375,young,high,tier_2,High
22,30.0,government_job,31.771627,middled_aged,medium,tier_2,Low


In [100]:
# Split the engineered frame into model inputs and target:
# x = every column except 'insurance_premium_category', y = that column.

x = df_feature.drop(columns=['insurance_premium_category'])
y = df_feature['insurance_premium_category']

In [101]:
x

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
0,2.92000,retired,49.227482,senior,medium,tier_2
1,34.28000,freelancer,30.189017,adult,medium,tier_1
2,36.64000,freelancer,21.118382,adult,low,tier_2
3,3.34000,student,45.535900,young,high,tier_1
4,3.94000,retired,24.296875,senior,medium,tier_2
...,...,...,...,...,...,...
95,19.64000,business_owner,21.420747,adult,low,tier_2
96,34.01000,private_job,47.984483,adult,medium,tier_1
97,44.86000,freelancer,18.765432,middled_aged,low,tier_1
98,28.30000,business_owner,30.521676,adult,medium,tier_1


In [102]:
x['lifestyle_risk'].value_counts()

Unnamed: 0_level_0,count
lifestyle_risk,Unnamed: 1_level_1
medium,54
low,26
high,20


In [103]:
y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Low
3,Medium
4,High
...,...
95,Low
96,Low
97,Low
98,Low


In [104]:
# Column groups handed to the ColumnTransformer below: the categorical columns
# are one-hot encoded, the numerical columns are passed through unchanged.

categorical_features = ['occupation', 'age_group', 'lifestyle_risk', 'city_tier']
numerical_features = ['income_lpa', 'bmi']

In [105]:
# Preprocessing step: one-hot encode the categorical columns and pass the
# numerical columns through unchanged.
#
# handle_unknown='ignore' makes the encoder output all zeros for a category it
# did not see during fit instead of raising at transform/predict time. With the
# default ('error'), any category missing from the training split would make
# predict() fail — both on the held-out rows and when the saved model is used on
# new data. For categories that were seen during fit the encoding is unchanged.

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])


In [106]:
# Chain the preprocessing step and a random forest classifier into one estimator,
# so the same column transformations run inside both fit() and predict().
# random_state=42 fixes the seed passed to the forest.

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [107]:
# Hold out 20% of the rows for evaluation (random_state=1 fixes the split seed),
# then fit the whole pipeline on the remaining 80%.
# NOTE(review): the split is not stratified; with only ~100 rows and three target
# classes, consider stratify=y — doing so would change the metrics recorded below.

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)
pipeline.fit(x_train,y_train)

In [108]:
# Predict the premium category for the held-out rows.
y_pred = pipeline.predict(x_test)

In [109]:
# Share of held-out rows whose predicted category matches the true one.
accuracy_score(y_test,y_pred)

0.85

In [110]:
x_test.sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
56,2.86,student,42.414152,young,high,tier_1
32,50.0,private_job,31.495845,middled_aged,medium,tier_2
51,28.95,private_job,38.827923,middled_aged,high,tier_2
69,6.034487,government_job,21.942857,middled_aged,low,tier_2
81,22.19,freelancer,31.866055,adult,high,tier_2


In [111]:
import pickle

In [112]:
# Persist the fitted pipeline (preprocessing + classifier) to model.pkl.
# NOTE: only unpickle this file from a trusted source — pickle.load can execute
# arbitrary code.

with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)