In [None]:
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the raw insurance records.
# NOTE(review): absolute, Colab-style path — adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/insurance.csv")

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [None]:
# Work on an independent copy so the raw frame stays untouched by feature engineering.
df_feat = df.copy(deep=True)

In [None]:
# Derived feature: bmi = weight / height squared, computed column-wise.
df_feat["bmi"] = df_feat["weight"].div(df_feat["height"].pow(2))

In [None]:
def age_group(age):
  """Map a numeric age to a coarse age-bracket label.

  Brackets (upper bounds are exclusive):
    below 25 -> "young", below 45 -> "adult", below 65 -> "middle_aged",
    anything else -> "senior".
  """
  brackets = ((25, "young"), (45, "adult"), (65, "middle_aged"))
  for upper_bound, label in brackets:
    if age < upper_bound:
      return label
  # No bracket matched, so the age is at or beyond the last bound.
  return "senior"


In [None]:
# Label every record with its age bracket, element by element.
df_feat["age_group"] = df_feat["age"].map(age_group)

In [None]:
def lifestyle_risk(row, high_bmi=30, medium_bmi=27):
  """Classify a record's lifestyle risk from smoking status and BMI.

  Args:
    row: Mapping-like record (for example a DataFrame row) exposing
      "smoker" and "bmi" entries.
    high_bmi: A smoker whose BMI is strictly above this value is "high".
    medium_bmi: A smoker whose BMI is strictly above this value (but not
      above ``high_bmi``) is "medium".

  Returns:
    "high", "medium" or "low". Non-smokers are always "low".
  """
  # NOTE(review): the first branch originally compared against 20, which
  # shadowed the `> 27` branch so "medium" could never be returned. 30 is
  # assumed to be the intended cut-off — confirm with the data owner.
  if not row["smoker"]:
    return "low"
  bmi = row["bmi"]
  if bmi > high_bmi:
    return "high"
  if bmi > medium_bmi:
    return "medium"
  return "low"

In [None]:
# Row-wise pass: each record is handed to lifestyle_risk as a whole row.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis="columns")

In [None]:
# City names used to bucket records into tiers. Membership is checked with an
# exact, case-sensitive match, so a misspelled or differently-cased entry never
# matches the correctly spelled city.
# NOTE(review): the original entries are kept as-is for backward compatibility;
# the commonly used spellings are appended on the assumption that the dataset
# uses them — confirm against df["city"].unique().
tier_1_cities = [
    "Mumbai", "Delhi", "Banglore", "Chennai", "Kolkata", "Hyderabad", "Pune",
    # Standard spellings of "Banglore".
    "Bangalore", "Bengaluru",
]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Vishakhapatnam", "Sasaram", "Dehri",
    "Bhopal", "Nagpur", "Vadodra", "Surat", "Rajkot", "Jodhpur", "Varanasi", "Raipur", "Amritsar", "Agra",
    "Dehradun", "Mysore", "jabalpur", "Guwahati", "Thiruvanathapuram", "Ludhiana", "Nasik", "Allahabad", "Udaipur",
    "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijaywada", "Tiruchirappalli", "Bhavnagar", "Gawalior", "Dhanbad", "Barailey",
    "Aligarh", "Gaya", "Kozikhodo", "Warangal", "Kolhapur", "Belaspur", "Jalandhar", "Guntur", "Asansol", "Siliguri",
    # Standard spellings / capitalisation of the entries above.
    "Visakhapatnam", "Vadodara", "Jabalpur", "Thiruvananthapuram", "Nashik",
    "Vijayawada", "Gwalior", "Bareilly", "Kozhikode", "Bilaspur",
]

In [None]:
def city_tier(city):
  """Return the tier number for ``city``.

  1 if the name is listed in ``tier_1_cities``, 2 if it is listed in
  ``tier_2_cities``, and 3 for every other value. The lookup is an exact
  membership test against those module-level lists.
  """
  for tier, members in ((1, tier_1_cities), (2, tier_2_cities)):
    if city in members:
      return tier
  return 3

In [None]:
# Translate each city name into its tier number.
df_feat["city_tier"] = df_feat["city"].map(city_tier)

In [None]:
# Display the engineered frame: drop the raw inputs, then show the modelling
# columns in a fixed order. Nothing is assigned, so df_feat itself is unchanged.
(
    df_feat
    .drop(columns=["age", "weight", "height", "smoker", "city"])
    .loc[:, [
        "income_lpa",
        "occupation",
        "bmi",
        "age_group",
        "lifestyle_risk",
        "city_tier",
        "insurance_premium_category",
    ]]
)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
0,2.92000,retired,49.227482,senior,low,2,High
1,34.28000,freelancer,30.189017,adult,low,1,Low
2,36.64000,freelancer,21.118382,adult,low,2,Low
3,3.34000,student,45.535900,young,high,1,Medium
4,3.94000,retired,24.296875,senior,high,2,High
...,...,...,...,...,...,...,...
95,19.64000,business_owner,21.420747,adult,low,2,Low
96,34.01000,private_job,47.984483,adult,low,1,Low
97,44.86000,freelancer,18.765432,middle_aged,low,1,Low
98,28.30000,business_owner,30.521676,adult,low,1,Low


In [None]:
# Feature matrix (six engineered/raw columns) and the target label column.
X = df_feat.loc[:, [
    "bmi",
    "age_group",
    "lifestyle_risk",
    "city_tier",
    "income_lpa",
    "occupation",
]]
y = df_feat.loc[:, "insurance_premium_category"]

In [None]:
# Columns that get one-hot encoded. city_tier is numeric but is treated as a
# category here.
categorical_features = [
    "age_group",
    "lifestyle_risk",
    "occupation",
    "city_tier",
]
# Columns forwarded to the model without transformation.
numerical_features = [
    "bmi",
    "income_lpa",
]

In [None]:
# One-hot encode the categorical columns and pass the numeric columns through
# unchanged.
preprocessor = ColumnTransformer(transformers=[
    # handle_unknown="ignore": a category that was not present when the encoder
    # was fitted is encoded as all zeros instead of raising at transform time.
    # With the default ("error"), a rare value that lands only in the test
    # split, or arrives later at inference, would make predict() fail.
    ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ('num', 'passthrough', numerical_features)
])



In [None]:

# Chain preprocessing and the classifier so both are fitted and applied together.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train, y=y_train)

In [None]:
# Predict on the held-out rows and report the fraction classified correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7

In [None]:
# Show five held-out rows. The seed makes the same rows appear on every
# Restart & Run All instead of a fresh random draw each time.
X_test.sample(n=5, random_state=42)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
73,32.121628,senior,high,1,2.22,retired
76,44.044444,middle_aged,low,2,1.12,retired
22,31.771627,middle_aged,low,2,30.0,government_job
4,24.296875,senior,high,2,3.94,retired
31,15.258742,adult,low,2,11.77,private_job


In [None]:
# Persist the pipeline object (preprocessing + classifier) to disk.
# `pickle` is imported in the notebook's top import cell.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as file:
    pickle.dump(pipeline, file)

In [None]:
# List the distinct occupation values present in the data.
df_feat.loc[:, "occupation"].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)