In [16]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Load the raw insurance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('insurance.csv')

In [18]:
# Peek at five random rows to sanity-check the loaded columns.
# No random_state is passed, so the rows shown vary between runs.
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
118,66,104.1,1.78,2.6,False,Kathmandu,retired,High
101,27,56.1,1.58,10.2,False,Lalitpur,student,Low
41,47,85.5,1.75,20.0,False,Pokhara,private_job,Medium
82,22,58.5,1.55,4.3,False,Kathmandu,student,Medium
79,44,66.3,1.7,38.6,True,Pokhara,private_job,High


In [19]:
# List the distinct occupation values present in the data.
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'private_job', 'unemployed'], dtype=object)

In [20]:
# Work on a copy so the feature-engineering columns added below do not touch the raw frame.
df_feat = df.copy()

In [21]:
# Feature 1: BMI = weight / height squared.
# NOTE(review): assumes weight is in kg and height in metres (sample rows look like 104.1 / 1.78) — confirm.
df_feat["bmi"] = df_feat["weight"].div(df_feat["height"].pow(2))

In [22]:
# Feature 2: Age Group
def age_group(age):
    """Bucket an age into a coarse life-stage label.

    Args:
        age: Numeric age.

    Returns:
        "young" for ages below 25, "adult" for 25-44, "middle_aged" for
        45-59, and "senior" for everything else (any value that is not
        below one of the thresholds falls through to "senior").
    """
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = (
        (25, "young"),
        (45, "adult"),
        (60, "middle_aged"),
    )
    for limit, label in upper_bounds:
        if age < limit:
            return label
    return "senior"

In [23]:
# Derive the age bracket for every row from its age.
df_feat["age_group"] = df_feat["age"].map(age_group)

In [24]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    """Classify a record's lifestyle risk from smoking status and BMI.

    Args:
        row: Mapping-like record indexable by "smoker" and "bmi"
            (e.g. a DataFrame row).

    Returns:
        "high" for a smoker with BMI above 30; "medium" for any other
        smoker or for a non-smoker with BMI above 27; "low" otherwise.
    """
    body_mass_index = row["bmi"]
    if row["smoker"]:
        # Smokers are never "low"; a BMI above 30 escalates them to "high".
        return "high" if body_mass_index > 30 else "medium"
    # Non-smokers are capped at "medium", reached once BMI exceeds 27.
    return "medium" if body_mass_index > 27 else "low"

In [25]:
# Row-wise: the risk label needs both the smoker flag and the BMI of each record.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis="columns")

In [26]:
# City names grouped by tier. city_tier() tests membership in the tier 1 and
# tier 2 lists; matching is by exact string.

# Tier 1 cities.
tier_1_cities = [
    "Kathmandu", "Pokhara", "Lalitpur", "Bhaktapur", "Biratnagar", "Birgunj", "Bharatpur"
]

# Tier 2 cities.
tier_2_cities = [
    "Dharan", "Janakpur", "Butwal", "Hetauda", "Nepalgunj", "Dhangadhi",
    "Itahari", "Bhairahawa", "Tulsipur", "Gaur", "Inaruwa", "Rajbiraj", "Ilam",
    "Damak", "Bardibas", "Kalaiya", "Banepa", "Dipayal", "Lahan", "Bhadrapur",
    "Kakarbhitta", "Parasi", "Gulariya", "Kohalpur", "Kapilvastu", "Birtamode"
]

# Tier 3 cities. NOTE(review): city_tier() never reads this list — any city
# outside tiers 1 and 2 already maps to 3 — so it is reference data only.
tier_3_cities = [
    "Tikapur", "Baglung", "Tansen", "Gorkha", "Beni", "Khotang", "Jaleshwar",
    "Dhankuta", "Sindhuli", "Phidim", "Panauti", "Melamchi", "Khandbari",
    "Waling", "Amargadhi", "Resunga", "Besisahar", "Chainpur", "Bayalpata",
    "Okhaldhunga", "Sandhikharka", "Darchula", "Manma", "Salleri", "Damakha",
    "Rukumkot", "Jumla", "Martadi", "Triveni", "Putalibazar"
]


In [27]:
# Feature 4: City Tier
def city_tier(city):
    """Return the tier number (1, 2 or 3) for a city name.

    Args:
        city: City name, compared by exact match against the tier lists.

    Returns:
        1 if the city is in ``tier_1_cities``, 2 if it is in
        ``tier_2_cities``, and 3 for every other value (including names
        that appear in neither list).
    """
    if city in tier_1_cities:
        return 1
    # Anything not explicitly listed as tier 1 or tier 2 defaults to tier 3.
    return 2 if city in tier_2_cities else 3

In [28]:
# Translate each city name into its tier number.
df_feat["city_tier"] = df_feat["city"].map(city_tier)

In [29]:
# Preview five random rows of the model-ready columns (engineered features plus the target).
# The explicit column selection already leaves out the raw inputs
# (age, weight, height, smoker, city), so no separate drop is needed.
df_feat[
    ['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']
].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
2,36.64,freelancer,21.118382,adult,low,2,Low
117,11.8,student,21.953125,adult,medium,1,Low
62,6.5,student,23.994552,young,low,1,Low
71,29.3,business_owner,27.755102,middle_aged,medium,1,High
105,2.8,retired,32.363241,middle_aged,high,1,High


In [30]:
# Separate the label to predict (y) from the model inputs (X).
y = df_feat["insurance_premium_category"]
X = df_feat[
    [
        "bmi",
        "age_group",
        "lifestyle_risk",
        "city_tier",
        "income_lpa",
        "occupation",
    ]
]

In [31]:
# Display the feature matrix to confirm the selected columns.
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,1,2.92,retired
1,30.189017,adult,medium,1,34.28,freelancer
2,21.118382,adult,low,2,36.64,freelancer
3,45.535900,young,high,2,3.34,student
4,24.296875,senior,medium,1,3.94,retired
...,...,...,...,...,...,...
115,24.859074,adult,medium,1,29.50,private_job
116,27.854671,middle_aged,medium,2,40.00,business_owner
117,21.953125,adult,medium,1,11.80,student
118,32.855700,senior,medium,1,2.60,retired


In [32]:
# Display the target labels.
y

0        High
1         Low
2         Low
3      Medium
4        High
        ...  
115    Medium
116    Medium
117       Low
118      High
119    Medium
Name: insurance_premium_category, Length: 120, dtype: object

In [33]:
# Column groups for preprocessing: categorical columns get one-hot encoded,
# numeric columns are passed through unchanged.
# city_tier holds the integer codes 1/2/3 but is treated as a category here.
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
numeric_features = ["bmi", "income_lpa"]

In [34]:
# Column-wise preprocessing: one-hot encode the categorical columns and pass the
# numeric columns through unchanged.
# handle_unknown="ignore" makes the encoder emit all zeros for a category it did
# not see during fit instead of raising. With a small training split, a rare
# category can easily be missing from it, and the saved model must also cope
# with unseen values at inference time. Known categories are encoded as before.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [35]:
# Chain preprocessing and the classifier so that fit/predict always apply the
# same column transformations; the forest is seeded for repeatable results.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [36]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split),
# then fit the whole pipeline on the remaining rows.
# NOTE(review): the split is not stratified; with only 120 rows the class mix of
# the test set may differ from the training set — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
pipeline.fit(X_train, y_train)


In [37]:
# Score the held-out rows and report plain accuracy (fraction of exact label matches).
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7083333333333334

In [38]:
# Spot-check five random rows of the held-out features.
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
92,23.164062,adult,medium,2,29.0,freelancer
99,22.206331,adult,medium,1,15.3,student
48,23.425606,adult,low,2,33.4,freelancer
103,24.978741,adult,low,1,35.0,business_owner
83,26.254252,adult,low,1,30.2,unemployed


In [39]:
# Persist the fitted pipeline (preprocessing + classifier) so it can be reloaded
# for inference without retraining. `pickle` is imported in the notebook's
# top import cell.
# Only unpickle this file from a trusted source: pickle.load can execute arbitrary code.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)
