In [1]:
# ============================
# ML Pipeline with Feature Engineering and Model Save
# ============================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

# ----------------------------
# 1️⃣ Original DataFrame (from image)
# ----------------------------
data = {
    'age': [64, 51, 67, 60, 40],
    'weight': [59.8, 100.6, 114.5, 117.8, 70.0],
    'height': [1.63, 1.68, 1.74, 1.66, 1.59],
    'income_lpa': [3.87, 11.99, 0.61, 50.0, 28.16664],
    'smoker': [False, True, True, True, True],
    'city': ['Mumbai', 'Bangalore', 'Mumbai', 'Lucknow', 'Bangalore'],
    'occupation': ['retired', 'unemployed', 'retired', 'business_owner', 'government_job'],
    'insurance_premium_category': ['Medium', 'High', 'High', 'High', 'Low']
}

df = pd.DataFrame(data)
n_samples = 5000

# Randomly sample with replacement
df_expanded = df.sample(n=n_samples, replace=True, random_state=42).reset_index(drop=True)

# -----------------------------
# Add some small noise to numeric columns for realism
# -----------------------------
np.random.seed(42)
df_expanded['age'] += np.random.randint(-2, 3, size=n_samples)      # +/- 2 years
df_expanded['weight'] += np.random.uniform(-3, 3, size=n_samples)   # +/- 3 kg
df_expanded['height'] += np.random.uniform(-0.05, 0.05, size=n_samples) # +/- 5 cm
df_expanded['income_lpa'] += np.random.uniform(-2, 2, size=n_samples)  # +/- 2 LPA



In [2]:
# ----------------------------
# 2️⃣ Feature Engineering
# ----------------------------

# BMI = weight / (height^2)
df['bmi'] = df['weight'] / (df['height'] ** 2)

# Age group
def age_group(age):
    if age < 18 : 
        return 'child'
    elif age < 25:
        return 'young'
    elif age < 35:
        return 'middle_aged'
    else : 
        return 'senior'

df['age_group'] = df['age'].apply(age_group)

# Lifestyle risk based on BMI + smoker
def lifestyle_risk(row):
    if row['smoker'] and row['bmi'] > 30:
        return 'high'
    elif row['smoker'] or row['bmi'] > 27:
        return 'medium'
    else:
        return 'low'
df['lifestyle_risk'] = df.apply(lifestyle_risk, axis=1)

# City tier mapping
city_tier_map = {'Mumbai':2, 'Bangalore':2, 'Lucknow':1}
df['city_tier'] = df['city'].map(city_tier_map)

# Select features for ML
X = df[['bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'income_lpa', 'occupation']]
y = df['insurance_premium_category']

In [3]:


# ----------------------------
# 3️⃣ Preprocessing
# ----------------------------
numeric_features = ['bmi', 'income_lpa', 'city_tier']
categorical_features = ['age_group', 'lifestyle_risk', 'occupation']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False,handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


In [4]:

# ----------------------------
# 4️⃣ ML Pipeline
# ----------------------------
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# ----------------------------
# 5️⃣ Train/Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------------------------
# 6️⃣ Train Model
# ----------------------------
pipeline.fit(X_train, y_train)

# ----------------------------
# 7️⃣ Evaluate Model
# ----------------------------
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ----------------------------
# 8️⃣ Save Model using pickle
# ----------------------------
with open('model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

print("Model saved successfully as 'model.pkl'")


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

        High       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

Model saved successfully as 'model.pkl'


