Load data

In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv("/content/insurance_premium_dataset_50000_records.csv")
print(df.shape)


(50000, 33)


Separate feature groups (for interpretability)

In [24]:
target = "insurance_premium"

geo_features = [
    "latitude","longitude","distance_from_coast_km","elevation_meters"
]

environmental_features = [
    "avg_aqi","water_quality_index","climate_change_risk_score",
    "heatwave_days_per_year","natural_disaster_risk",
    "urban_flood_risk","terrain_type"
]

infrastructure_features = [
    "road_condition_index","drainage_quality_index",
    "industrial_exposure_index","traffic_density_index",
    "green_cover_percent"
]

health_features = [
    "respiratory_disease_rate","waterborne_disease_rate",
    "vector_borne_disease_rate"
]

personal_features = [
    "age","gender","annual_income_lakh","occupation_risk_level",
    "smoker","alcohol_consumption","pre_existing_conditions",
    "physical_activity_level","family_medical_history",
    "health_checkup_frequency"
]

categorical_cols = [
    "city_name","locality_type","terrain_type",
    "natural_disaster_risk","urban_flood_risk",
    "gender","occupation_risk_level",
    "alcohol_consumption","physical_activity_level",
    "health_checkup_frequency"
]


Encode categorical variables (Label Encoding)

In [25]:
from sklearn.preprocessing import LabelEncoder

encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


Correlation with target (filter weak features)

In [26]:
corr = df.corr(numeric_only=True)[target].sort_values(ascending=False)
corr


Unnamed: 0,insurance_premium
insurance_premium,1.0
pre_existing_conditions,0.560214
smoker,0.441626
annual_income_lakh,0.346877
respiratory_disease_rate,0.298904
age,0.291864
industrial_exposure_index,0.285614
vector_borne_disease_rate,0.178085
avg_aqi,0.170943
traffic_density_index,0.130623


Drop extremely weak predictors

In [27]:
selected_features = corr[abs(corr) > 0.05].index.tolist()
selected_features.remove(target)

len(selected_features), selected_features


(11,
 ['pre_existing_conditions',
  'smoker',
  'annual_income_lakh',
  'respiratory_disease_rate',
  'age',
  'industrial_exposure_index',
  'vector_borne_disease_rate',
  'avg_aqi',
  'traffic_density_index',
  'waterborne_disease_rate',
  'climate_change_risk_score'])

Multicollinearity check (VIF)

In [28]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_vif = df[selected_features]

vif_df = pd.DataFrame()
vif_df["feature"] = X_vif.columns
vif_df["VIF"] = [
    variance_inflation_factor(X_vif.values, i)
    for i in range(X_vif.shape[1])
]

vif_df.sort_values("VIF", ascending=False)


Unnamed: 0,feature,VIF
7,avg_aqi,10.787529
10,climate_change_risk_score,8.403737
4,age,7.354902
9,waterborne_disease_rate,5.554586
3,respiratory_disease_rate,5.472989
8,traffic_density_index,5.435706
6,vector_borne_disease_rate,5.024603
2,annual_income_lakh,3.931827
5,industrial_exposure_index,3.40406
1,smoker,1.421195


Final feature set

In [29]:
X = df[selected_features]
y = df[target]

print("Final feature count:", X.shape[1])


Final feature count: 11
