# 🧠 AI Risk Assessment & Explainability Framework – Step 1: EDA + Preprocessing

This notebook performs:
- Data loading and initial exploration (EDA)
- Missing value and class balance analysis
- Identification of numeric, categorical, and date features
- Preprocessing pipeline construction using `ColumnTransformer`
- Train/test split (stratified)
- Export of cleaned data and preprocessing object

**Dataset:** `Insurance_claims_data.csv` (read from `../data/`)

**Goal:** Predict likelihood of insurance claim occurrence (`claim_status` = 1 means a claim was made)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Optional: category encoders if you prefer target or ordinal encoding
# import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier


In [10]:
# Read the raw claims table from the project's data folder
data_path = "../data/Insurance_claims_data.csv"
df = pd.read_csv(data_path)

# Quick sanity check: dimensions, then a preview of the first rows
print("Shape:", df.shape)
df.head()


Shape: (58592, 41)


Unnamed: 0,policy_id,subscription_length,vehicle_age,customer_age,region_code,region_density,segment,model,fuel_type,max_torque,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,claim_status
0,POL045360,9.3,1.2,41,C8,8794,C2,M4,Diesel,250Nm@2750rpm,...,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,3,0
1,POL016745,8.2,1.8,35,C2,27003,C1,M9,Diesel,200Nm@1750rpm,...,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,4,0
2,POL007194,9.5,0.2,44,C8,8794,C2,M4,Diesel,250Nm@2750rpm,...,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,3,0
3,POL018146,5.2,0.4,44,C10,73430,A,M1,CNG,60Nm@3500rpm,...,No,No,No,Yes,No,No,No,Yes,0,0
4,POL049011,10.1,1.0,56,C13,5410,B2,M5,Diesel,200Nm@3000rpm,...,No,Yes,Yes,Yes,No,No,Yes,Yes,5,0


In [12]:
# Structural overview: column dtypes and non-null counts
df.info()

# Descriptive statistics, printed as plain text
summary_stats = df.describe()
print(summary_stats)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58592 entries, 0 to 58591
Data columns (total 41 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   policy_id                         58592 non-null  object 
 1   subscription_length               58592 non-null  float64
 2   vehicle_age                       58592 non-null  float64
 3   customer_age                      58592 non-null  int64  
 4   region_code                       58592 non-null  object 
 5   region_density                    58592 non-null  int64  
 6   segment                           58592 non-null  object 
 7   model                             58592 non-null  object 
 8   fuel_type                         58592 non-null  object 
 9   max_torque                        58592 non-null  object 
 10  max_power                         58592 non-null  object 
 11  engine_type                       58592 non-null  object 
 12  airb

In [14]:
import re  # NOTE(review): unused in this cell; kept in case later cells rely on it

# Drop the unique identifier column.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=["policy_id"], errors="ignore")

# Convert Yes/No flag columns to 1/0.
# Only columns whose non-null values are exclusively "Yes"/"No" are converted,
# so a column that merely contains an occasional "Yes" is never half-converted.
# Series.map produces the numeric values directly, instead of depending on
# DataFrame.replace's implicit object -> int downcasting (deprecated in pandas 2.2);
# the dtype-based numeric/categorical split further down relies on these columns
# ending up numeric.
YES_NO_MAP = {"Yes": 1, "No": 0}
is_yes_no = df.isin(list(YES_NO_MAP))
yes_no_cols = df.columns[is_yes_no.any() & (is_yes_no | df.isna()).all()]
for col in yes_no_cols:
    df[col] = df[col].map(YES_NO_MAP)

# Extract the leading torque figure from 'max_torque' (e.g. "250Nm@2750rpm" -> 250.0).
# The optional fractional part keeps decimals: a value such as "82.1Nm@3400rpm"
# yields 82.1, whereas a plain (\d+) pattern would truncate it to 82.
# astype(str) lets the cell be re-run after the column has already become float.
if "max_torque" in df.columns:
    df["max_torque"] = (
        df["max_torque"]
        .astype(str)
        .str.extract(r"(\d+(?:\.\d+)?)", expand=False)
        .astype(float)
    )

print("✅ Cleaned Yes/No columns and extracted numeric torque values.")
df.head(3)


✅ Cleaned Yes/No columns and extracted numeric torque values.


Unnamed: 0,subscription_length,vehicle_age,customer_age,region_code,region_density,segment,model,fuel_type,max_torque,max_power,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,claim_status
0,9.3,1.2,41,C8,8794,C2,M4,Diesel,250.0,113.45bhp@4000rpm,...,1,1,1,1,1,0,1,1,3,0
1,8.2,1.8,35,C2,27003,C1,M9,Diesel,200.0,97.89bhp@3600rpm,...,0,1,1,1,1,1,1,1,4,0
2,9.5,0.2,44,C8,8794,C2,M4,Diesel,250.0,113.45bhp@4000rpm,...,1,1,1,1,1,0,1,1,3,0


In [16]:

# Share of null entries per column, largest first
null_mask = df.isnull()
missing = null_mask.mean().sort_values(ascending=False)

# Show only the columns that actually contain nulls (at most 20 of them)
has_missing = missing > 0
print("\nMissing columns (>0):")
print(missing[has_missing].head(20))



Missing columns (>0):
Series([], dtype: float64)


In [25]:
# Name of the label column used throughout the notebook
target_col = "claim_status"

# Absolute and relative frequency of each class
class_counts = df[target_col].value_counts()
class_pct = df[target_col].value_counts(normalize=True) * 100

print("Class distribution:\n", class_counts)
print("\nClass proportion (%):\n", class_pct)


Class distribution:
 claim_status
0    54844
1     3748
Name: count, dtype: int64

Class proportion (%):
 claim_status
0    93.603222
1     6.396778
Name: proportion, dtype: float64


In [33]:
# Build the numeric / categorical feature-name lists from column dtypes,
# leaving the label column out of both.
target_col = "claim_status"

numeric_frame = df.select_dtypes(include=["int64", "float64"])
categorical_frame = df.select_dtypes(include=["object", "category"])

num_cols = numeric_frame.columns.drop(target_col, errors="ignore").tolist()
cat_cols = categorical_frame.columns.drop(target_col, errors="ignore").tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['subscription_length', 'vehicle_age', 'customer_age', 'region_density', 'max_torque', 'airbags', 'is_esc', 'is_adjustable_steering', 'is_tpms', 'is_parking_sensors', 'is_parking_camera', 'displacement', 'cylinder', 'turning_radius', 'length', 'width', 'gross_weight', 'is_front_fog_lights', 'is_rear_window_wiper', 'is_rear_window_washer', 'is_rear_window_defogger', 'is_brake_assist', 'is_power_door_locks', 'is_central_locking', 'is_power_steering', 'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror', 'is_ecw', 'is_speed_alert', 'ncap_rating']
Categorical columns: ['region_code', 'segment', 'model', 'fuel_type', 'max_power', 'engine_type', 'rear_brakes_type', 'transmission_type', 'steering_type']


In [39]:
# Numeric branch: fill gaps with the column median, then standardise
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" lets transform() accept categories not seen during fit.
cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route each column group to its branch. The transformer is only constructed
# here; this cell does not fit it.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ]
)

print("Preprocessor ready ✅", cat_cols, num_cols)


Preprocessor ready ✅ ['region_code', 'segment', 'model', 'fuel_type', 'max_power', 'engine_type', 'rear_brakes_type', 'transmission_type', 'steering_type'] ['subscription_length', 'vehicle_age', 'customer_age', 'region_density', 'max_torque', 'airbags', 'is_esc', 'is_adjustable_steering', 'is_tpms', 'is_parking_sensors', 'is_parking_camera', 'displacement', 'cylinder', 'turning_radius', 'length', 'width', 'gross_weight', 'is_front_fog_lights', 'is_rear_window_wiper', 'is_rear_window_washer', 'is_rear_window_defogger', 'is_brake_assist', 'is_power_door_locks', 'is_central_locking', 'is_power_steering', 'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror', 'is_ecw', 'is_speed_alert', 'ncap_rating']


In [36]:
# Separate the label from the features
y = df[target_col]
X = df.drop(columns=[target_col])

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)

# Class balance of the full dataset, for reference next to the split sizes
print("Class counts:\n", df["claim_status"].value_counts())
print("\nClass proportions (%):\n", df["claim_status"].value_counts(normalize=True) * 100)

Train size: (46873, 39) Test size: (11719, 39)
Class counts:
 claim_status
0    54844
1     3748
Name: count, dtype: int64

Class proportions (%):
 claim_status
0    93.603222
1     6.396778
Name: proportion, dtype: float64


In [42]:
# Write the four split objects to CSV without the index column.
# NOTE(review): these are the frames exactly as returned by train_test_split;
# `preprocessor` is not applied to them in the cells visible here.
exports = [
    (X_train, "../data/X_train_processed.csv"),
    (X_test, "../data/X_test_processed.csv"),
    (y_train, "../data/y_train_processed.csv"),
    (y_test, "../data/y_test_processed.csv"),
]
for frame, csv_path in exports:
    frame.to_csv(csv_path, index=False)