In [1]:
# Data handling.
# NOTE(review): `np` is not referenced in the cells shown in this notebook — confirm before removing.
import pandas as pd
import numpy as np

# scikit-learn preprocessing building blocks used to assemble the ColumnTransformer below.
# NOTE(review): `Pipeline` is imported but not used in the cells shown — confirm before removing.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Simple confirmation that every import above succeeded.
print("Libraries loaded")


Libraries loaded


In [2]:
# Read the cleaned insurance table. The path is relative, so it resolves
# against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/insurance_clean.csv')

# Preview the first five rows as the cell's rich output.
df.head(5)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,BMI_Level,Risk_Level,Risk_Profile
0,19,female,27.9,0,yes,southwest,16884.924,Overweight,High,High
1,18,male,33.77,1,no,southeast,1725.5523,Obese,Low,Medium
2,28,male,33.0,3,no,southeast,4449.462,Obese,Low,Medium
3,33,male,22.705,0,no,northwest,21984.47061,Normal,High,Low
4,32,male,28.88,0,no,northwest,3866.8552,Overweight,Low,Low


In [3]:
# Pull out the target first, then keep every other column as the candidate
# feature table. X may still hold columns that are not listed as model
# features; only the columns named in the feature lists are selected later.
y = df["Risk_Level"]
X = df.drop(columns=[y.name])

# Report the dimensions of both pieces.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


X shape: (1338, 9)
y shape: (1338,)


In [4]:
# Columns routed to the numeric branch of the preprocessor.
# Kept as a plain list: it is concatenated with `+` when feature names are assembled.
numerical_features = [
    "age",
    "bmi",
    "children",
]

# Columns routed to the one-hot encoding branch of the preprocessor.
categorical_features = [
    "sex",
    "smoker",
    "region",
    "BMI_Level",
]

print(f"Numerical: {numerical_features}")
print(f"Categorical: {categorical_features}")


Numerical: ['age', 'bmi', 'children']
Categorical: ['sex', 'smoker', 'region', 'BMI_Level']


In [5]:
# Numeric branch: StandardScaler with its default settings.
numeric_transformer = StandardScaler()

# Categorical branch: one-hot encoding that omits the first level of every column.
# NOTE(review): per the scikit-learn docs, combining drop="first" with
# handle_unknown="ignore" encodes a category unseen at fit time as all zeros,
# which is indistinguishable from the dropped first level — confirm this is intended.
categorical_transformer = OneHotEncoder(drop="first", handle_unknown="ignore")

# Route each column group to its transformer. Output columns follow the order
# of this list: the "num" block first, then the "cat" block.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

print("Preprocessing pipeline ready")


Preprocessing pipeline ready


In [6]:
# Fit the preprocessor on X and transform X in the same call.
# NOTE(review): the fit uses every row of X. If a train/test split is applied
# downstream, confirm that fitting on the full table is acceptable for this
# project (scaling statistics would be learned from held-out rows as well).
X_processed = preprocessor.fit_transform(X)

print(f"Shape after preprocessing: {X_processed.shape}")


Shape after preprocessing: (1338, 10)


In [7]:
# Ask the fitted one-hot encoder for the names of the columns it generated.
fitted_encoder = preprocessor.named_transformers_["cat"]
cat_feature_names = fitted_encoder.get_feature_names_out(categorical_features)

# Numeric names come first, then the encoded names — the same order in which
# the transformers were listed when the preprocessor was built.
feature_names = [*numerical_features, *cat_feature_names]

print(f"Total features: {len(feature_names)}")
feature_names


Total features: 10


['age',
 'bmi',
 'children',
 'sex_male',
 'smoker_yes',
 'region_northwest',
 'region_southeast',
 'region_southwest',
 'BMI_Level_Obese',
 'BMI_Level_Overweight']

In [8]:
# Wrap the transformed matrix in a labelled DataFrame.
# - The transformer output may be a sparse matrix or a dense array; anything
#   exposing `.toarray()` is densified first.
# - `index=X.index` carries over the row labels of the input table, so the
#   processed frame stays aligned with X and y even when X does not have a
#   default 0..n-1 index (for example after rows were filtered out). With a
#   default index the result is the same as before.
X_processed_df = pd.DataFrame(
    X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed,
    columns=feature_names,
    index=X.index,
)

X_processed_df.head()


Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,BMI_Level_Obese,BMI_Level_Overweight
0,-1.438764,-0.45332,-0.908614,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,-1.509965,0.509621,-0.078767,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,-0.797954,0.383307,1.580926,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,-0.441948,-1.305531,-0.908614,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.513149,-0.292556,-0.908614,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [9]:
# Total count of missing cells across the whole processed table, followed by
# the table's (rows, columns) shape.
print(f"Missing values: {X_processed_df.isna().sum().sum()}")
print(f"Final feature shape: {X_processed_df.shape}")


Missing values: 0
Final feature shape: (1338, 10)


In [10]:
# Build the export table as a NEW frame instead of adding the target column to
# X_processed_df in place. Mutating the feature table would leave the label
# inside it, so re-running earlier cells would report an extra column and any
# later use of X_processed_df as a feature matrix would include the target.
# `y.values` is attached positionally (row order of y matches row order of X).
ml_ready_df = X_processed_df.assign(Risk_Level=y.values)

# Persist features + target; the row index is not written to the file.
ml_ready_df.to_csv('../data/insurance_ml_ready.csv', index=False)

print("ML-ready dataset saved")


ML-ready dataset saved
