In [19]:
!pip install -r requirement.txt



In [20]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

### 1. Loading and Exploring data

In [21]:
# Read the processed dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/processed/aOvRUrcRLE.csv')
df.head(n=5)

Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins
0,France,Female,3.7612,2,0.0,1,1,1,101348.88,1,Fair
1,Spain,Female,3.73767,1,83807.86,1,0,1,112542.58,0,Fair
2,France,Female,3.7612,8,159660.8,3,1,0,113931.57,1,Poor
3,France,Female,3.68668,1,0.0,2,0,0,93826.63,0,Good
4,Spain,Female,3.78419,2,125510.82,1,1,1,79084.1,0,Excellent


### 2. Build Scikit-learn Pipelines

In [None]:
# Column groups used by the preprocessing steps in this cell.

# Columns that bypass the preprocessor and are re-attached unchanged afterwards.
remainder_features = [
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'Exited',
]

# Columns routed to the numerical (impute + scale) pipeline.
numerical_features = [
    'Age',
    'Tenure',
    'Balance',
    'EstimatedSalary',
]

# Unordered categorical columns routed to the one-hot pipeline.
nominal_features = [
    'Gender',
    'Geography',
]

# Categorical columns routed to the ordinal-encoding pipeline.
ordinal_features = [
    'CreditScoreBins',
]

# Numerical columns: fill missing values with the column median, then
# standardize with StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Nominal columns: replace NaN with the literal category 'missing', then expand
# every category into its own indicator column.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('encoder', OneHotEncoder()),
])

# Ordinal columns: replace NaN with the literal category 'missing', then map
# each category to an integer code.
# NOTE(review): OrdinalEncoder() with its default categories='auto' assigns
# codes in sorted (alphabetical) order of the category labels, not in their
# semantic rank. If the credit-score bins are meant to be ranked, pass the
# ranking explicitly via `categories=[[...]]` once the full set of bin labels
# is confirmed.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('encoder', OrdinalEncoder()),
])


# Master preprocessor: routes each column group to its own transformer and
# stacks the results side by side in the order the transformers are listed
# (numerical, then nominal, then ordinal).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),  # Impute + scale numbers
        ('nom', nominal_transformer, nominal_features),      # Impute + one-hot encode
        ('ord', ordinal_transformer, ordinal_features)       # Impute + ordinal encode
    ],
    remainder='drop',  # Ignore columns not listed above
    # OneHotEncoder produces a sparse matrix by default, and ColumnTransformer
    # returns a sparse result whenever the stacked density falls below
    # `sparse_threshold`. Force a dense array so the output can always be
    # wrapped in a pandas DataFrame, regardless of how many categories appear.
    sparse_threshold=0
)


# Fit the preprocessor and transform the data. A copy is passed so the loaded
# frame `df` is kept separate from whatever the transformers receive.
# NOTE(review): the imputers/scaler are fitted on the full dataset here; if a
# train/test split happens later, fit on the training part only to avoid leakage.
df_cp = df.copy()
transformed_values = preprocessor.fit_transform(df_cp)

# Build the one-hot column labels from the *fitted* encoder. OneHotEncoder
# emits its columns in the order stored in `categories_` (sorted per feature),
# which generally differs from the order of first appearance returned by
# `df[feature].unique()`; labelling from the data would attach the wrong name
# to a column. `categories_` also reflects the imputed 'missing' category.
fitted_nominal_encoder = preprocessor.named_transformers_['nom'].named_steps['encoder']
nominal_features_names = [
    f"{feature}_{category}"
    for feature, categories in zip(nominal_features, fitted_nominal_encoder.categories_)
    for category in categories
]

# Column order mirrors the transformer order in `preprocessor`: num, nom, ord.
# Reuse the index of `df` so the concat below aligns row-for-row even when
# `df` does not carry a default RangeIndex.
df_transformed = pd.DataFrame(
    transformed_values,
    columns=numerical_features + nominal_features_names + ordinal_features,
    index=df.index
)

# Re-attach the columns that bypassed the preprocessor.
df_remainder = df[remainder_features]
df_pp = pd.concat(
    [df_transformed, df_remainder],
    axis=1
)

In [38]:
# Persist the preprocessed frame as CSV, leaving the row index out of the file.
df_pp.to_csv('data/processed/X_Transformed.csv', index=False)