# Preprocessing

In this phase, we address every issue identified during the EDA phase.

# Imports

In [2]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data preprocessing and utilities
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Machine learning models and tools
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Oversampling
from imblearn.over_sampling import SMOTE

# Loading Dataset

In [3]:
# Load the cleaned dataset (semicolon-separated CSV) into a DataFrame
df = pd.read_csv(
    "../database/cleaned_dataset.csv",
    sep=";",
)

# Handling missing values

In [4]:
# No missing values are expected in the dataset; verify it on every run so the
# later oversampling/scaling steps never receive NaNs silently.
assert not df.isna().any().any(), "The dataset unexpectedly contains missing values"

# Feature Encoding

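As mentioned before, we use **One-Hot Encoding** for categorical features with fewer than 10 classes and **Label Encoding** for those with a higher class count. In this dataset the only categorical feature is `residence_status`, so only One-Hot Encoding is applied below.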

In [5]:
# Separate the label column ("TARGET") from the predictor columns
y = df["TARGET"]
X = df.drop(columns=["TARGET"])

In [6]:
# Encoding strategy for the features: partition the columns by dtype.
# Object-dtype columns are treated as categorical (here this is only "residence_status").
categorical_cols = X.select_dtypes(include="object").columns
# int64 / float64 columns are treated as numerical.
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns

In [7]:
# One-hot encode the categorical columns (only residence_status); drop_first=True
# omits the first level of each encoded column.
X = pd.get_dummies(data=X, columns=categorical_cols, drop_first=True)

# Train Test Split, SMOTE, and Scaling

Here we perform the *train_test_split* in a custom way.

In fact, in this phase, we:
1. Split the dataset into training and test sets (80% / 20%), using the **stratify** parameter so that both sets keep the same distribution of the target variable.
2. Remove outliers.
3. Apply SMOTE to the minority class of the training set only, so that the classes become more balanced.
4. Scale the features using a **Standard Scaler**.

In [8]:
def calculate_sampling_strategy(y, multiplier=10):
    """
    Build the SMOTE ``sampling_strategy`` dictionary for the rarest class in ``y``.

    Parameters:
        y (array-like): Target labels of the training set.
        multiplier (int): Factor applied to the current size of the rarest class.

    Returns:
        dict: Single-entry mapping ``{rarest_label: current_count * multiplier}``.
    """
    class_counts = pd.Series(y).value_counts()
    rarest_label = class_counts.idxmin()  # label with the fewest samples
    # Requested sample count = present count of the rarest class times the multiplier
    return {rarest_label: class_counts[rarest_label] * multiplier}

In [9]:
def apply_smote(X, y, sampling_strategy, random_state=42):
    """
    Oversample a training set with SMOTE according to ``sampling_strategy``.

    Parameters:
        X (pd.DataFrame): Training features.
        y (array-like): Training labels.
        sampling_strategy (dict): Sampling strategy handed to SMOTE; when it is
            empty/falsy, no resampling is performed.
        random_state (int): Random seed passed to SMOTE.

    Returns:
        tuple: Resampled features and labels, or the untouched ``(X, y)`` when
        no sampling strategy is given.
    """
    # Nothing requested: hand the inputs back unchanged
    if not sampling_strategy:
        return X, y

    oversampler = SMOTE(
        sampling_strategy=sampling_strategy,
        random_state=random_state,
    )
    return oversampler.fit_resample(X, y)

In [10]:
def scale_features(X_train, X_test, numerical_cols):
    """
    Scale numerical features using StandardScaler.

    The scaler is fitted on the training columns only and then applied to the
    test columns. The input frames are not modified; copies are returned.

    Parameters:
        X_train (pd.DataFrame): Training feature set.
        X_test (pd.DataFrame): Test feature set.
        numerical_cols (list or pd.Index): Names of the numerical feature columns.
            May be empty, in which case nothing is scaled.

    Returns:
        tuple: Scaled copies of the training and test feature sets.
    """
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    # Fitting StandardScaler on a zero-column selection raises, so when there is
    # nothing to scale return the untouched copies instead. len() is used because
    # the truth value of a pandas Index is ambiguous.
    if len(numerical_cols) == 0:
        return X_train_scaled, X_test_scaled

    scaler = StandardScaler()
    # Fit on the training data only; the test data reuses the fitted statistics
    X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])
    return X_train_scaled, X_test_scaled

In [11]:
def custom_train_test_split(
    X, y, test_size=0.2, multiplier=10, random_state=42
):
    """
    Stratified train-test split, followed by SMOTE oversampling of the training
    part and standard scaling of the int64/float64 columns.

    Parameters:
        X (pd.DataFrame): Feature dataset.
        y (pd.Series): Target variable.
        test_size (float): Proportion of the dataset to include in the test split.
        multiplier (int): Factor by which the minority class will be oversampled.
        random_state (int): Random seed used for the split and for SMOTE.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled, y_train_resampled, y_test)``.
    """
    # 1) Stratified hold-out split; the test part is never oversampled
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # 2) Oversample the minority class of the training part only
    strategy = calculate_sampling_strategy(y_tr, multiplier)
    X_tr_res, y_tr_res = apply_smote(X_tr, y_tr, strategy, random_state)

    # 3) Scale the int64/float64 columns (scaler fitted on the resampled training part)
    numeric_columns = X.select_dtypes(include=["int64", "float64"]).columns
    X_tr_scaled, X_te_scaled = scale_features(X_tr_res, X_te, numeric_columns)

    return X_tr_scaled, X_te_scaled, y_tr_res, y_te

In [12]:
# Run the split -> SMOTE -> scaling pipeline: 20% test share, 10x minority oversampling
X_train, X_test, y_train, y_test = custom_train_test_split(
    X,
    y,
    test_size=0.2,
    multiplier=10,
    random_state=42,
)

In [13]:
# Sanity check: feature-matrix shapes and label counts of both splits
print(
    f"X_train: {X_train.shape}, y_train: {len(y_train)}",
    f"X_test: {X_test.shape}, y_test: {len(y_test)}",
    sep="\n",
)

X_train: (59993, 21), y_train: 59993
X_test: (13575, 21), y_test: 13575


In [14]:
# Report how many samples of each class ended up in the training and test labels.
# The classes are sorted because set iteration order is arbitrary; sorting keeps
# the printed report in the same order on every run.
for cls in sorted(set(y_train).union(y_test)):
    # Summing the boolean mask counts the matching rows
    train_count = int((y_train == cls).sum())
    test_count = int((y_test == cls).sum())
    print(f"{cls} Class in y_train: {train_count}")
    print(f"{cls} Class in y_test: {test_count}")
    print("")

0 Class in y_train: 53663
0 Class in y_test: 13417

1 Class in y_train: 6330
1 Class in y_test: 158



# Saving preprocessed dataset

In [15]:
# `os` and `pandas` are already imported in the imports cell at the top of the
# notebook, so they are not re-imported here.

# Directory that receives the four CSV files; created if it does not exist yet
output_dir = "../database/preprocessed_dataset"
os.makedirs(output_dir, exist_ok=True)

# Save training data (index=False keeps the pandas index out of the files)
X_train.to_csv(f"{output_dir}/X_train.csv", index=False)
y_train.to_csv(f"{output_dir}/y_train.csv", index=False)

# Save testing data
X_test.to_csv(f"{output_dir}/X_test.csv", index=False)
y_test.to_csv(f"{output_dir}/y_test.csv", index=False)

print("Training and testing datasets saved in 'database/preprocessed_dataset/'")

Training and testing datasets saved in 'database/preprocessed_dataset/'
