# Baseline Model with SMOTE

This notebook runs the preprocessing pipeline with SMOTE and trains baseline classifiers.

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

from src.preprocessing_pipeline import build_preprocessed_data

In [2]:
# Run preprocessing pipeline with SMOTE
print("Loading and preprocessing data with SMOTE...")
data = build_preprocessed_data(use_smote=True)

X_train = data["X_train_std"]
y_train = data["Y_train"]
X_val = data["X_val_std"]
y_val = data["Y_val"]
X_test = data["X_test_std"]
y_test = data["Y_test"]

print("Training shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("Test shape:", X_test.shape, y_test.shape)

Loading and preprocessing data with SMOTE...
Loading the file using Polars...
File loaded successfully with Polars. Shape: (2302521, 308)
Converted the Polars DataFrame to Pandas.
Applying SMOTE...
SMOTE done.
Training shape: (5380328, 52) (5380328,)
Validation shape: (460504, 52) (460504,)
Test shape: (460505, 52) (460505,)


In [3]:
# Baseline Model: Logistic Regression
print("Training Logistic Regression baseline...")
clf = LogisticRegression(random_state=207, max_iter=1000)
clf.fit(X_train, y_train)

Training Logistic Regression baseline...


In [4]:
# Evaluate on Validation Set
print("Evaluating on Validation Set...")
y_pred_val = clf.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))
print("\nClassification Report (Validation):\n")
print(classification_report(y_val, y_pred_val))

Evaluating on Validation Set...
Validation Accuracy: 0.8933842051317686

Classification Report (Validation):

              precision    recall  f1-score   support

           0       0.98      0.91      0.95    448361
           1       0.06      0.09      0.07     11185
           2       0.01      0.21      0.02       738
           3       0.01      0.49      0.02       220

    accuracy                           0.89    460504
   macro avg       0.26      0.42      0.26    460504
weighted avg       0.96      0.89      0.92    460504



In [5]:
# Evaluate on Test Set
print("Evaluating on Test Set...")
y_pred_test = clf.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report (Test):\n")
print(classification_report(y_test, y_pred_test))

Evaluating on Test Set...
Test Accuracy: 0.8938187424675085

Classification Report (Test):

              precision    recall  f1-score   support

           0       0.98      0.92      0.95    448364
           1       0.06      0.09      0.07     11185
           2       0.01      0.20      0.02       734
           3       0.01      0.52      0.02       222

    accuracy                           0.89    460505
   macro avg       0.27      0.43      0.27    460505
weighted avg       0.96      0.89      0.92    460505



In [6]:
# Random Forest Classifier
print("Training Random Forest Classifier...")
rf_clf = RandomForestClassifier(random_state=207, n_jobs=-1)
rf_clf.fit(X_train, y_train)

Training Random Forest Classifier...


In [7]:
# Evaluate Random Forest on Validation Set
print("Evaluating Random Forest on Validation Set...")
y_pred_val_rf = rf_clf.predict(X_val)

print("Validation Accuracy (RF):", accuracy_score(y_val, y_pred_val_rf))
print("\nClassification Report (Validation - RF):\n")
print(classification_report(y_val, y_pred_val_rf))

Evaluating Random Forest on Validation Set...
Validation Accuracy (RF): 0.9618765526466654

Classification Report (Validation - RF):

              precision    recall  f1-score   support

           0       0.98      0.98      0.98    448361
           1       0.22      0.20      0.21     11185
           2       0.09      0.03      0.04       738
           3       0.16      0.06      0.09       220

    accuracy                           0.96    460504
   macro avg       0.36      0.32      0.33    460504
weighted avg       0.96      0.96      0.96    460504



In [8]:
# Evaluate Random Forest on Test Set
print("Evaluating Random Forest on Test Set...")
y_pred_test_rf = rf_clf.predict(X_test)

print("Test Accuracy (RF):", accuracy_score(y_test, y_pred_test_rf))
print("\nClassification Report (Test - RF):\n")
print(classification_report(y_test, y_pred_test_rf))

Evaluating Random Forest on Test Set...
Test Accuracy (RF): 0.9616442818210443

Classification Report (Test - RF):

              precision    recall  f1-score   support

           0       0.98      0.98      0.98    448364
           1       0.21      0.18      0.20     11185
           2       0.09      0.03      0.04       734
           3       0.12      0.05      0.07       222

    accuracy                           0.96    460505
   macro avg       0.35      0.31      0.32    460505
weighted avg       0.96      0.96      0.96    460505



In [9]:
# XGBoost Classifier
print("Training XGBoost Classifier...")
xgb_clf = XGBClassifier(random_state=207, use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
xgb_clf.fit(X_train, y_train)

Training XGBoost Classifier...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [10]:
# Evaluate XGBoost on Validation Set
print("Evaluating XGBoost on Validation Set...")
y_pred_val_xgb = xgb_clf.predict(X_val)

print("Validation Accuracy (XGB):", accuracy_score(y_val, y_pred_val_xgb))
print("\nClassification Report (Validation - XGB):\n")
print(classification_report(y_val, y_pred_val_xgb))

Evaluating XGBoost on Validation Set...
Validation Accuracy (XGB): 0.9571816965759256

Classification Report (Validation - XGB):

              precision    recall  f1-score   support

           0       0.98      0.98      0.98    448361
           1       0.14      0.09      0.11     11185
           2       0.03      0.08      0.05       738
           3       0.05      0.24      0.08       220

    accuracy                           0.96    460504
   macro avg       0.30      0.35      0.30    460504
weighted avg       0.95      0.96      0.96    460504



In [11]:
# Evaluate XGBoost on Test Set
print("Evaluating XGBoost on Test Set...")
y_pred_test_xgb = xgb_clf.predict(X_test)

print("Test Accuracy (XGB):", accuracy_score(y_test, y_pred_test_xgb))
print("\nClassification Report (Test - XGB):\n")
print(classification_report(y_test, y_pred_test_xgb))

Evaluating XGBoost on Test Set...
Test Accuracy (XGB): 0.9575618071465022

Classification Report (Test - XGB):

              precision    recall  f1-score   support

           0       0.98      0.98      0.98    448364
           1       0.14      0.09      0.11     11185
           2       0.04      0.09      0.05       734
           3       0.04      0.21      0.07       222

    accuracy                           0.96    460505
   macro avg       0.30      0.34      0.30    460505
weighted avg       0.95      0.96      0.96    460505

