# Baseline Model with Class Weights (No SMOTE)

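This notebook runs the preprocessing pipeline **without** SMOTE and trains three baseline classifiers (Logistic Regression, Random Forest, and XGBoost), handling class imbalance through balanced class weights (passed as per-sample weights for XGBoost). Each model is evaluated on the validation set.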

In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_sample_weight

from src.preprocessing_pipeline import build_preprocessed_data

In [12]:
# Build the train/validation/test splits with SMOTE disabled, leaving any
# class imbalance to be handled by the models' weighting.
print("Loading and preprocessing data (No SMOTE)...")
data = build_preprocessed_data(use_smote=False)

# Pull one (features, labels) pair per split out of the returned mapping.
# NOTE(review): the "_std" suffix presumably means standardized features;
# confirm in src.preprocessing_pipeline.
X_train, y_train = data["X_train_std"], data["Y_train"]
X_val, y_val = data["X_val_std"], data["Y_val"]
X_test, y_test = data["X_test_std"], data["Y_test"]

# Report the shape of every split as a quick sanity check.
for label, features, targets in (
    ("Training shape:", X_train, y_train),
    ("Validation shape:", X_val, y_val),
    ("Test shape:", X_test, y_test),
):
    print(label, features.shape, targets.shape)

Loading and preprocessing data (No SMOTE)...
[Memory] Pipeline start: 14.53 GB
Loading the file using Polars...
File loaded successfully with Polars. Shape: (2302521, 308)
Converted the Polars DataFrame to Pandas.
[Memory] After data load: 21.27 GB
Loaded 2302521 rows
[Memory] After sampling: 15.87 GB
Imputing with global mean...
[Memory] Pipeline complete: 13.94 GB
Training shape: (2400, 52) (2400,)
Validation shape: (800, 52) (800,)
Test shape: (800, 52) (800,)


In [13]:
# Per-sample weights from the "balanced" heuristic, for the XGBoost fit,
# which receives them through `sample_weight`.
sample_weights = compute_sample_weight("balanced", y_train)

# NOTE: balanced weights average to 1 by construction, so this print only
# confirms the call ran; it does not show how imbalanced y_train is.
print("Sample weights computed. Mean:", sample_weights.mean())

Sample weights computed. Mean: 0.9999999999999998


In [14]:
# Logistic Regression baseline. class_weight='balanced' lets the estimator
# reweight the classes itself, so no explicit sample weights are passed.
print("Training Logistic Regression (balanced)...")
clf = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,      # iteration cap for the solver
    random_state=207,   # fixed seed so reruns are comparable
)
clf.fit(X_train, y_train)

Training Logistic Regression (balanced)...


In [15]:
# Score the fitted Logistic Regression on the validation split: overall
# accuracy first, then the per-class precision/recall/F1 report.
print("Evaluating Logistic Regression on Validation Set...")
y_pred_val = clf.predict(X_val)

lr_val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy (LR):", lr_val_accuracy)

print("\nClassification Report (Validation - LR):\n")
lr_val_report = classification_report(y_val, y_pred_val)
print(lr_val_report)

Evaluating Logistic Regression on Validation Set...
Validation Accuracy (LR): 0.49125

Classification Report (Validation - LR):

              precision    recall  f1-score   support

           0       0.63      0.61      0.62       200
           1       0.42      0.40      0.41       200
           2       0.41      0.36      0.39       214
           3       0.50      0.61      0.55       186

    accuracy                           0.49       800
   macro avg       0.49      0.50      0.49       800
weighted avg       0.49      0.49      0.49       800



In [16]:
# Random Forest baseline, again relying on the estimator's own
# class_weight='balanced' option rather than explicit sample weights.
print("Training Random Forest Classifier (balanced)...")
rf_clf = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,          # use every available core
    random_state=207,   # fixed seed so reruns are comparable
)
rf_clf.fit(X_train, y_train)

Training Random Forest Classifier (balanced)...


In [17]:
# Score the fitted Random Forest on the validation split: overall accuracy
# first, then the per-class precision/recall/F1 report.
print("Evaluating Random Forest on Validation Set...")
y_pred_val_rf = rf_clf.predict(X_val)

rf_val_accuracy = accuracy_score(y_val, y_pred_val_rf)
print("Validation Accuracy (RF):", rf_val_accuracy)

print("\nClassification Report (Validation - RF):\n")
rf_val_report = classification_report(y_val, y_pred_val_rf)
print(rf_val_report)

Evaluating Random Forest on Validation Set...
Validation Accuracy (RF): 0.51625

Classification Report (Validation - RF):

              precision    recall  f1-score   support

           0       0.63      0.69      0.66       200
           1       0.46      0.38      0.42       200
           2       0.42      0.38      0.40       214
           3       0.53      0.62      0.57       186

    accuracy                           0.52       800
   macro avg       0.51      0.52      0.51       800
weighted avg       0.51      0.52      0.51       800



In [18]:
# XGBoost baseline. Class imbalance is handled through the precomputed
# balanced per-sample weights passed to fit(), not a constructor option.
print("Training XGBoost Classifier (weighted)...")
xgb_clf = XGBClassifier(
    random_state=207,   # fixed seed so reruns are comparable
    # `use_label_encoder` was dropped: the installed XGBoost ignores it and
    # warns 'Parameters: { "use_label_encoder" } are not used.'
    # The target has four classes, so the multiclass log loss is the matching
    # metric; it only takes effect when an eval_set is supplied to fit().
    eval_metric='mlogloss',
    n_jobs=-1,          # use every available core
)
xgb_clf.fit(X_train, y_train, sample_weight=sample_weights)

Training XGBoost Classifier (weighted)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [19]:
# Score the fitted XGBoost model on the validation split: overall accuracy
# first, then the per-class precision/recall/F1 report.
print("Evaluating XGBoost on Validation Set...")
y_pred_val_xgb = xgb_clf.predict(X_val)

xgb_val_accuracy = accuracy_score(y_val, y_pred_val_xgb)
print("Validation Accuracy (XGB):", xgb_val_accuracy)

print("\nClassification Report (Validation - XGB):\n")
xgb_val_report = classification_report(y_val, y_pred_val_xgb)
print(xgb_val_report)

Evaluating XGBoost on Validation Set...
Validation Accuracy (XGB): 0.52875

Classification Report (Validation - XGB):

              precision    recall  f1-score   support

           0       0.71      0.65      0.68       200
           1       0.48      0.47      0.48       200
           2       0.41      0.42      0.42       214
           3       0.53      0.58      0.56       186

    accuracy                           0.53       800
   macro avg       0.54      0.53      0.53       800
weighted avg       0.53      0.53      0.53       800

