Load data

In [None]:
# Mount Google Drive at /content/drive; the cells below read their CSV and
# pickle inputs from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import joblib

# Drive locations, defined once so that a different mount point or folder
# layout only needs a single edit.
DRIVE_ROOT = '/content/drive/MyDrive'
UNBAL_SPLIT_DIR = f'{DRIVE_ROOT}/unbalanced_split'

# Train/test feature frames (presumably the un-vectorised inputs — confirm
# against the notebook that wrote them) and the full unbalanced frame.
x_train_unbal = pd.read_csv(f'{UNBAL_SPLIT_DIR}/X_train_unbal.csv')
x_test_unbal = pd.read_csv(f'{UNBAL_SPLIT_DIR}/X_test_unbal.csv')
df_unbalanced = pd.read_csv(f'{DRIVE_ROOT}/df_unbalanced.csv')

# Labels: squeeze() collapses a single-column frame into a 1-D Series.
y_train_unbal = pd.read_csv(f'{UNBAL_SPLIT_DIR}/y_train_unbal.csv').squeeze()
y_test_unbal = pd.read_csv(f'{UNBAL_SPLIT_DIR}/y_test_unbal.csv').squeeze()

# Pre-computed feature matrices that the estimators below are fitted on.
# NOTE(review): joblib.load unpickles its input — only load trusted files.
X_train_unbal_vec = joblib.load(f'{UNBAL_SPLIT_DIR}/X_train_unbal_vec.pkl')
X_test_unbal_vec = joblib.load(f'{UNBAL_SPLIT_DIR}/X_test_unbal_vec.pkl')

# Fail fast if the separately stored artefacts are out of sync; otherwise the
# mismatch only surfaces later as an opaque error inside an estimator's fit().
assert y_train_unbal.ndim == 1, "y_train_unbal must be 1-D (extra column in the CSV?)"
assert y_test_unbal.ndim == 1, "y_test_unbal must be 1-D (extra column in the CSV?)"
assert len(x_train_unbal) == len(y_train_unbal) == X_train_unbal_vec.shape[0], \
    "train features, labels and vectorised matrix have different row counts"
assert len(x_test_unbal) == len(y_test_unbal) == X_test_unbal_vec.shape[0], \
    "test features, labels and vectorised matrix have different row counts"

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import joblib

In [None]:
# Show the (rows, columns) shape of the full unbalanced frame.
df_unbalanced.shape

(10000, 2)

Logistic Regression

In [None]:
# Base estimator. random_state pins the data shuffling used by the liblinear
# solver, so every candidate in the search scores the same on a re-run
# (same seed as the random-forest cell).
lr_unbal = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

# Hyperparameter grid: 4 regularisation strengths x 2 solvers = 8 candidates.
param_grid_lr_unbal = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}

# Exhaustive 5-fold search over the grid, using all available cores.
grid_lr_unbal = GridSearchCV(
    lr_unbal,
    param_grid=param_grid_lr_unbal,
    cv=5,
    scoring='f1_macro',  # better for imbalanced classes
    n_jobs=-1,
    verbose=2
)

grid_lr_unbal.fit(X_train_unbal_vec, y_train_unbal)

print("Best Parameters:", grid_lr_unbal.best_params_)
print("Best CV Score:", grid_lr_unbal.best_score_)

# Best candidate as exposed by the search; used by the evaluation,
# comparison and save cells.
best_lr_unbal = grid_lr_unbal.best_estimator_


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'C': 1, 'solver': 'lbfgs'}
Best CV Score: 0.46109354479956116


In [None]:
from sklearn.metrics import roc_auc_score
# Evaluate the tuned logistic regression on the test matrix.
# Note: y_pred_unbal is reassigned by the Random Forest evaluation cell.
y_pred_unbal = best_lr_unbal.predict(X_test_unbal_vec)
print("\nClassification Report:\n", classification_report(y_test_unbal, y_pred_unbal))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_unbal, y_pred_unbal))
print("\nAccuracy:", accuracy_score(y_test_unbal, y_pred_unbal))

# ROC-AUC is computed from class probabilities, so it is only reported when
# the estimator exposes predict_proba.
if hasattr(best_lr_unbal, "predict_proba"):
    y_prob_unbal = best_lr_unbal.predict_proba(X_test_unbal_vec)
    # Multi-class AUC: one-vs-rest scheme, macro-averaged.
    roc_auc = roc_auc_score(y_test_unbal, y_prob_unbal, multi_class="ovr", average="macro")
    print("ROC-AUC (multi-class, macro avg):", roc_auc)


Classification Report:
               precision    recall  f1-score   support

           1       0.51      0.64      0.57       200
           2       0.35      0.36      0.35       300
           3       0.45      0.38      0.41       500
           4       0.51      0.45      0.47       600
           5       0.51      0.63      0.56       400

    accuracy                           0.47      2000
   macro avg       0.46      0.49      0.47      2000
weighted avg       0.47      0.47      0.47      2000


Confusion Matrix:
 [[127  45  13   7   8]
 [ 56 107  79  40  18]
 [ 42  96 188 122  52]
 [ 14  40 114 267 165]
 [ 10  20  28  90 252]]

Accuracy: 0.4705
ROC-AUC (multi-class, macro avg): 0.7938265209150327


Random Forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Class-weighted forest with a fixed seed.
rf_unbal = RandomForestClassifier(class_weight='balanced', random_state=42)

# Search space: 2 x 3 x 2 = 12 possible combinations.
param_grid_rf_unbal = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10, 30],
    'min_samples_split': [2, 5],
}

# Randomized search: samples 6 of the 12 combinations and scores each with
# 3-fold CV to keep the run short.
rand_rf_unbal = RandomizedSearchCV(
    rf_unbal,
    param_distributions=param_grid_rf_unbal,
    scoring='f1_macro',
    n_iter=6,
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
rand_rf_unbal.fit(X_train_unbal_vec, y_train_unbal)

# Keep the best candidate from the search, then report the search outcome.
best_rand_rf_unbal = rand_rf_unbal.best_estimator_
print("Best Random Forest Params:", rand_rf_unbal.best_params_)
print("Best CV Score:", rand_rf_unbal.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Random Forest Params: {'n_estimators': 300, 'min_samples_split': 2, 'max_depth': 30}
Best CV Score: 0.4030148015932707


In [None]:
from sklearn.metrics import classification_report

# Score the tuned random forest on the test matrix.
y_pred_unbal = best_rand_rf_unbal.predict(X_test_unbal_vec)

# Heading line followed by the per-class precision / recall / F1 table.
print(
    "Classification Report for Random Forest (Imbalanced Data):",
    classification_report(y_test_unbal, y_pred_unbal),
    sep="\n",
)


Classification Report for Random Forest (Imbalanced Data):
              precision    recall  f1-score   support

           1       0.46      0.70      0.55       200
           2       0.33      0.07      0.12       300
           3       0.39      0.35      0.37       500
           4       0.45      0.42      0.43       600
           5       0.43      0.65      0.51       400

    accuracy                           0.42      2000
   macro avg       0.41      0.44      0.40      2000
weighted avg       0.41      0.42      0.40      2000



SVM

In [None]:
# probability=True enables predict_proba via an internal, randomised
# cross-validated calibration; random_state pins that shuffling so the fitted
# probability model is repeatable. The calibration adds noticeable fit time
# to every candidate in the search.
svm_unbal = SVC(probability=True, class_weight='balanced', random_state=42)

# Hyperparameter grid: 3 regularisation strengths x 2 kernels = 6 candidates.
param_grid_svm_unbal = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

# Exhaustive 5-fold search, scored on macro F1 like the other model searches.
grid_svm_unbal = GridSearchCV(
    svm_unbal,
    param_grid=param_grid_svm_unbal,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2
)

grid_svm_unbal.fit(X_train_unbal_vec, y_train_unbal)
print("Best SVM Params:", grid_svm_unbal.best_params_)
# Report the CV score as the logistic-regression and random-forest cells do,
# so the three searches can be compared directly.
print("Best CV Score:", grid_svm_unbal.best_score_)
best_svm_unbal = grid_svm_unbal.best_estimator_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best SVM Params: {'C': 1, 'kernel': 'linear'}


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Score the tuned SVM on the test matrix.
y_pred_svm_unbal = best_svm_unbal.predict(X_test_unbal_vec)

# Heading line followed by the per-class precision / recall / F1 table.
print(
    "Classification Report for SVM (Imbalanced Data):",
    classification_report(y_test_unbal, y_pred_svm_unbal),
    sep="\n",
)

Naive Bayes

In [None]:
# Multinomial Naive Bayes with default settings (no hyperparameter search);
# fit() returns the estimator itself, so construction and training chain.
nb_unbal = MultinomialNB().fit(X_train_unbal_vec, y_train_unbal)

# Predict the test matrix and print the per-class report.
y_pred_nb_unbal = nb_unbal.predict(X_test_unbal_vec)
print(
    "Naive Bayes Report:\n",
    classification_report(y_test_unbal, y_pred_nb_unbal),
)

Comparing models

In [None]:
# Fitted estimators to compare, keyed by display name.
models_unbal = {
    "Logistic Regression": best_lr_unbal,
    "Random Forest": best_rand_rf_unbal,
    "SVM": best_svm_unbal,
    "Naive Bayes": nb_unbal
}

# Score every model on the same test matrix.
for name, model in models_unbal.items():
    y_pred = model.predict(X_test_unbal_vec)
    acc = accuracy_score(y_test_unbal, y_pred)
    # Build the report once. zero_division=0 keeps the scores of a class that
    # is never predicted at 0 without emitting a warning per model.
    report = classification_report(
        y_test_unbal, y_pred, output_dict=True, zero_division=0
    )
    f1 = report['weighted avg']['f1-score']
    # Macro F1 is the metric the searches above optimised; unlike accuracy
    # and weighted F1 it gives every class equal weight.
    macro_f1 = report['macro avg']['f1-score']
    print(
        f"\n{name} → Accuracy: {acc:.4f}, Weighted F1: {f1:.4f}, "
        f"Macro F1: {macro_f1:.4f}"
    )

In [None]:
import joblib

# Save Model_B (trained on imbalanced data)
# The estimator persisted here is the tuned logistic regression (best_lr_unbal).
joblib.dump(best_lr_unbal, '/content/drive/MyDrive/ModelB_imbalanced.pkl')
print("✅ Model_B saved successfully (imbalanced).")

✅ Model_B saved successfully (imbalanced).


In [None]:
# Load the vectorizer stored in the unbalanced-split folder (presumably the
# one that produced the *_unbal_vec matrices — confirm against the notebook
# that created them).
# NOTE(review): joblib.load unpickles its input — only load trusted files.
vectorizer_unbal = joblib.load('/content/drive/MyDrive/unbalanced_split/vectorizer_unbal.pkl')