In [1]:
# 1. Imports
import joblib
import pandas as pd
import numpy as np
import time
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 2. Setup
# Benchmark grid: every vectorizer type is paired with every feature count.
vectorizer_types = ["tfidf", "count"]
max_features_list = [30000, 40000, 50000]  # new feature counts

# Input directory (label arrays and feature matrices) and output directories
# (reports/figures and fitted models), all relative to the working directory.
parent_dir = Path("..")
data_path = parent_dir / "data_preparation"
report_path = parent_dir / "reports" / "benchmark_30k"
model_path = parent_dir / "models" / "benchmark_30k"

# Create both output directories up front; no error if they already exist.
for output_dir in (report_path, model_path):
    output_dir.mkdir(parents=True, exist_ok=True)

# Load labels: train labels, test labels, and the label encoder, in that order.
label_files = ("y_train.joblib", "y_test.joblib", "label_encoder.joblib")
y_train, y_test, label_encoder = (
    joblib.load(data_path / file_name) for file_name in label_files
)

In [3]:
# 3. Define models. XGBoost and Random Forest removed for poor performance
# Maps the display name used in logs and file names to the estimator that the
# benchmark loop fits; insertion order is the order in which models are run.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000),
    NaiveBayes=MultinomialNB(),
    SVM_Linear=LinearSVC(dual="auto"),
)

In [None]:
# 4. Benchmark loop
# For every (vectorizer, feature-count) pair: load the matching feature
# matrices, fit each model, score it on the test split, and write artefacts
# (confusion-matrix PNG always; fitted model and text report for top runs).

# Minimum test accuracy for a run to have its fitted model and text report
# saved. Defined once so the two save decisions below cannot drift apart.
# The value 0.83 is carried over from the original notebook, whose note
# attributes it to a first benchmarking pass.
SAVE_ACC_THRESHOLD = 0.83

# One record (dict) per fitted model; turned into a DataFrame in a later cell.
results = []

for vtype in vectorizer_types:
    for n in max_features_list:
        try:
            X_train = joblib.load(data_path / f"X_train_{vtype}_{n}.joblib")
            X_test = joblib.load(data_path / f"X_test_{vtype}_{n}.joblib")
        except FileNotFoundError:
            # A missing matrix only skips this grid point, not the whole run.
            print(f"Missing: {vtype} with {n} features — skipping")
            continue

        for model_name, model in models.items():
            print(f"Running {model_name} on {vtype.upper()} ({n} features)...")

            # perf_counter is a monotonic, high-resolution clock, so short
            # fits are timed reliably and system clock adjustments cannot
            # distort the measurement.
            start = time.perf_counter()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # Stop the clock before scoring so time_sec covers fit + predict
            # only and is not inflated by metric computation.
            duration = time.perf_counter() - start

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average="weighted")
            is_top_run = acc >= SAVE_ACC_THRESHOLD

            results.append({
                "vectorizer": vtype,
                "features": n,
                "model": model_name,
                "accuracy": acc,
                "f1_score": f1,
                "time_sec": duration
            })

            # Save top models
            if is_top_run:
                model_file = model_path / f"{model_name}_{vtype}_{n}.joblib"
                joblib.dump(model, model_file)
                print(f"✅ Saved: {model_name} ({vtype}, {n}) with acc={acc:.4f}")
            else:
                print(f"⏩ Skipped saving: {model_name} ({vtype}, {n}) with acc={acc:.4f}")

            # Save confusion matrix (raw counts, since normalize=None).
            disp = ConfusionMatrixDisplay.from_predictions(
                y_test, y_pred,
                display_labels=label_encoder.classes_,
                xticks_rotation=45,
                cmap="Blues",
                normalize=None
            )
            disp.ax_.set_title(f"{model_name} ({vtype}, {n})")
            # Operate on the figure owned by this display rather than on
            # pyplot's implicit "current figure", and close exactly that
            # figure so the loop does not accumulate open figures.
            disp.figure_.tight_layout()
            disp.figure_.savefig(report_path / f"cm_{model_name}_{vtype}_{n}.png")
            plt.close(disp.figure_)

            # Save classification report
            if is_top_run:
                report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
                # Explicit UTF-8 keeps the file independent of the platform's
                # default locale encoding.
                with open(report_path / f"report_{model_name}_{vtype}_{n}.txt", "w", encoding="utf-8") as f:
                    f.write(f"Model: {model_name}\nVectorizer: {vtype}\nFeatures: {n}\n\n")
                    f.write(report)


Running LogisticRegression on TFIDF (30000 features)...
Running NaiveBayes on TFIDF (30000 features)...
Running SVM_Linear on TFIDF (30000 features)...
Running LogisticRegression on TFIDF (40000 features)...
Running NaiveBayes on TFIDF (40000 features)...
Running SVM_Linear on TFIDF (40000 features)...
Running LogisticRegression on TFIDF (50000 features)...
Running NaiveBayes on TFIDF (50000 features)...
Running SVM_Linear on TFIDF (50000 features)...
Running LogisticRegression on COUNT (30000 features)...
Running NaiveBayes on COUNT (30000 features)...
Running SVM_Linear on COUNT (30000 features)...
Running LogisticRegression on COUNT (40000 features)...
Running NaiveBayes on COUNT (40000 features)...
Running SVM_Linear on COUNT (40000 features)...
Running LogisticRegression on COUNT (50000 features)...
Running NaiveBayes on COUNT (50000 features)...
Running SVM_Linear on COUNT (50000 features)...


In [8]:
# 5. Save results
# Column order of the results table and CSV; these are the keys of each record
# appended to `results` by the benchmark loop.
result_columns = ["vectorizer", "features", "model", "accuracy", "f1_score", "time_sec"]

# Passing `columns` keeps the schema even when `results` is empty (for example
# when every feature matrix was missing and all grid points were skipped), so
# the sort below cannot fail with a KeyError on "accuracy".
results_df = (
    pd.DataFrame(results, columns=result_columns)
    .sort_values(by="accuracy", ascending=False)  # best runs first
)
results_df.to_csv(report_path / "benchmark_30k_results.csv", index=False)

print("Benchmarking complete. Results saved to CSV and models/reports folder.")

Benchmarking complete. Results saved to CSV and models/reports folder.


In [9]:
# 6. Display top 10
# results_df was sorted by accuracy (descending) before being written to CSV,
# so the first ten rows are the ten most accurate runs.

results_df.head(10)


Unnamed: 0,vectorizer,features,model,accuracy,f1_score,time_sec
8,tfidf,50000,SVM_Linear,0.845347,0.845026,1.271147
5,tfidf,40000,SVM_Linear,0.842485,0.842199,1.086195
2,tfidf,30000,SVM_Linear,0.839896,0.839559,0.945108
6,tfidf,50000,LogisticRegression,0.830903,0.830301,13.866446
16,count,50000,NaiveBayes,0.830495,0.829518,0.033817
3,tfidf,40000,LogisticRegression,0.829677,0.829157,13.323858
0,tfidf,30000,LogisticRegression,0.82886,0.828365,7.222671
13,count,40000,NaiveBayes,0.828587,0.827723,0.044071
15,count,50000,LogisticRegression,0.824227,0.824247,13.790526
12,count,40000,LogisticRegression,0.823545,0.823592,11.75041
