In [18]:
# 1. Imports
import joblib
import pandas as pd
import numpy as np
import time
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# 2. Setup
# Grid of pre-computed feature matrices to benchmark: every vectorizer kind is
# combined with every vocabulary size.
vectorizer_types = ["tfidf", "count"]
max_features_list = [5000, 10000, 20000]

# Input and output folders, resolved relative to the notebook's working directory.
data_path = Path("../data_preparation")
report_path = Path("../reports")
model_path = Path("../models")

# Create both output folders up front (no error if they already exist).
for out_dir in (report_path, model_path):
    out_dir.mkdir(parents=True, exist_ok=True)

# Load labels and the label encoder object.
# NOTE(review): joblib.load unpickles arbitrary objects — only point this at
# files produced by a trusted pipeline.
y_train, y_test, label_encoder = (
    joblib.load(data_path / file_name)
    for file_name in ("y_train.joblib", "y_test.joblib", "label_encoder.joblib")
)

In [21]:
# 3. Define models to benchmark
# One fixed seed for every estimator that draws random numbers, so a
# Restart & Run All reproduces the same scores.
RANDOM_STATE = 42

# Display name -> unfitted estimator instance.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    "NaiveBayes": MultinomialNB(),
    "SVM_Linear": LinearSVC(dual="auto", random_state=RANDOM_STATE),
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases; kept as-is here — confirm against the installed version
    # before removing it.
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric="mlogloss",
        verbosity=0,
        random_state=RANDOM_STATE,
    ),
}



In [22]:
# 4. Benchmark loop
# Fits every model on every (vectorizer, feature count) combination, records
# accuracy / weighted F1 / elapsed time, writes a confusion-matrix image for
# each run, and persists the model plus a text report for runs that reach the
# accuracy threshold.

# Minimum test accuracy for saving a fitted model and its classification report.
# NOTE(review): the original comment says this value came from a first
# benchmarking pass — confirm it is still the intended cut-off.
SAVE_ACC_THRESHOLD = 0.81

results = []  # one metrics record (dict) per run

for vtype in vectorizer_types:
    for n in max_features_list:
        # A combination whose feature files are absent is reported and skipped
        # rather than aborting the whole benchmark.
        try:
            X_train = joblib.load(data_path / f"X_train_{vtype}_{n}.joblib")
            X_test = joblib.load(data_path / f"X_test_{vtype}_{n}.joblib")
        except FileNotFoundError:
            print(f"Missing: {vtype} with {n} features — skipping")
            continue

        for model_name, model in models.items():
            print(f"Running {model_name} on {vtype.upper()} ({n} features)...")

            # Time fit + predict only, using a monotonic clock so the
            # measurement is not affected by system clock adjustments.
            start = time.perf_counter()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            duration = time.perf_counter() - start

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average="weighted")

            results.append({
                "vectorizer": vtype,
                "features": n,
                "model": model_name,
                "accuracy": acc,
                "f1_score": f1,
                "time_sec": duration
            })

            # Top-performing runs: persist the fitted model and its report.
            # The model is dumped immediately because the same estimator
            # instance is refitted on the next combination.
            if acc >= SAVE_ACC_THRESHOLD:
                model_file = model_path / f"{model_name}_{vtype}_{n}.joblib"
                joblib.dump(model, model_file)
                print(f"✅ Saved: {model_name} ({vtype}, {n}) with acc={acc:.4f}")

                report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
                report_file = report_path / f"report_{model_name}_{vtype}_{n}.txt"
                # Explicit encoding so the file does not depend on the platform default.
                with open(report_file, "w", encoding="utf-8") as fh:
                    fh.write(f"Model: {model_name}\nVectorizer: {vtype}\nFeatures: {n}\n\n")
                    fh.write(report)
            else:
                print(f"⏩ Skipped saving: {model_name} ({vtype}, {n}) with acc={acc:.4f}")

            # Save the confusion matrix (raw counts) for every run.
            disp = ConfusionMatrixDisplay.from_predictions(
                y_test, y_pred,
                display_labels=label_encoder.classes_,
                xticks_rotation=45,
                cmap="Blues",
                normalize=None
            )
            disp.ax_.set_title(f"{model_name} ({vtype}, {n})")
            # Operate on this run's figure explicitly and close it afterwards so
            # figures do not accumulate in memory across the loop.
            disp.figure_.tight_layout()
            disp.figure_.savefig(report_path / f"cm_{model_name}_{vtype}_{n}.png")
            plt.close(disp.figure_)



Running LogisticRegression on TFIDF (5000 features)...
✅ Saved: LogisticRegression (tfidf, 5000) with acc=0.8148
Running RandomForest on TFIDF (5000 features)...
⏩ Skipped saving: RandomForest (tfidf, 5000) with acc=0.7645
Running NaiveBayes on TFIDF (5000 features)...
⏩ Skipped saving: NaiveBayes (tfidf, 5000) with acc=0.8000
Running SVM_Linear on TFIDF (5000 features)...
⏩ Skipped saving: SVM_Linear (tfidf, 5000) with acc=0.8098
Running XGBoost on TFIDF (5000 features)...
⏩ Skipped saving: XGBoost (tfidf, 5000) with acc=0.7865
Running LogisticRegression on TFIDF (10000 features)...
✅ Saved: LogisticRegression (tfidf, 10000) with acc=0.8264
Running RandomForest on TFIDF (10000 features)...
⏩ Skipped saving: RandomForest (tfidf, 10000) with acc=0.7705
Running NaiveBayes on TFIDF (10000 features)...
✅ Saved: NaiveBayes (tfidf, 10000) with acc=0.8135
Running SVM_Linear on TFIDF (10000 features)...
✅ Saved: SVM_Linear (tfidf, 10000) with acc=0.8231
Running XGBoost on TFIDF (10000 features

In [23]:
# 5. Save results to DataFrame and CSV

# Columns are listed explicitly (same keys and order as the records appended
# in the benchmark loop) so that an empty `results` list — every feature file
# missing — still yields a frame with an "accuracy" column instead of raising
# KeyError in sort_values. Keep this list in sync with the record keys.
result_columns = ["vectorizer", "features", "model", "accuracy", "f1_score", "time_sec"]

# Best runs first; the original row index is kept.
results_df = pd.DataFrame(results, columns=result_columns).sort_values(
    by="accuracy", ascending=False
)

results_df.to_csv(report_path / "model_benchmark_results.csv", index=False)
print("Benchmarking complete. Results saved to CSV and models/reports folder.")


Benchmarking complete. Results saved to CSV and models/reports folder.


In [24]:
# 6. Show the first ten rows of the results table

results_df.iloc[:10]


Unnamed: 0,vectorizer,features,model,accuracy,f1_score,time_sec
13,tfidf,20000,SVM_Linear,0.832947,0.832552,2.094904
10,tfidf,20000,LogisticRegression,0.829132,0.828651,5.704407
5,tfidf,10000,LogisticRegression,0.826407,0.825905,3.873461
8,tfidf,10000,SVM_Linear,0.823137,0.82276,0.790426
27,count,20000,NaiveBayes,0.822047,0.821386,0.03242
25,count,20000,LogisticRegression,0.819594,0.819697,7.246189
12,tfidf,20000,NaiveBayes,0.816596,0.814855,0.032599
0,tfidf,5000,LogisticRegression,0.814825,0.814231,2.619142
22,count,10000,NaiveBayes,0.814689,0.814198,0.016934
7,tfidf,10000,NaiveBayes,0.813462,0.812042,0.020163
