In [1]:
# 1. Imports
import joblib
import pandas as pd
import numpy as np
import time
from pathlib import Path
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# 2. Setup
# Benchmark configuration: which vectoriser output to read and which
# feature-count variants of it to evaluate ("full" is the uncapped variant's
# file-name suffix).
vectorizer_type = "tfidf"
max_features_list = [75000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, "full"]

# Input directory (pre-computed matrices and labels) and output directories
# (reports and persisted models), all relative to the notebook location.
data_path = Path("../data_preparation")
report_path = Path("../reports/benchmark_100k")
model_path = Path("../models/benchmark_100k")

# Make sure both output directories exist before anything is written to them.
for out_dir in (report_path, model_path):
    out_dir.mkdir(parents=True, exist_ok=True)

# Load labels
# The three artefacts are read in the same order as before: train labels,
# test labels, then the fitted label encoder.
y_train, y_test, label_encoder = (
    joblib.load(data_path / file_name)
    for file_name in ("y_train.joblib", "y_test.joblib", "label_encoder.joblib")
)

In [3]:
# 3. Define model
# `model_name` is used in log lines, result rows and output file names;
# `model` is the single estimator instance that the benchmark loop refits
# for each feature-count setting.
model_name = "SVM_Linear"
# random_state is pinned so repeated runs give the same numbers when
# dual="auto" ends up selecting the dual solver (per the scikit-learn docs the
# seed only affects the dual coordinate-descent path).
model = LinearSVC(dual="auto", random_state=42)

In [16]:
# 4. Benchmark loop
# For every feature-count setting: load the pre-vectorised train/test matrices,
# refit `model`, score it on the test split, and persist the artefacts
# (fitted model, confusion-matrix PNG, text classification report).

# Accuracy gates for persisting artefacts. The values are the thresholds that
# were previously hard-coded inline; naming them keeps the two gates visible
# and tunable in one place.
SAVE_MODEL_MIN_ACC = 0.845   # fitted model is dumped at or above this accuracy
SAVE_REPORT_MIN_ACC = 0.84   # classification report is written at or above this

# One dict per completed run. Re-initialised here so that re-running the cell
# does not append duplicate rows to an earlier run's list.
results = []

for n in max_features_list:
    # Matrices are looked up by naming convention; a missing pair is skipped
    # rather than aborting the whole benchmark.
    try:
        X_train = joblib.load(data_path / f"X_train_{vectorizer_type}_{n}.joblib")
        X_test = joblib.load(data_path / f"X_test_{vectorizer_type}_{n}.joblib")
    except FileNotFoundError:
        print(f"Missing: {vectorizer_type} with {n} features — skipping")
        continue

    print(f"Running {model_name} on {vectorizer_type.upper()} ({n} features)...")
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    # The timed span covers fit, predict and both metric computations.
    duration = time.perf_counter() - start

    results.append({
        "vectorizer": vectorizer_type,
        "features": n,
        "model": model_name,
        "accuracy": acc,
        "f1_score": f1,
        "time_sec": duration
    })

    # Save top models
    # Dumped immediately, while `model` still holds this iteration's fit.
    if acc >= SAVE_MODEL_MIN_ACC:
        joblib.dump(model, model_path / f"{model_name}_{vectorizer_type}_{n}.joblib")
        print(f"✅ Saved: {model_name} with acc={acc:.4f}")

    # Save confusion matrix (raw counts, written for every completed run)
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test, y_pred,
        display_labels=label_encoder.classes_,
        xticks_rotation=45,
        cmap="Blues",
        normalize=None
    )
    disp.ax_.set_title(f"{model_name} ({vectorizer_type}, {n})")
    # Work on the display's own figure instead of pyplot's implicit "current
    # figure", so layout, save and close always target the plot created above.
    fig = disp.figure_
    fig.tight_layout()
    fig.savefig(report_path / f"cm_{model_name}_{vectorizer_type}_{n}.png")
    plt.close(fig)  # release the figure; one is created per iteration

    # Save classification report
    if acc >= SAVE_REPORT_MIN_ACC:
        report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
        # Explicit encoding so class names are written identically on every platform.
        with open(report_path / f"report_{model_name}_{vectorizer_type}_{n}.txt", "w", encoding="utf-8") as f:
            f.write(f"Model: {model_name}\nVectorizer: {vectorizer_type}\nFeatures: {n}\n\n")
            f.write(report)

Running SVM_Linear on TFIDF (75000 features)...
✅ Saved: SVM_Linear with acc=0.8466
Running SVM_Linear on TFIDF (100000 features)...
✅ Saved: SVM_Linear with acc=0.8481
Running SVM_Linear on TFIDF (150000 features)...
✅ Saved: SVM_Linear with acc=0.8489
Running SVM_Linear on TFIDF (200000 features)...
✅ Saved: SVM_Linear with acc=0.8503
Running SVM_Linear on TFIDF (250000 features)...
✅ Saved: SVM_Linear with acc=0.8504
Running SVM_Linear on TFIDF (300000 features)...
✅ Saved: SVM_Linear with acc=0.8497
Running SVM_Linear on TFIDF (350000 features)...
✅ Saved: SVM_Linear with acc=0.8492
Running SVM_Linear on TFIDF (400000 features)...
✅ Saved: SVM_Linear with acc=0.8474
Running SVM_Linear on TFIDF (full features)...
✅ Saved: SVM_Linear with acc=0.8471


In [17]:
# 5. Save results
# Build the results table with an explicit column list (same keys and order as
# the dicts appended by the benchmark loop). With an empty `results` list —
# e.g. every feature matrix was missing — this still yields a frame that has an
# "accuracy" column, so the sort below does not raise KeyError and a
# header-only CSV is written.
result_columns = ["vectorizer", "features", "model", "accuracy", "f1_score", "time_sec"]
results_df = (
    pd.DataFrame(results, columns=result_columns)
    .sort_values(by="accuracy", ascending=False)  # best run first
)
results_df.to_csv(report_path / "benchmark_100k_results.csv", index=False)

In [18]:
# 6. Display top 10
# results_df is already sorted by accuracy (descending), so the first ten
# rows by position are the ten best runs.
results_df.iloc[:10]


Unnamed: 0,vectorizer,features,model,accuracy,f1_score,time_sec
4,tfidf,250000,SVM_Linear,0.850388,0.84991,1.513547
3,tfidf,200000,SVM_Linear,0.850252,0.849815,1.388531
5,tfidf,300000,SVM_Linear,0.849707,0.849207,1.544441
6,tfidf,350000,SVM_Linear,0.849162,0.848605,1.678411
2,tfidf,150000,SVM_Linear,0.848889,0.84839,1.198627
1,tfidf,100000,SVM_Linear,0.848072,0.847659,0.987768
7,tfidf,400000,SVM_Linear,0.847391,0.84677,1.883379
8,tfidf,full,SVM_Linear,0.847118,0.84649,1.90208
0,tfidf,75000,SVM_Linear,0.846573,0.846209,1.105868
