In [2]:
# 1. Imports
import joblib
import pandas as pd
from pathlib import Path
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np

In [3]:
# 2. Setup
# Experiment identifiers: they select which pre-computed feature matrices are
# loaded below and are embedded in every output file name in later cells.
vectorizer_type = "tfidf"
feature_count = 200000

# Input directory (prepared data) and output directories (reports, models),
# all relative to the notebook's working directory.
data_path = Path("../data_preparation")
report_path = Path("../reports/tuning_svm")
model_path = Path("../models/tuning_svm")

# Create both output directories up front; existing directories are left as-is.
for output_dir in (report_path, model_path):
    output_dir.mkdir(parents=True, exist_ok=True)

# Load data
# NOTE: joblib.load unpickles arbitrary objects, so these artifacts should only
# come from a trusted data-preparation step.
y_train, y_test, label_encoder = (
    joblib.load(data_path / artifact_name)
    for artifact_name in ("y_train.joblib", "y_test.joblib", "label_encoder.joblib")
)
# Feature matrices are keyed by vectorizer type and feature count.
X_train = joblib.load(data_path / f"X_train_{vectorizer_type}_{feature_count}.joblib")
X_test = joblib.load(data_path / f"X_test_{vectorizer_type}_{feature_count}.joblib")


In [4]:
# 3. Define parameter grid
# Search space handed to GridSearchCV: 4 values of C x 2 class_weight settings,
# with penalty and dual each fixed to a single value (8 candidates in total).
param_grid = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
    "class_weight": [None, "balanced"],
    "dual": ["auto"],
}

# LinearSVC shuffles the data during dual coordinate descent, so without a fixed
# seed the fitted weights (and therefore the CV scores) can differ between runs.
# Pinning random_state keeps "Restart & Run All" reproducible; the seed has no
# effect whenever dual="auto" resolves to the primal solver.
RANDOM_STATE = 42

# Base estimator; GridSearchCV clones it for every candidate, so the seed is
# carried into each fit.
model = LinearSVC(random_state=RANDOM_STATE)

In [5]:
# 4. Grid Search with Cross-Validation
# Search settings kept in one place: 5-fold CV, accuracy as the selection
# metric, all available cores, and per-fit progress logging.
search_options = {
    "cv": 5,
    "scoring": "accuracy",
    "n_jobs": -1,
    "verbose": 2,
}

# Exhaustive search over every combination in param_grid.
grid = GridSearchCV(estimator=model, param_grid=param_grid, **search_options)

# Kept as the last expression so the notebook shows the fitted search object.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ...C=0.01, class_weight=None, dual=auto, penalty=l2; total time=   2.9s
[CV] END ...C=0.01, class_weight=None, dual=auto, penalty=l2; total time=   3.0s
[CV] END ...C=0.01, class_weight=None, dual=auto, penalty=l2; total time=   3.0s
[CV] END C=0.01, class_weight=balanced, dual=auto, penalty=l2; total time=   3.2s
[CV] END ...C=0.01, class_weight=None, dual=auto, penalty=l2; total time=   3.2s
[CV] END ...C=0.01, class_weight=None, dual=auto, penalty=l2; total time=   3.2s
[CV] END C=0.01, class_weight=balanced, dual=auto, penalty=l2; total time=   3.2s
[CV] END C=0.01, class_weight=balanced, dual=auto, penalty=l2; total time=   3.3s
[CV] END C=0.01, class_weight=balanced, dual=auto, penalty=l2; total time=   3.3s
[CV] END C=0.01, class_weight=balanced, dual=auto, penalty=l2; total time=   3.3s
[CV] END ....C=0.1, class_weight=None, dual=auto, penalty=l2; total time=   4.1s
[CV] END ....C=0.1, class_weight=None, dual=

In [6]:
# 5. Save best model
# best_estimator_ is the candidate GridSearchCV selected and refit.
best_model = grid.best_estimator_

# The file name encodes the feature configuration the model was tuned on.
best_model_file = model_path / f"LinearSVC_best_{vectorizer_type}_{feature_count}.joblib"

# Kept as the last expression so the notebook shows the list of written files.
joblib.dump(best_model, best_model_file)


['../models/tuning_svm/LinearSVC_best_tfidf_200000.joblib']

In [7]:
# 6. Evaluate on test set
# Predict with the tuned model, then compute the two headline metrics.
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")  # support-weighted mean of per-class F1

# Assemble the summary and emit it in one call; the newline separator yields
# the same text as printing each piece individually.
evaluation_lines = (
    f"Test Accuracy: {acc:.4f}",
    f"Test F1 Score: {f1:.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
)
print(*evaluation_lines, sep="\n")


Test Accuracy: 0.8485
Test F1 Score: 0.8482

Classification Report:
                    precision    recall  f1-score   support

Art and Literature       0.84      0.83      0.84      1150
     Entertainment       0.87      0.89      0.88      1153
         Geography       0.79      0.80      0.79       891
           History       0.83      0.80      0.82       962
         Lifestyle       0.78      0.75      0.76       714
             Music       0.85      0.87      0.86       743
Science and Nature       0.89      0.89      0.89       837
             Sport       0.93      0.94      0.93       889

          accuracy                           0.85      7339
         macro avg       0.85      0.85      0.85      7339
      weighted avg       0.85      0.85      0.85      7339



In [8]:
# 7. Save report
# Write the selected hyperparameters, headline test metrics, and the per-class
# classification report to a text file named after the feature configuration.
# The encoding is set explicitly so the file is UTF-8 regardless of the platform
# locale (class names come from label_encoder and may not be plain ASCII).
with open(
    report_path / f"svm_tuning_report_{vectorizer_type}_{feature_count}.txt",
    "w",
    encoding="utf-8",
) as f:
    f.write(f"Best Parameters: {grid.best_params_}\n")
    f.write(f"Test Accuracy: {acc:.4f}\n")
    f.write(f"Test F1 Score: {f1:.4f}\n\n")
    f.write(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


In [9]:
# 8. Save CV results to CSV
# cv_results_ is GridSearchCV's dict of per-candidate results; a DataFrame
# turns it into one row per parameter combination.
cv_results = pd.DataFrame(grid.cv_results_)

# Stored next to the text report, without the DataFrame index column.
cv_results_file = report_path / f"svm_tuning_cv_results_{vectorizer_type}_{feature_count}.csv"
cv_results.to_csv(cv_results_file, index=False)

print("Tuning complete. Model and reports saved.")

Tuning complete. Model and reports saved.
