# 1. Load Data
---

In [7]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.model_selection import train_test_split

# Load the saved sparse feature matrix and the "label" column from disk.
X = sparse.load_npz("../data/clean/X_vectorized.npz")
labels_frame = pd.read_csv("../data/clean/y_labels.csv")
y = labels_frame["label"].values

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 2. Train Several Classifiers
---

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Seed shared by every estimator that has a stochastic component, so that a
# Restart & Run All reproduces the same fitted models and the same scores.
SEED = 42

# Candidate classifiers, keyed by the display name used in the results table.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # LinearSVC uses random numbers in its solver; pin them.
    "Linear SVM": LinearSVC(random_state=SEED),
    "Naive Bayes": MultinomialNB(),
    # Bootstrap sampling and feature sub-sampling are random; pin them.
    "Random Forest": RandomForestClassifier(random_state=SEED),
}

# Fit each candidate on the training split and keep the fitted estimator under
# the same name for the evaluation step.
trained_models = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    trained_models[name] = model


Training Logistic Regression...
Training Linear SVM...
Training Naive Bayes...
Training Random Forest...


# 3. Evaluate Each Model
---

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Column name -> scoring function. The insertion order here is the column
# order of the results table.
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-score": f1_score,
}

# One row per fitted model, scored on the held-out test split.
results = []

for name, model in trained_models.items():
    y_pred = model.predict(X_test)

    row = {"Model": name}
    for metric_name, metric_fn in metric_fns.items():
        row[metric_name] = metric_fn(y_test, y_pred)
    results.append(row)

results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  Precision    Recall  F1-score
0  Logistic Regression  0.985315   0.973660  0.998138  0.985747
1           Linear SVM  0.990684   0.984974  0.996896  0.990899
2          Naive Bayes  0.984683   0.986609  0.983240  0.984921
3        Random Forest  0.987052   0.984269  0.990379  0.987314


# 4. Save the Best Model
---

In [10]:
from pathlib import Path

import joblib

# Metric used to rank the candidates; it must be a column of results_df.
SELECTION_METRIC = "F1-score"
MODEL_PATH = "../models/best_model.pkl"

# Pick the winner from the evaluation table instead of hard-coding its name,
# so the saved model always matches the scores produced by the current run.
# idxmax returns the first row on ties.
best_model_name = results_df.loc[results_df[SELECTION_METRIC].idxmax(), "Model"]
best_model = trained_models[best_model_name]

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, MODEL_PATH)
print("Model saved!")

Model saved!
