In [1]:
import re

import joblib
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, f1_score, classification_report


In [2]:
# Load the test split and keep only rows that have both a text and a label.
# The trailing .copy() gives an independent frame rather than a view-like
# result of the filter.
test_df = (
    pd.read_parquet("data/test.parquet")
    .dropna(subset=["text", "label"])
    .copy()
)

def clean_text(text: object) -> str:
    """Normalise the whitespace of a single text sample.

    Parameters
    ----------
    text
        The raw sample. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    str
        ``""`` for non-string input; otherwise ``text`` with leading/trailing
        whitespace removed and every run of whitespace characters collapsed
        to a single space.

    Notes
    -----
    ``re`` is imported once in the notebook's import cell instead of inside
    this function, so the import statement is not re-executed for every row
    when this is mapped over the full test set.
    NOTE(review): presumably this mirrors the preprocessing applied at
    training time -- confirm against the training notebook.
    """
    if not isinstance(text, str):
        return ""
    # Strip first so the collapse below cannot leave a leading/trailing space.
    return re.sub(r"\s+", " ", text.strip())

# Features are the whitespace-normalised texts (coerced to str first);
# targets are the label column, taken as-is.
X_test, y_test = (
    test_df["text"].astype(str).map(clean_text),
    test_df["label"],
)

# Quick sanity check: how many rows survived and which labels are present.
print("Test samples:", len(X_test))
print("Labels:", y_test.unique())


Test samples: 734039
Labels: ['Neutral' 'Highly Biased' 'Slightly Biased']


In [3]:
# Display name -> fitted model, loaded in the order they are reported below.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
models = {
    label: joblib.load(path)
    for label, path in (
        ("TF-IDF (Word only)", "models/tfidf_logreg.joblib"),
        ("TF-IDF (Word + Char)", "models/tfidf_word_char_logreg.joblib"),
    )
}

# Named handles to the same two objects, used by the per-class reports.
baseline_model = models["TF-IDF (Word only)"]
word_char_model = models["TF-IDF (Word + Char)"]


In [4]:
# One summary row (a dict) per model; turned into a DataFrame in a later cell.
results = []

for name, model in models.items():
    # Predict once per model and reuse the predictions for every metric.
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Same F1 computation under the two averaging schemes.
    f1_macro, f1_weighted = (
        f1_score(y_test, y_pred, average=scheme)
        for scheme in ("macro", "weighted")
    )

    row = {
        "Model": name,
        "Accuracy": acc,
        "Macro F1": f1_macro,
        "Weighted F1": f1_weighted,
    }
    results.append(row)

    # Echo the row immediately so progress is visible while the loop runs.
    print(f"\n=== {name} ===")
    print("Accuracy:", row["Accuracy"])
    print("Macro F1:", row["Macro F1"])
    print("Weighted F1:", row["Weighted F1"])



=== TF-IDF (Word only) ===
Accuracy: 0.7958078521713424
Macro F1: 0.7734641086150816
Weighted F1: 0.797294331120266

=== TF-IDF (Word + Char) ===
Accuracy: 0.8183883962568747
Macro F1: 0.8003349267393717
Weighted F1: 0.8206418943048736


In [5]:
# Side-by-side comparison table: one row per evaluated model, one column per
# key of the result dicts collected in the evaluation loop.
results_df = pd.DataFrame(data=results)
results_df


Unnamed: 0,Model,Accuracy,Macro F1,Weighted F1
0,TF-IDF (Word only),0.795808,0.773464,0.797294
1,TF-IDF (Word + Char),0.818388,0.800335,0.820642


In [6]:
# Per-class precision / recall / F1 for each model, printed to 4 decimals.
# Each model is re-run on the full test set here before its report is built.
for heading, clf in (
    ("\n=== Class-wise performance: Baseline ===", baseline_model),
    ("\n=== Class-wise performance: Word + Char ===", word_char_model),
):
    print(heading)
    print(classification_report(y_test, clf.predict(X_test), digits=4))



=== Class-wise performance: Baseline ===
                 precision    recall  f1-score   support

  Highly Biased     0.7531    0.8169    0.7837    148777
        Neutral     0.9013    0.8610    0.8807    377055
Slightly Biased     0.6494    0.6628    0.6560    208207

       accuracy                         0.7958    734039
      macro avg     0.7679    0.7802    0.7735    734039
   weighted avg     0.7998    0.7958    0.7973    734039


=== Class-wise performance: Word + Char ===
                 precision    recall  f1-score   support

  Highly Biased     0.7894    0.8354    0.8117    148777
        Neutral     0.9202    0.8666    0.8926    377055
Slightly Biased     0.6757    0.7189    0.6966    208207

       accuracy                         0.8184    734039
      macro avg     0.7951    0.8070    0.8003    734039
   weighted avg     0.8244    0.8184    0.8206    734039

