In [2]:
# 1. Imports
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split

# 2. Configuration and project directories
# These two values select which saved model / feature-matrix variant is reviewed;
# they are interpolated into the artifact filenames below.
vectorizer_type = "tfidf"
feature_count = 200000

data_path = Path("../data_preparation")
model_path = Path("../models/tuning_svm")
report_path = Path("../reports/label_review")
# Create the report folder (and any missing parents) up front so the export
# at the end of the notebook cannot fail on a missing directory.
report_path.mkdir(parents=True, exist_ok=True)

# 3. Load the label encoder and the tuned classifier
label_encoder = joblib.load(data_path / "label_encoder.joblib")
model_file = model_path / f"LinearSVC_best_{vectorizer_type}_{feature_count}.joblib"
model = joblib.load(model_file)

# 4. Load the cleaned data and recreate the train/test split
clean_df = pd.read_csv(data_path / "cleaned_data.csv")
X_raw = clean_df["clean_text"]
y_raw = label_encoder.transform(clean_df["Category"])

# NOTE(review): presumably these split parameters mirror the ones used when the
# model and the saved test matrix were produced — confirm against that notebook.
split_parts = train_test_split(
    X_raw,
    y_raw,
    test_size=0.2,
    random_state=42,
    stratify=y_raw,
)
X_train_raw, X_test_raw, y_train, y_test = split_parts

In [3]:
# 5. Load the pre-vectorized test matrix and score it with the tuned model
test_matrix = joblib.load(data_path / f"X_test_{vectorizer_type}_{feature_count}.joblib")

# The saved matrix is only usable if it has one row per sample of the split
# recreated above. Fail here with a clear message instead of later with an
# opaque length error when the prediction columns are attached.
# NOTE(review): only the row count can be verified here; identical row order is
# assumed — confirm the matrix was saved from the same split.
assert test_matrix.shape[0] == len(y_test), (
    f"Saved test matrix has {test_matrix.shape[0]} rows but the recreated "
    f"test split has {len(y_test)}; the artifacts do not match this split."
)

y_pred = model.predict(test_matrix)
decision_scores = model.decision_function(test_matrix)
# Largest decision score across classes for each row. This is an unnormalized
# score, not a probability; it is only used to rank the mismatches below.
confidences = np.max(decision_scores, axis=1)

# 6. Rebuild aligned dataframe
# X_test_raw.index holds index *labels* of clean_df, so select with .loc
# (label-based). .iloc would treat them as positions and silently pick the
# wrong rows if clean_df ever stops having a default 0..n-1 index.
aligned_test_df = clean_df.loc[X_test_raw.index].copy()
aligned_test_df["True"] = label_encoder.inverse_transform(y_test)
aligned_test_df["Predicted"] = label_encoder.inverse_transform(y_pred)
aligned_test_df["Confidence"] = confidences

# 7. Flag likely label errors
# A row is suspicious when the model disagrees with its recorded category;
# the disagreements with the highest decision score are listed first.
aligned_test_df["LabelMismatch"] = aligned_test_df["True"] != aligned_test_df["Predicted"]
label_errors_df = aligned_test_df[aligned_test_df["LabelMismatch"]].sort_values(
    by="Confidence", ascending=False
)

# 8. Save report and preview
# The CSV is written without the dataframe index (index=False); the preview
# below still shows it.
label_errors_df.to_csv(report_path / "suspected_label_errors.csv", index=False)
label_errors_df[["Question", "Answer", "True", "Predicted", "Confidence"]].head(10)


Unnamed: 0,Question,Answer,True,Predicted,Confidence
28107,Methane is a compound of hydrogen with which o...,Carbon,Entertainment,Science and Nature,2.235407
36518,Which well known cartoon character was created...,POPEYE,History,Entertainment,2.136312
26117,Which English cricket commentator on BBC's Tes...,Isa Guha,Entertainment,Sport,2.10033
9529,"In 1924, prior to his career as a paediatricia...",ROWING,Science and Nature,Sport,2.019249
35404,Oceanic Airlines are a fictitious airline who ...,LOST,Lifestyle,Entertainment,1.894728
17808,"With 214 million followers, just behind Nation...",Nike,Sport,Lifestyle,1.861839
36041,In 1976 which football team won their first ma...,SOUTHAMPTON,History,Sport,1.802026
20227,"The German romantic author E.T.A. Hoffmann, wh...",The Nutcracker,Music,Art and Literature,1.797095
23599,Which actor played West Ham football hooligan ...,Gary Oldman,Sport,Entertainment,1.766411
27993,The Spanish word for aunt appears in the name ...,Tia Maria,Sport,Lifestyle,1.723021
