## Accuracy of our predictions against the true labels

In [5]:
import pandas as pd
from sklearn.metrics import accuracy_score

# === File paths ===
# Maps a human-readable model name to the CSV file holding that model's
# predictions. Every file is expected to expose a `pred_column` column.
pred_files = {
    "Logistic Regression": "predictionsLR.csv",
    "Random Forest": "predictionsRF.csv",
    # NOTE(review): the display name says "RF" while the file name says "LR" --
    # confirm which model actually produced this file before citing the result.
    "RF + Transformers": "predictionsLR_Transformers_with_original_data.csv"
}

label_file = "test_labels.csv"
label_column = "label"
pred_column = "predictions"

# === Load true labels ===
# A missing label file or label column is fatal on purpose: without ground
# truth no model can be scored, so the error is allowed to propagate.
true_df = pd.read_csv(label_file)
# Labels and predictions are both cast to str so they are compared as text,
# independent of the dtype pandas inferred for each file.
# NOTE(review): values such as 1 and 1.0 become "1" and "1.0" and would count
# as mismatches -- confirm all files store labels in the same textual form.
y_true = true_df[label_column].astype(str)

print("=== Accuracy Comparison ===\n")

for model_name, file_path in pred_files.items():
    # A missing or empty prediction file should only skip this model, in the
    # same way the column and length checks below do, instead of aborting the
    # comparison for every model that follows.
    try:
        pred_df = pd.read_csv(file_path)
    except (FileNotFoundError, pd.errors.EmptyDataError) as exc:
        print(f"Could not read predictions for {model_name} ({file_path}): {exc}")
        continue

    # Ensure the prediction column exists
    if pred_column not in pred_df.columns:
        print(f"Column '{pred_column}' not found in {file_path}")
        print(f"Available columns: {pred_df.columns.tolist()}")
        continue

    y_pred = pred_df[pred_column].astype(str)

    # Sanity check: length match. Rows are compared positionally, so a
    # different row count means the two files cannot be lined up.
    if len(y_pred) != len(y_true):
        print(f"Mismatch in number of rows for {model_name}: predictions = {len(y_pred)}, labels = {len(y_true)}")
        continue

    accuracy = accuracy_score(y_true, y_pred)
    print(f"{model_name} ({file_path}): Accuracy = {accuracy:.4f}")


=== Accuracy Comparison ===

Logistic Regression (predictionsLR.csv): Accuracy = 0.9864
Random Forest (predictionsRF.csv): Accuracy = 0.9833
RF + Transformers (predictionsLR_Transformers_with_original_data.csv): Accuracy = 0.8773
