# Model Training Report

Loads the training dataset and the saved evaluation reports, then visualizes the metrics of the trained models.


In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Relative paths; they resolve against the kernel's current working directory.
REPORTS_DIR = Path("backend/app/ml/reports")
DATASET_PATH = Path("backend/app/ml/data/processed/training_dataset.csv")

# Training table; later cells read its `emb_*`, nutrition and label columns.
df = pd.read_csv(DATASET_PATH)

# Parse the model comparison summary stored in the reports directory.
comparison_path = REPORTS_DIR / "model_comparison.json"
model_results = json.loads(comparison_path.read_text())

# Echo the parsed results as the cell output.
model_results


## Bar Chart - Model Accuracy Comparison


In [None]:
# Tabulate the comparison results; the plot reads the "name" and "accuracy"
# columns of this frame.
metrics_df = pd.DataFrame(model_results)

plt.figure(figsize=(10, 5))
# seaborn >= 0.13 deprecates `palette` without `hue` (removal is announced for
# 0.14), so colour the bars by the same column that defines them.
# `dodge=False` keeps one full-width bar per model on older seaborn releases.
ax = sns.barplot(
    data=metrics_df,
    x="name",
    y="accuracy",
    hue="name",
    palette="Blues_r",
    dodge=False,
)
# A hue legend would only repeat the x tick labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.xlabel("Model")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## Confusion Matrix (per model)


In [None]:
# Models whose per-model report files are plotted, in display order.
model_names = [
    "majority_baseline",
    "nutrition_logreg",
    "tfidf_logreg",
    "oracle_knn_embeddings",
    "sbert_fusion_mlp",
]

# Class ids and their tick labels, shared by every matrix.
class_ids = [0, 1, 2]
class_names = ["Dislike (0)", "Neutral (1)", "Like (2)"]

for model_name in model_names:
    report_path = REPORTS_DIR / f"{model_name}_report.json"

    if report_path.exists():
        rep = json.loads(report_path.read_text())

        # Rows are the report's "true" labels, columns its "pred" labels.
        cm = confusion_matrix(rep["true"], rep["pred"], labels=class_ids)

        disp = ConfusionMatrixDisplay(cm, display_labels=class_names)
        disp.plot(cmap="Blues")
        plt.title(f"Confusion Matrix - {model_name}")
        plt.show()
    else:
        # A model without a saved report is reported and skipped.
        print(f"Skipping missing: {model_name}")


## UMAP Visualization of Embeddings


In [None]:
import umap.umap_ as umap

# Embedding features are the columns prefixed "emb_"; points are coloured by
# the `label_3class` column.
emb_cols = [col for col in df.columns if col.startswith("emb_")]
X_emb = df[emb_cols].values
labels = df["label_3class"].values

# Project the embeddings with UMAP, seeded with random_state=42.
reducer = umap.UMAP(n_neighbors=20, min_dist=0.1, random_state=42)
X_umap = reducer.fit_transform(X_emb)

# One colour per class id.
class_colors = {0: "red", 1: "gray", 2: "green"}

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_umap[:, 0],
    y=X_umap[:, 1],
    hue=labels,
    palette=class_colors,
    alpha=0.7,
    ax=ax,
)
ax.set_title("UMAP Visualization of SBERT Ingredient Embeddings")
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
ax.legend(title="Label")
plt.show()


## Feature Importance


In [None]:
import joblib
from sklearn.inspection import permutation_importance

# Load the nutrition logistic regression model.
# NOTE: joblib.load unpickles the file, so only load model artifacts you trust.
nutri_model_path = Path("backend/app/ml/models/nutrition_logreg.joblib")
nutri = joblib.load(nutri_model_path)

# Select only nutrition + diet + allergen features
numeric_cols = [
    "calories",
    "fat",
    "sugar",
    "protein",
    "sodium",
    "allergen_count",
    "is_vegan",
    "is_vegetarian",
    "is_mindful",
]

X_numeric = df[numeric_cols]
y = df["label_3class"]

# Permute each feature 8 times (seeded) and measure the effect on the model's
# score against `y`.
result = permutation_importance(
    nutri, X_numeric, y, n_repeats=8, random_state=42
)

# Mean importance per feature, most important first.
importance_df = pd.DataFrame(
    {"feature": numeric_cols, "importance": result.importances_mean}
).sort_values(by="importance", ascending=False)

plt.figure(figsize=(8, 6))
# seaborn >= 0.13 deprecates `palette` without `hue` (removal is announced for
# 0.14), so colour the bars by the same column that defines them.
# `dodge=False` keeps one full-width bar per feature on older seaborn releases.
ax = sns.barplot(
    data=importance_df,
    x="importance",
    y="feature",
    hue="feature",
    palette="viridis",
    dodge=False,
)
# A hue legend would only repeat the y tick labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title("Permutation Feature Importance - Nutrition Logistic Regression")
plt.show()


## Error Analysis Table


In [None]:
# Load best model predictions (fusion MLP is best)
with open(REPORTS_DIR / "sbert_fusion_mlp_report.json", "r") as f:
    rep = json.load(f)

fusion_true = np.array(rep["true"])
fusion_pred = np.array(rep["pred"])

# The labels below are attached to `df` by position, so both arrays must have
# exactly one entry per dataset row. Check this up front and fail with an
# explicit message instead of pandas' generic length-mismatch error.
# NOTE(review): equal lengths do not prove the report rows are in the same
# order as `df` - confirm against the script that writes the report.
if not (len(fusion_true) == len(fusion_pred) == len(df)):
    raise ValueError(
        "sbert_fusion_mlp report does not align with the dataset: "
        f"{len(fusion_true)} true / {len(fusion_pred)} pred labels "
        f"for {len(df)} rows"
    )

errors = df.copy()
errors["true"] = fusion_true
errors["pred"] = fusion_pred
errors["correct"] = errors["true"] == errors["pred"]

# Keep only the misclassified rows (boolean negation rather than `== False`).
mis = errors[~errors["correct"]]

# First 25 misclassified rows, restricted to the columns worth eyeballing.
mis_small = mis[
    [
        "meal_id",
        "menu_item_id",
        "label_3class",
        "stars_5",
        "calories",
        "fat",
        "sugar",
        "protein",
        "allergen_count",
        "true",
        "pred",
    ]
].head(25)

mis_small
