In [2]:
# 1. Imports & config


# Compare base Qwen1.5-0.5B-Chat vs LoRA fine-tuned model on held-out recipes.

import os
import ast
import random
from typing import List, Dict, Any

import torch
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 2. Paths & basic settings
# Paths
CLEAN_DATA_PATH = "../data/cleaned/platelogic_annotation_ready.csv"          # full cleaned dataset (~9192)
ANNOTATED_300_PATH = "../data/annotated/platelogic_annotated_300.csv"         # the unified 300 annotated recipes
LORA_ADAPTER_PATH = "../models/platelogic_qwen_lora/checkpoint-67"  # your LoRA checkpoint
EVAL_OUTPUT_PATH = "../data/platelogic_eval_results.csv"            # where to save evaluation results

MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"

# Evaluation size
N_EVAL = 50   # number of held-out recipes to test on

# Generation settings
MAX_NEW_TOKENS = 256
TEMPERATURE = 0.2
TOP_P = 0.9

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

DEVICE, DTYPE


('cpu', torch.float32)

In [7]:
# 3. Load datasets 

def load_clean_dataset(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # standardize id column name if needed
    if "id" not in df.columns:
        if "recipe_id" in df.columns:
            df = df.rename(columns={"recipe_id": "id"})
        else:
            raise ValueError("No 'id' or 'recipe_id' column found in cleaned dataset.")
    return df


def load_annotated_300(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    if "id" not in df.columns:
        if "recipe_id" in df.columns:
            df = df.rename(columns={"recipe_id": "id"})
        else:
            raise ValueError("No 'id' or 'recipe_id' column found in annotated_300.")
    return df


clean_df = load_clean_dataset(CLEAN_DATA_PATH)
annot_df = load_annotated_300(ANNOTATED_300_PATH)

len(clean_df), len(annot_df)


(9192, 300)

In [8]:
# 3. create held-out set
# IDs that were used in the 300 annotated recipes (training set)
train_ids = set(annot_df["id"].tolist())

# Held-out candidates are recipes that are *not* in the 300
heldout_candidates = clean_df[~clean_df["id"].isin(train_ids)].copy()

print(f"Total cleaned recipes: {len(clean_df)}")
print(f"Annotated (train) recipes: {len(annot_df)}")
print(f"Held-out candidates: {len(heldout_candidates)}")

# Sample N_EVAL from held-out candidates
if len(heldout_candidates) < N_EVAL:
    raise ValueError(f"Not enough held-out candidates. Have {len(heldout_candidates)}, need {N_EVAL}.")

heldout_df = heldout_candidates.sample(N_EVAL, random_state=42).reset_index(drop=True)
heldout_df.head()


Total cleaned recipes: 9192
Annotated (train) recipes: 300
Held-out candidates: 8892


Unnamed: 0,id,name,ingredients,steps,calories,protein,carbs
0,328830,mozzarella chicken bake,"['chicken breasts', 'flour', 'butter', 'salt a...",['place chicken pieces in between wax paper an...,480.8,38.5,9
1,354640,french gorton spread,"['ground pork', 'onion', 'cinnamon', 'clove', ...",['put all ingredients into an uncovered deep s...,518.8,44.0,3
2,54310,mexican dinner,"['lean ground beef', 'rotel tomatoes', 'picant...","['brown ground beef', 'drain off any fat', 'ad...",405.3,41.0,3
3,211021,chicken in a red sweet pepper sauce,"['chicken meat', 'onions', 'gingerroot', 'garl...","['combine onions , ginger , garlic , almonds ,...",641.4,57.5,18
4,22156,chicken and sauerkraut,"['boneless skinless chicken breasts', 'sauerkr...","['preheat oven to 350 degrees', 'spray baking ...",279.9,32.5,6


In [9]:
# 4. Utility: ingredient parsing & main protein detection
PROTEIN_KEYWORDS = [
    "chicken", "breast", "thigh", "turkey", "beef", "pork",
    "lamb", "salmon", "tuna", "shrimp", "prawn", "egg",
    "eggs", "tofu", "tempeh", "lentil", "lentils",
    "chickpea", "chickpeas", "yogurt", "greek yogurt",
    "cottage cheese", "paneer", "ham", "sausage"
]

HIGH_CARB_FLAG_WORDS = [
    "sugar", "brown sugar", "honey", "maple syrup", "corn syrup",
    "flour", "all-purpose flour", "bread flour", "self-rising flour",
    "bread", "bun", "buns", "pasta", "spaghetti", "noodles",
    "rice", "risotto", "potato", "potatoes", "tortilla",
    "tortillas", "pita", "bagel", "bagels"
]

def safe_parse_list(val: Any) -> List[str]:
    if isinstance(val, list):
        return val
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
            if isinstance(parsed, list):
                return parsed
        except Exception:
            # fallback: split by comma
            return [x.strip() for x in val.split(",") if x.strip()]
    return []


def detect_main_protein(ingredients: List[str]) -> str:
    text = " ".join(ingredients).lower()
    for kw in PROTEIN_KEYWORDS:
        if kw in text:
            return kw
    return ""


In [10]:
# 5 — Build the instruction text & full prompt

CONSTRAINTS_TEXT = (
    "- Keep or increase lean protein.\n"
    "- Avoid added sugars.\n"
    "- Avoid starchy carbs (bread, pasta, rice, potatoes, tortillas, buns).\n"
    "- Prefer high-protein swaps (Greek yogurt, cottage cheese, legumes in moderation, etc.)."
)


def build_instruction_text(row: pd.Series) -> str:
    """
    Build the 'input' string used during training:
    title + ingredients + goal + constraints, in a single text block.
    """
    title = row.get("name", row.get("title", "Untitled recipe"))
    ingredients_list = safe_parse_list(row.get("ingredients", "[]"))
    ingredients_str = "\n".join(f"- {ing}" for ing in ingredients_list)

    goal = "Rewrite this recipe to be high-protein and low-carb while keeping the core flavor."

    instruction = f"""Title: {title}

Ingredients:
{ingredients_str}

Goal:
{goal}

Constraints:
{CONSTRAINTS_TEXT}
"""
    return instruction


def build_full_prompt(instruction_text: str) -> str:
    """
    Match the exact format used in 03_finetune_lora.ipynb:

    You are a cooking assistant that rewrites recipes to be high-protein and low-carb.

    Instruction:
    {input}

    Response:
    """
    prompt = (
        "You are a cooking assistant that rewrites recipes to be high-protein and low-carb.\n\n"
        "Instruction:\n"
        f"{instruction_text}\n\n"
        "Response:\n"
    )
    return prompt


In [11]:
# 6. Load tokenizer, base model, and LoRA model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="auto" if DEVICE == "cuda" else None
).to(DEVICE)
base_model.eval()

lora_base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="auto" if DEVICE == "cuda" else None
).to(DEVICE)
lora_model = PeftModel.from_pretrained(
    lora_base_model,
    LORA_ADAPTER_PATH
).to(DEVICE)
lora_model.eval()

DEVICE


'cpu'

In [None]:
# 7 — Generation helper 

def generate_recipe(
    model,
    tokenizer,
    prompt: str,
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = TEMPERATURE,
    top_p: float = TOP_P,
) -> str:
    """
    Generate continuation after the 'Response:' prefix.
    Uses the same plain-text prompt format as training.
    """
    inputs = tokenizer(
        prompt,
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Slice off the prompt tokens; keep only model's continuation
    output_ids = generated[0][inputs["input_ids"].shape[1]:]
    output_text = tokenizer.decode(output_ids, skip_special_tokens=True).strip()

    return output_text


In [15]:
# 8. Automatic checks

def check_keeps_main_protein(output_text: str, main_protein: str) -> bool:
    if not main_protein:
        # no detected main protein → we can't check this reliably
        return True
    return main_protein.lower() in output_text.lower()


def check_high_carb_violation(output_text: str) -> bool:
    text = output_text.lower()
    for kw in HIGH_CARB_FLAG_WORDS:
        if kw in text:
            return True
    return False


def check_hp_lc_cues(output_text: str) -> bool:
    text = output_text.lower()
    cues = ["high-protein", "high protein", "low-carb", "low carb", "keto"]
    return any(cue in text for cue in cues)


In [17]:
# 9. Run evaluation loop

records = []

for idx, row in tqdm(heldout_df.iterrows(), total=len(heldout_df)):
    recipe_id = row["id"]
    title = row.get("name", row.get("title", "Untitled recipe"))
    ingredients = safe_parse_list(row.get("ingredients", "[]"))

    main_protein = detect_main_protein(ingredients)

    # Build instruction and prompt in the same format used for fine-tuning
    instruction_text = build_instruction_text(row)
    full_prompt = build_full_prompt(instruction_text)

    # Base model output
    base_output = generate_recipe(
        base_model,
        tokenizer,
        full_prompt
    )

    # LoRA model output
    lora_output = generate_recipe(
        lora_model,
        tokenizer,
        full_prompt
    )

    # Checks for base
    base_keeps_protein = check_keeps_main_protein(base_output, main_protein)
    base_high_carb_violation = check_high_carb_violation(base_output)
    base_hp_lc_cues = check_hp_lc_cues(base_output)
    base_pass_all = base_keeps_protein and not base_high_carb_violation

    # Checks for LoRA
    lora_keeps_protein = check_keeps_main_protein(lora_output, main_protein)
    lora_high_carb_violation = check_high_carb_violation(lora_output)
    lora_hp_lc_cues = check_hp_lc_cues(lora_output)
    lora_pass_all = lora_keeps_protein and not lora_high_carb_violation

    records.append({
        "id": recipe_id,
        "title": title,
        "main_protein": main_protein,
        "ingredients": "; ".join(ingredients),

        "base_output": base_output,
        "base_keeps_main_protein": base_keeps_protein,
        "base_high_carb_violation": base_high_carb_violation,
        "base_mentions_hp_lc": base_hp_lc_cues,
        "base_pass_all_constraints": base_pass_all,

        "lora_output": lora_output,
        "lora_keeps_main_protein": lora_keeps_protein,
        "lora_high_carb_violation": lora_high_carb_violation,
        "lora_mentions_hp_lc": lora_hp_lc_cues,
        "lora_pass_all_constraints": lora_pass_all,
    })

eval_df = pd.DataFrame.from_records(records)
eval_df.head()

100%|██████████| 50/50 [1:19:32<00:00, 95.45s/it] 


Unnamed: 0,id,title,main_protein,ingredients,base_output,base_keeps_main_protein,base_high_carb_violation,base_mentions_hp_lc,base_pass_all_constraints,lora_output,lora_keeps_main_protein,lora_high_carb_violation,lora_mentions_hp_lc,lora_pass_all_constraints
0,328830,mozzarella chicken bake,chicken,chicken breasts; flour; butter; salt and peppe...,Title: Mzzarella Chicken Bake\n\nIngredients:\...,True,True,True,False,Title: mozzarella chicken bake\n\nIngredients:...,True,True,True,False
1,354640,french gorton spread,pork,ground pork; onion; cinnamon; clove; salt and ...,Title: French Gorton Spread\n\nIngredients:\n\...,False,False,False,False,Title: French Grown Torn Spread\n\nIngredients...,True,True,True,False
2,54310,mexican dinner,beef,lean ground beef; rotel tomatoes; picante sauc...,Title: Mexican dinner\n\nIngredients:\n\n-瘦 gr...,True,True,True,False,Title: Mexican chicken stir-fry with fresh cil...,True,False,False,True
3,211021,chicken in a red sweet pepper sauce,chicken,chicken meat; onions; gingerroot; garlic clove...,Title: Chicken in a Red Sweet Pepper Sauce\n\n...,True,False,False,True,Title: Chicken in a Red Sweet Pepper Sauce\n\n...,True,False,False,True
4,22156,chicken and sauerkraut,chicken,boneless skinless chicken breasts; sauerkraut;...,Title: Chicken and Sausk\n\nIngredients:\n\n- ...,True,False,False,True,Title: Chicken and Sauerkraut\n\nIngredients:\...,True,True,True,False


In [18]:
# Save per-recipe evaluation to CSV
os.makedirs(os.path.dirname(EVAL_OUTPUT_PATH), exist_ok=True)
eval_df.to_csv(EVAL_OUTPUT_PATH, index=False)

print(f"Saved per-recipe evaluation to: {EVAL_OUTPUT_PATH}")

Saved per-recipe evaluation to: ../data/platelogic_eval_results.csv


In [19]:
# 10. Aggregate metrics

def summarize_model(prefix: str, df: pd.DataFrame) -> Dict[str, float]:
    keeps = df[f"{prefix}_keeps_main_protein"].mean()
    high_carb_rate = df[f"{prefix}_high_carb_violation"].mean()
    pass_all = df[f"{prefix}_pass_all_constraints"].mean()
    mentions_hp_lc = df[f"{prefix}_mentions_hp_lc"].mean()

    return {
        "pass_rate": pass_all,
        "protein_retention_rate": keeps,
        "high_carb_violation_rate": high_carb_rate,
        "hp_lc_mention_rate": mentions_hp_lc,
    }

base_metrics = summarize_model("base", eval_df)
lora_metrics = summarize_model("lora", eval_df)

print("Base model metrics:")
for k, v in base_metrics.items():
    print(f"  {k}: {v:.3f}")

print("\nLoRA model metrics:")
for k, v in lora_metrics.items():
    print(f"  {k}: {v:.3f}")


Base model metrics:
  pass_rate: 0.460
  protein_retention_rate: 0.820
  high_carb_violation_rate: 0.380
  hp_lc_mention_rate: 0.320

LoRA model metrics:
  pass_rate: 0.100
  protein_retention_rate: 0.920
  high_carb_violation_rate: 0.900
  hp_lc_mention_rate: 0.840


In [21]:
# Save summary to JSON
import json

summary = {
    "n_eval": len(eval_df),
    "base": base_metrics,
    "lora": lora_metrics,
}

with open("../data/platelogic_eval_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

summary

{'n_eval': 50,
 'base': {'pass_rate': np.float64(0.46),
  'protein_retention_rate': np.float64(0.82),
  'high_carb_violation_rate': np.float64(0.38),
  'hp_lc_mention_rate': np.float64(0.32)},
 'lora': {'pass_rate': np.float64(0.1),
  'protein_retention_rate': np.float64(0.92),
  'high_carb_violation_rate': np.float64(0.9),
  'hp_lc_mention_rate': np.float64(0.84)}}