# Installations and setup

In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2
!pip install --no-deps evaluate rouge_score

In [2]:
from evaluate import load # Import from evaluate instead of datasets
import numpy as np
from tqdm import tqdm
import json
from unsloth import FastLanguageModel
import torch

random_state = 3407

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = random_state,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.10.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Dataset Prep

In [6]:
prompt = """
### Human:
{}

### Assistant
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    outputs      = examples["output"]
    texts = []
    for instruction, output in zip(instructions, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(instruction, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

from datasets import load_dataset
dataset = load_dataset("azimidokht/recipe-recom", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

split_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=random_state)

train_dataset = split_dataset["train"]
temp_dataset  = split_dataset["test"]

val_test_split = temp_dataset.train_test_split(test_size=0.5, shuffle=True, seed=random_state)

val_dataset  = val_test_split["train"]
test_dataset = val_test_split["test"]


recipe_recom.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5149 [00:00<?, ? examples/s]

Map:   0%|          | 0/5149 [00:00<?, ? examples/s]

# Utilities and functions

In [None]:
prompt = """
### Human:
{}

### Assistant
{}"""
rouge = load("rouge")
bleu = load("bleu")

inferenced_data = {}

def generate_predictions(dataset, model, tokenizer, max_new_tokens=128):
    """Generate model outputs for a HF datasets split of dicts with keys: instruction, output."""
    predictions, references, instructions = [], [], []
    FastLanguageModel.for_inference(model)
    for example in tqdm(dataset):
        instruction = example["instruction"]
        reference   = example["output"]

        inputs = tokenizer(
            [prompt.format(instruction, "")],
            return_tensors="pt"
        ).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

        # It Extracts just the assistant section
        # Falls back to full text if the delimiter isn't present
        #If the model output doesnâ€™t include the separator (### Assistant), use the full text as the prediction instead of trying to split it.
        if "### Assistant" in generated_text:
            assistant_response = generated_text.split("### Assistant", 1)[-1].strip()
        else:
            assistant_response = generated_text.strip()

        predictions.append(assistant_response)
        references.append(reference)
        instructions.append(instruction)
    return predictions, references, instructions
    

def evaluate(predictions, references, metrics: list[str]):
    """
    Evaluate the predictions against the references using the specified metrics.
    """
    results = {}
    if "rouge" in metrics:
      results["rouge"] = rouge.compute(predictions=predictions, references=references)
    if "bleu" in metrics:
      results["bleu"] = bleu.compute(predictions=predictions, references=[[ref] for ref in references]) # BLEU expects a list of references
    else:
      raise ValueError("Invalid metric")

    return results

def store(data, path):
  with open(path, "w") as f:
    json.dump(data, f)



Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

# Base model evaluation

In [None]:

print("\nGenerating predictions for the validation dataset...")
val_predictions_base, val_references_base, val_inst_base = generate_predictions(val_dataset, model, tokenizer)
print("Generating predictions for the test dataset...")
test_predictions, test_references, test_inst = generate_predictions(test_dataset, model, tokenizer)



Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

KeyboardInterrupt: 

In [None]:
inferenced_data["test"] = [val_predictions_base, val_references_base, val_inst_base]
inferenced_data["val"] = [test_predictions_base, test_references_base, test_inst_base]

store(inferenced_data, "./base-inference.json")

val_results = evaluate(val_predictions_base, val_references_base, metrics=["rouge", "bleu"])
print("Validation Rouge results:", val_results["rouge"])
print("Validation BLEU results:", val_results["bleu"])

test_results = evaluate(test_predictions_base, test_references_base, metrics=["rouge", "bleu"])
print("Test Rouge results:", test_results["rouge"])
print("Test BLEU results:", test_results["bleu"])

## Hallucination

In [2]:
import json
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple, Optional

import numpy as np
import pandas as pd

# CONFIG - change file names / keys if needed
JSON_PATH = "../results/base-inference-r8.json"   # <- path to your jsondump
CSV_PATH = "../data/pp_recipes.csv"          # <- hummus recipes CSV
OUTPUT_SUMMARY_CSV = "constraint_evaluation_summary_base.csv"

# Columns mapping from your description (adjust if different)
COLS = {
    "title": "title",
    "calories": "calories [cal]",
    "protein": "protein [g]",
    "sodium": "sodium [mg]",
    "duration": "duration",           # may be string/object - ensure numeric if possible
    "serves": "serves",
    "total_fat": "totalFat [g]",
    "carbs": "totalCarbohydrate [g]",
    "fiber": "dietaryFiber [g]",
    "ingredients": "ingredients"
}

# -------------------------
# Helper functions
# -------------------------
def load_inputs(json_path: str) -> Dict[str, Any]:
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def load_df(csv_path: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path, index_col=0, low_memory=True)
    # Normalise title for matching
    df[COLS["title"]] = df[COLS["title"]].astype(str).str.strip()
    # Try converting numeric columns
    for c in ["calories", "protein", "sodium", "duration", "total_fat", "carbs", "fiber"]:
        colname = COLS.get(c)
        if colname in df.columns:
            # remove non-numeric and coerce
            df[colname] = pd.to_numeric(df[colname].astype(str).str.replace(r"[^\d\.\-]", "", regex=True), errors="coerce")
    # Normalize ingredients column to lowercase string for containment checks
    if COLS["ingredients"] in df.columns:
        df[COLS["ingredients"]] = df[COLS["ingredients"]].astype(str).str.lower()
    return df


def extract_title_and_claims(model_output: str) -> Tuple[Optional[str], Dict[str, float], List[str]]:
    """
    Extract a claimed title and numeric claims from model output.
    Returns (title, numeric_claims, list_of_mentioned_ingredients)
    Numeric claims keys: calories, protein, sodium, duration, serves, fiber, total_fat, carbs
    This function uses heuristic regex matching on outputs like:
      "Hummus Delight - 350 calories, 15g protein, ready in 20 minutes."
      "Title - Takes 12 minutes, 300.0 calories"
    """
    claims = {}
    ing_list = []

    if not isinstance(model_output, str):
        return None, claims, ing_list

    # Attempt to parse "Title - ..." or "Title: ..." or "Title â€” ..."
    m = re.match(r'^\s*([^\-\â€“\â€”\:]+?)\s*(?:[-\â€“\â€”\:])\s*(.*)$', model_output.strip())
    if m:
        title = m.group(1).strip()
        rest = m.group(2)
    else:
        # fallback: first token phrase before comma
        parts = model_output.split(",")
        title = parts[0].strip()
        rest = ", ".join(parts[1:]) if len(parts) > 1 else ""

    # numeric captures
    # calories: "350 calories" or "350.0 calories"
    mcal = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*(?:calories|cal|kcal)\b', model_output, flags=re.I)
    if mcal:
        claims["calories"] = float(mcal.group(1))
    # protein: "15g protein"
    mprot = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*g\s*(?:protein)\b', model_output, flags=re.I)
    if mprot:
        claims["protein"] = float(mprot.group(1))
    # sodium: "180mg sodium"
    msod = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*mg\s*(?:sodium)\b', model_output, flags=re.I)
    if msod:
        claims["sodium"] = float(msod.group(1))
    # duration: "ready in 20 minutes", "Takes 12 minutes"
    mtime = re.search(r'(\d+)\s*(?:minutes|min|mins)\b', model_output, flags=re.I)
    if mtime:
        claims["duration"] = float(mtime.group(1))
    # serves: "serves 4"
    mserves = re.search(r'serves\s*(\d+)', model_output, flags=re.I)
    if mserves:
        claims["serves"] = int(mserves.group(1))
    # fiber: "5g fiber"
    mfib = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*g\s*(?:dietary fiber|fiber)\b', model_output, flags=re.I)
    if mfib:
        claims["fiber"] = float(mfib.group(1))
    # total_fat: "9g fat"
    mfat = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*g\s*(?:fat)\b', model_output, flags=re.I)
    if mfat:
        claims["total_fat"] = float(mfat.group(1))
    # carbs: "30g carbs"
    mcarb = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*g\s*(?:carb|carbs|carbohydrate|carbohydrates)\b', model_output, flags=re.I)
    if mcarb:
        claims["carbs"] = float(mcarb.group(1))

    # Extract simple ingredients mentioned like "Features chickpeas and tahini"
    # Look for "Features X and Y" or "using X and Y" patterns
    ming = re.search(r'(?:features|using|uses|with)\s+([a-zA-Z0-9\s\-\']+?)\s*(?:,|\band\b|\.)', model_output, flags=re.I)
    if ming:
        # split on 'and' or commas
        text = ming.group(1)
        candidates = re.split(r'\band\b|,', text, flags=re.I)
        ing_list = [c.strip().lower() for c in candidates if c.strip()]

    # fallback: look for "using X and Y." explicit pattern
    m2 = re.search(r'using\s+([^\.,]+?)\.', model_output, flags=re.I)
    if m2:
        parts = re.split(r'\band\b|,', m2.group(1))
        ing_list = [p.strip().lower() for p in parts if p.strip()]

    return title if title else None, claims, ing_list


def parse_instruction_to_constraints(instruction: str) -> Dict[str, Any]:
    """
    Given an instruction string (as generated in variants), return a constraints dict.
    Supports the variant instruction formats you provided.
    """
    instr = instruction.lower()
    constraints = {}

    # calories: "under 400 calories" or "around 400 calories" or "under 400"
    m = re.search(r'under\s+(\d+)', instr)
    if m:
        constraints["calories_max"] = float(m.group(1))
    m2 = re.search(r'around\s+(\d+)', instr)
    if m2:
        val = float(m2.group(1))
        # allow +/- 50 calories tolerance for "around"
        constraints["calories_min"] = max(0.0, val - 50)
        constraints["calories_max"] = val + 50

    # time-based: "less than X minutes"
    mtime = re.search(r'less than\s+(\d+)\s*minutes', instr)
    if mtime:
        constraints["duration_max"] = float(mtime.group(1))

    # protein: "at least Xg protein" or "high-protein at least X"
    mprot = re.search(r'at least\s+(\d+)\s*g\s*protein', instr)
    if mprot:
        constraints["protein_min"] = float(mprot.group(1))
    else:
        # "high-protein recipe with at least 12g protein." or "high-protein with at least 10"
        mprot2 = re.search(r'(\d+)\s*g\s*protein', instr)
        if mprot2 and 'high-protein' in instr:
            constraints["protein_min"] = float(mprot2.group(1))

    # sodium: "under Xmg sodium"
    msod = re.search(r'under\s+(\d+)\s*mg\s*sodium', instr)
    if msod:
        constraints["sodium_max"] = float(msod.group(1))

    # ingredient-based: "using X and Y"
    ming = re.search(r'suggest a recipe using\s+([a-z0-9\s\-\']+?)\s+and\s+([a-z0-9\s\-\']+)', instr)
    if ming:
        constraints["ingredients_include"] = [ming.group(1).strip(), ming.group(2).strip()]

    # fiber: "at least Xg fiber" or "high-fiber at least"
    mf = re.search(r'at least\s+(\d+)\s*g\s*fiber', instr)
    if mf:
        constraints["fiber_min"] = float(mf.group(1))
    mff = re.search(r'high-fiber.*?(\d+)', instr)
    if mff and "fiber" in instr:
        constraints["fiber_min"] = float(mff.group(1))

    # balanced meal: this is generic -> require calories, protein, total_fat present and between some moderate bounds
    if "balanced meal" in instr:
        constraints["balanced"] = True

    # time variant in your code: instruction uses "less than {duration + 10} minutes"
    mquick = re.search(r'less than\s+(\d+)\s*minutes', instr)
    if mquick:
        constraints["duration_max"] = float(mquick.group(1))

    return constraints


def recipe_satisfies_constraints(recipe_row: pd.Series, constraints: Dict[str, Any]) -> bool:
    """
    Returns True if the pandas series (one recipe) satisfies all constraints.
    """
    try:
        # calories
        if "calories_max" in constraints:
            if pd.isna(recipe_row[COLS["calories"]]):
                return False
            if recipe_row[COLS["calories"]] > constraints["calories_max"]:
                return False
        if "calories_min" in constraints:
            if pd.isna(recipe_row[COLS["calories"]]):
                return False
            if recipe_row[COLS["calories"]] < constraints["calories_min"]:
                return False
        # duration
        if "duration_max" in constraints:
            if COLS["duration"] not in recipe_row or pd.isna(recipe_row[COLS["duration"]]):
                return False
            if float(recipe_row[COLS["duration"]]) > constraints["duration_max"]:
                return False
        # protein
        if "protein_min" in constraints:
            if pd.isna(recipe_row[COLS["protein"]]):
                return False
            if recipe_row[COLS["protein"]] < constraints["protein_min"]:
                return False
        # sodium
        if "sodium_max" in constraints:
            if pd.isna(recipe_row[COLS["sodium"]]):
                return False
            if recipe_row[COLS["sodium"]] > constraints["sodium_max"]:
                return False
        # ingredients include
        if "ingredients_include" in constraints:
            ing_field = recipe_row.get(COLS["ingredients"], "")
            if not isinstance(ing_field, str):
                return False
            ing_field_low = ing_field.lower()
            for ing in constraints["ingredients_include"]:
                if ing.lower() not in ing_field_low:
                    return False
        # fiber
        if "fiber_min" in constraints:
            if pd.isna(recipe_row[COLS["fiber"]]):
                return False
            if recipe_row[COLS["fiber"]] < constraints["fiber_min"]:
                return False
        # balanced: require numeric calories/protein/total_fat exists; simple heuristics
        if constraints.get("balanced"):
            if any(pd.isna(recipe_row.get(COLS.get(k), None)) for k in ["calories", "protein", "total_fat"]):
                return False
            # require moderate calories <= 700 and protein >= 10
            if recipe_row[COLS["calories"]] > 700:
                return False
            if recipe_row[COLS["protein"]] < 8:
                return False
        return True
    except Exception as e:
        # if any unexpected error, return False
        return False


def find_any_matching_recipe(df: pd.DataFrame, constraints: Dict[str, Any]) -> Optional[pd.Series]:
    """
    Returns first matching recipe row (as Series) that satisfies constraints, or None.
    """
    # quick vectorized filter approach
    mask = pd.Series(True, index=df.index)

    if "calories_max" in constraints and COLS["calories"] in df.columns:
        mask &= df[COLS["calories"]].fillna(np.inf) <= constraints["calories_max"]
    if "calories_min" in constraints and COLS["calories"] in df.columns:
        mask &= df[COLS["calories"]].fillna(-np.inf) >= constraints["calories_min"]
    if "duration_max" in constraints and COLS["duration"] in df.columns:
        mask &= df[COLS["duration"]].fillna(np.inf) <= constraints["duration_max"]
    if "protein_min" in constraints and COLS["protein"] in df.columns:
        mask &= df[COLS["protein"]].fillna(-np.inf) >= constraints["protein_min"]
    if "sodium_max" in constraints and COLS["sodium"] in df.columns:
        mask &= df[COLS["sodium"]].fillna(np.inf) <= constraints["sodium_max"]
    if "fiber_min" in constraints and COLS["fiber"] in df.columns:
        mask &= df[COLS["fiber"]].fillna(-np.inf) >= constraints["fiber_min"]
    if "ingredients_include" in constraints and COLS["ingredients"] in df.columns:
        for ing in constraints["ingredients_include"]:
            mask &= df[COLS["ingredients"]].str.contains(ing.lower(), na=False)

    # balanced: apply heuristic
    if constraints.get("balanced"):
        mask &= df[COLS["calories"]].fillna(np.inf) <= 700
        mask &= df[COLS["protein"]].fillna(-np.inf) >= 8
        mask &= df[COLS["total_fat"]].fillna(-np.inf) > 0

    filtered = df[mask]
    if len(filtered) == 0:
        return None
    return filtered.iloc[0]


# -------------------------
# Main evaluation
# -------------------------
def evaluate(inferenced_data: Dict[str, Any], df: pd.DataFrame, split_key: str = "test"):
    """
    Expects inferenced_data[split_key] to be [predictions_list, references_list, instructions_list]
    Each element in predictions_list corresponds to a model output. In your variant setup, you might
    have predictions as lists of variants per recipe; we attempt to handle either flat lists or nested.
    """
    preds, refs, insts = inferenced_data[split_key]

    # Normalize lists: if preds is a dict or nested list-of-lists, flatten into per-variant instruction objects.
    # We'll assume 'insts' is parallel: each entry may contain 'variants' list (per your earlier generation),
    # or insts may already be the list of generated 'instruction' strings.
    # We'll construct a list of dicts: { "instruction": ..., "output": ... }
    records = []

    # Case A: you saved a structure of variants per example (list of dicts with "instruction" and "output")
    # We'll detect types and normalize.
    if isinstance(preds, list) and len(preds) > 0 and isinstance(preds[0], dict) and 'output' in preds[0]:
        # already normalized predictions as dicts
        normalized = preds
    else:
        # Attempt to pair insts and preds element-wise
        normalized = []
        # If preds entries are lists of outputs per example and insts are lists of variants (list-of-dicts)
        if isinstance(preds, list) and len(preds) == len(insts):
            for p, i in zip(preds, insts):
                # If i is list of variant dicts
                if isinstance(i, list):
                    # each variant has instruction + output but model output might be in p list in same order
                    # Best effort: if p is list with same length, pair them; otherwise assume p is single str
                    if isinstance(p, list) and len(p) == len(i):
                        for out, var in zip(p, i):
                            normalized.append({"instruction": var.get("instruction") if isinstance(var, dict) else str(var),
                                               "output": out})
                    else:
                        # pair each variant instruction with same p (string)
                        for var in i:
                            normalized.append({"instruction": var.get("instruction") if isinstance(var, dict) else str(var),
                                               "output": p if isinstance(p, str) else str(p)})
                else:
                    # i is a single instruction string
                    normalized.append({"instruction": i, "output": p})
        else:
            # fallback: if insts is list of variant dicts
            if isinstance(insts, list):
                for entry in insts:
                    if isinstance(entry, dict) and 'instruction' in entry and 'output' in entry:
                        normalized.append({"instruction": entry['instruction'], "output": entry['output']})
                    elif isinstance(entry, dict) and 'instruction' in entry:
                        normalized.append({"instruction": entry['instruction'], "output": ""})
                    else:
                        # can't interpret; make best-effort
                        normalized.append({"instruction": str(entry), "output": ""})
            else:
                raise ValueError("Unable to normalize predictions/instructions structure. Inspect your inferenced_data format.")

    # Now evaluate each normalized pair
    for item in normalized:
        instruction = item.get("instruction", "")
        output = item.get("output", "")
        title, claims, mentioned_ings = extract_title_and_claims(output)
        constraints = parse_instruction_to_constraints(instruction)

        title_present = False
        recipe_row = None
        if title:
            # try exact or case-insensitive match in df titles
            matches = df[df[COLS["title"]].str.lower() == title.lower()]
            if len(matches) == 0:
                # try substring match
                matches = df[df[COLS["title"]].str.lower().str.contains(re.escape(title.lower()), na=False)]
            if len(matches) > 0:
                title_present = True
                recipe_row = matches.iloc[0]

        # Check if the title's referenced recipe (if present) satisfies constraints
        satisfied_by_returned = False
        if title_present and recipe_row is not None and constraints:
            satisfied_by_returned = recipe_satisfies_constraints(recipe_row, constraints)
        elif title_present and (not constraints):
            # if no constraints parsed, consider it satisfied_by_returned = True (no constraint)
            satisfied_by_returned = True

        # Check whether any recipe in dataset satisfies constraints (for recall/TP/FN)
        any_matching = True if not constraints else (find_any_matching_recipe(df, constraints) is not None)

        # Hallucination: title not found in dataset
        hallucinated = not title_present

        records.append({
            "instruction": instruction,
            "model_output": output,
            "parsed_title": title,
            "title_in_dataset": title_present,
            "satisfied_by_returned": satisfied_by_returned,
            "dataset_has_any_matching": any_matching,
            "hallucinated": hallucinated,
            "constraints": constraints,
            "claims": claims,
            "mentioned_ingredients": mentioned_ings
        })

    # Compute metrics:
    TP = 0  # dataset has a matching recipe AND model returned an example that satisfied constraints
    FP = 0  # model returned a recipe that DOES NOT satisfy constraints OR hallucination where dataset had none
    FN = 0  # dataset has a matching recipe but model either returned non-matching recipe or hallucinated/no answer

    for r in records:
        if r["dataset_has_any_matching"]:
            # ground truth: there exists an example satisfying constraints
            if r["title_in_dataset"] and r["satisfied_by_returned"]:
                TP += 1
            else:
                FN += 1
        else:
            # dataset has no matching recipe
            if r["title_in_dataset"]:
                # model returned a recipe (but dataset actually doesn't have matching recipe!) -> FP
                FP += 1
            else:
                # model did not return any recipe (and none exists): true negative (not used in precision/recall)
                pass

    # Another kind of FP: returned a recipe but it didn't satisfy constraints (even though dataset has matching recipe).
    # The above counts that as FN (because dataset_has_any_matching true and model didn't return matching) â€” acceptable.
    # Also count hallucinated returns (title not present) as FP when dataset_has_any_matching is False or True
    # For clarity compute hallucination rate:
    total_outputs = len(records)
    halluc_count = sum(1 for r in records if r["hallucinated"])
    halluc_rate = halluc_count / total_outputs if total_outputs > 0 else 0.0

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    summary_df = pd.DataFrame(records)
    summary_df.to_csv(OUTPUT_SUMMARY_CSV, index=False)

    metrics = {
        "total_examples": total_outputs,
        "TP": TP,
        "FP": FP,
        "FN": FN,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "hallucination_count": halluc_count,
        "hallucination_rate": halluc_rate,
        "summary_csv": OUTPUT_SUMMARY_CSV
    }

    return summary_df, metrics


# -------------------------
# Entry point
# -------------------------
if __name__ == "__main__":
    # Load files
    print("Loading JSON:", JSON_PATH)
    inferenced = load_inputs(JSON_PATH)
    print("Loading hummus CSV:", CSV_PATH)
    df = load_df(CSV_PATH)

    # Choose split to evaluate - 'test' or 'val' - modify if needed
    split_to_eval = "test"
    if split_to_eval not in inferenced:
        # fallback to 'val' or first key
        if "val" in inferenced:
            split_to_eval = "val"
        else:
            split_to_eval = list(inferenced.keys())[0]

    print(f"Evaluating split: {split_to_eval}")
    summary_df, metrics = evaluate(inferenced, df, split_key=split_to_eval)

    print("\n=== Metrics ===")
    for k, v in metrics.items():
        print(f"{k}: {v}")
    print(f"\nDetailed summary saved to {metrics['summary_csv']}")


Loading JSON: ../results/base-inference-r8.json
Loading hummus CSV: ../data/pp_recipes.csv


  df = pd.read_csv(csv_path, index_col=0, low_memory=True)


Evaluating split: test

=== Metrics ===
total_examples: 515
TP: 14
FP: 0
FN: 501
precision: 1.0
recall: 0.027184466019417475
f1: 0.05293005671077505
hallucination_count: 488
hallucination_rate: 0.9475728155339805
summary_csv: constraint_evaluation_summary_base.csv

Detailed summary saved to constraint_evaluation_summary_base.csv


# Fine-tined model evaluation

In [8]:
# Load the model and tokenizer from the Hugging Face Hub
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "azimidokht/recipe-recom-fine-tuned",  # your HF repo
    max_seq_length = 2048,  # or the same you used during training
    dtype = None,           # or "float16"/"bfloat16" depending on your GPU
    load_in_4bit = True,    # set to True if you trained/quantized with 4-bit
)

==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

## inference

In [None]:
inferenced_data_tuned = {}

print("\nGenerating predictions for the validation dataset...")
val_predictions_tuned, val_references_tuned, val_inst_tuned = generate_predictions(val_dataset, model, tokenizer)
print("Generating predictions for the test dataset...")
test_predictions_tuned, test_references_tuned, test_inst_tuned = generate_predictions(test_dataset, model, tokenizer)




Generating predictions for the validation dataset...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 515/515 [18:59<00:00,  2.21s/it]


Generating predictions for the test dataset...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 515/515 [18:49<00:00,  2.19s/it]


Validation Rouge results: {'rouge1': np.float64(0.40453685922545124), 'rouge2': np.float64(0.14331750751282368), 'rougeL': np.float64(0.3867874845240147), 'rougeLsum': np.float64(0.38656809820506405)}
Validation BLEU results: {'bleu': 0.18616916303170597, 'precisions': [0.578895096213532, 0.33156498673740054, 0.12156583629893239, 0.060829493087557605], 'brevity_penalty': 0.9591448031459188, 'length_ratio': 0.9599570968895245, 'translation_length': 8055, 'reference_length': 8391}
Test Rouge results: {'rouge1': np.float64(0.3961609293564492), 'rouge2': np.float64(0.1340261275115724), 'rougeL': np.float64(0.3781028237338431), 'rougeLsum': np.float64(0.37816379311891846)}
Test BLEU results: {'bleu': 0.1753465742384433, 'precisions': [0.5765054294175715, 0.32468045855843986, 0.11082838563754595, 0.05320933069065406], 'brevity_penalty': 0.9619947380591407, 'length_ratio': 0.9626989783796627, 'translation_length': 8104, 'reference_length': 8418}


In [None]:
inferenced_data_tuned["test"] = [val_predictions_tuned, val_references_tuned, val_inst_tuned]
inferenced_data_tuned["val"] = [test_predictions_tuned, test_references_tuned, test_inst_tuned]

store(inferenced_data_tuned, "./fine-tuned-inference-r8.json")

val_results_tuned = evaluate(val_predictions_tuned, val_references_tuned, metrics=["rouge", "bleu"])
print("Validation Rouge results:", val_results_tuned["rouge"])
print("Validation BLEU results:", val_results_tuned["bleu"])

test_results_tuned = evaluate(test_predictions_tuned, test_references_tuned, metrics=["rouge", "bleu"])
print("Test Rouge results:", test_results_tuned["rouge"])
print("Test BLEU results:", test_results_tuned["bleu"])

In [2]:
import json
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple, Optional

import numpy as np
import pandas as pd

# CONFIG - change file names / keys if needed
JSON_PATH = "../results/fine-tuned-inference-r8.json"   # <- path to your jsondump
CSV_PATH = "../data/pp_recipes.csv"          # <- hummus recipes CSV
OUTPUT_SUMMARY_CSV = "constraint_evaluation_summary.csv"

# Columns mapping from your description (adjust if different)
COLS = {
    "title": "title",
    "calories": "calories [cal]",
    "protein": "protein [g]",
    "sodium": "sodium [mg]",
    "duration": "duration",           # may be string/object - ensure numeric if possible
    "serves": "serves",
    "total_fat": "totalFat [g]",
    "carbs": "totalCarbohydrate [g]",
    "fiber": "dietaryFiber [g]",
    "ingredients": "ingredients"
}

# -------------------------
# Helper functions
# -------------------------
def load_inputs(json_path: str) -> Dict[str, Any]:
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def load_df(csv_path: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path, index_col=0, low_memory=True)
    # Normalise title for matching
    df[COLS["title"]] = df[COLS["title"]].astype(str).str.strip()
    # Try converting numeric columns
    for c in ["calories", "protein", "sodium", "duration", "total_fat", "carbs", "fiber"]:
        colname = COLS.get(c)
        if colname in df.columns:
            # remove non-numeric and coerce
            df[colname] = pd.to_numeric(df[colname].astype(str).str.replace(r"[^\d\.\-]", "", regex=True), errors="coerce")
    # Normalize ingredients column to lowercase string for containment checks
    if COLS["ingredients"] in df.columns:
        df[COLS["ingredients"]] = df[COLS["ingredients"]].astype(str).str.lower()
    return df


def extract_title_and_claims(model_output: str) -> Tuple[Optional[str], Dict[str, float], List[str]]:
    """
    Extract a claimed title and numeric claims from model output.
    Returns (title, numeric_claims, list_of_mentioned_ingredients)
    Numeric claims keys: calories, protein, sodium, duration, serves, fiber, total_fat, carbs
    This function uses heuristic regex matching on outputs like:
      "Hummus Delight - 350 calories, 15g protein, ready in 20 minutes."
      "Title - Takes 12 minutes, 300.0 calories"
    """
    claims = {}
    ing_list = []

    if not isinstance(model_output, str):
        return None, claims, ing_list

    # Attempt to parse "Title - ..." or "Title: ..." or "Title â€” ..."
    m = re.match(r'^\s*([^\-\â€“\â€”\:]+?)\s*(?:[-\â€“\â€”\:])\s*(.*)$', model_output.strip())
    if m:
        title = m.group(1).strip()
        rest = m.group(2)
    else:
        # fallback: first token phrase before comma
        parts = model_output.split(",")
        title = parts[0].strip()
        rest = ", ".join(parts[1:]) if len(parts) > 1 else ""

    # numeric captures
    # calories: "350 calories" or "350.0 calories"
    mcal = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*(?:calories|cal|kcal)\b', model_output, flags=re.I)
    if mcal:
        claims["calories"] = float(mcal.group(1))
    # protein: "15g protein"
    mprot = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*g\s*(?:protein)\b', model_output, flags=re.I)
    if mprot:
        claims["protein"] = float(mprot.group(1))
    # sodium: "180mg sodium"
    msod = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*mg\s*(?:sodium)\b', model_output, flags=re.I)
    if msod:
        claims["sodium"] = float(msod.group(1))
    # duration: "ready in 20 minutes", "Takes 12 minutes"
    mtime = re.search(r'(\d+)\s*(?:minutes|min|mins)\b', model_output, flags=re.I)
    if mtime:
        claims["duration"] = float(mtime.group(1))
    # serves: "serves 4"
    mserves = re.search(r'serves\s*(\d+)', model_output, flags=re.I)
    if mserves:
        claims["serves"] = int(mserves.group(1))
    # fiber: "5g fiber"
    mfib = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*g\s*(?:dietary fiber|fiber)\b', model_output, flags=re.I)
    if mfib:
        claims["fiber"] = float(mfib.group(1))
    # total_fat: "9g fat"
    mfat = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*g\s*(?:fat)\b', model_output, flags=re.I)
    if mfat:
        claims["total_fat"] = float(mfat.group(1))
    # carbs: "30g carbs"
    mcarb = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*g\s*(?:carb|carbs|carbohydrate|carbohydrates)\b', model_output, flags=re.I)
    if mcarb:
        claims["carbs"] = float(mcarb.group(1))

    # Extract simple ingredients mentioned like "Features chickpeas and tahini"
    # Look for "Features X and Y" or "using X and Y" patterns
    ming = re.search(r'(?:features|using|uses|with)\s+([a-zA-Z0-9\s\-\']+?)\s*(?:,|\band\b|\.)', model_output, flags=re.I)
    if ming:
        # split on 'and' or commas
        text = ming.group(1)
        candidates = re.split(r'\band\b|,', text, flags=re.I)
        ing_list = [c.strip().lower() for c in candidates if c.strip()]

    # fallback: look for "using X and Y." explicit pattern
    m2 = re.search(r'using\s+([^\.,]+?)\.', model_output, flags=re.I)
    if m2:
        parts = re.split(r'\band\b|,', m2.group(1))
        ing_list = [p.strip().lower() for p in parts if p.strip()]

    return title if title else None, claims, ing_list


def parse_instruction_to_constraints(instruction: str) -> Dict[str, Any]:
    """
    Given an instruction string (as generated in variants), return a constraints dict.
    Supports the variant instruction formats you provided.
    """
    instr = instruction.lower()
    constraints = {}

    # calories: "under 400 calories" or "around 400 calories" or "under 400"
    m = re.search(r'under\s+(\d+)', instr)
    if m:
        constraints["calories_max"] = float(m.group(1))
    m2 = re.search(r'around\s+(\d+)', instr)
    if m2:
        val = float(m2.group(1))
        # allow +/- 50 calories tolerance for "around"
        constraints["calories_min"] = max(0.0, val - 50)
        constraints["calories_max"] = val + 50

    # time-based: "less than X minutes"
    mtime = re.search(r'less than\s+(\d+)\s*minutes', instr)
    if mtime:
        constraints["duration_max"] = float(mtime.group(1))

    # protein: "at least Xg protein" or "high-protein at least X"
    mprot = re.search(r'at least\s+(\d+)\s*g\s*protein', instr)
    if mprot:
        constraints["protein_min"] = float(mprot.group(1))
    else:
        # "high-protein recipe with at least 12g protein." or "high-protein with at least 10"
        mprot2 = re.search(r'(\d+)\s*g\s*protein', instr)
        if mprot2 and 'high-protein' in instr:
            constraints["protein_min"] = float(mprot2.group(1))

    # sodium: "under Xmg sodium"
    msod = re.search(r'under\s+(\d+)\s*mg\s*sodium', instr)
    if msod:
        constraints["sodium_max"] = float(msod.group(1))

    # ingredient-based: "using X and Y"
    ming = re.search(r'suggest a recipe using\s+([a-z0-9\s\-\']+?)\s+and\s+([a-z0-9\s\-\']+)', instr)
    if ming:
        constraints["ingredients_include"] = [ming.group(1).strip(), ming.group(2).strip()]

    # fiber: "at least Xg fiber" or "high-fiber at least"
    mf = re.search(r'at least\s+(\d+)\s*g\s*fiber', instr)
    if mf:
        constraints["fiber_min"] = float(mf.group(1))
    mff = re.search(r'high-fiber.*?(\d+)', instr)
    if mff and "fiber" in instr:
        constraints["fiber_min"] = float(mff.group(1))

    # balanced meal: this is generic -> require calories, protein, total_fat present and between some moderate bounds
    if "balanced meal" in instr:
        constraints["balanced"] = True

    # time variant in your code: instruction uses "less than {duration + 10} minutes"
    mquick = re.search(r'less than\s+(\d+)\s*minutes', instr)
    if mquick:
        constraints["duration_max"] = float(mquick.group(1))

    return constraints


def recipe_satisfies_constraints(recipe_row: pd.Series, constraints: Dict[str, Any]) -> bool:
    """
    Returns True if the pandas series (one recipe) satisfies all constraints.
    """
    try:
        # calories
        if "calories_max" in constraints:
            if pd.isna(recipe_row[COLS["calories"]]):
                return False
            if recipe_row[COLS["calories"]] > constraints["calories_max"]:
                return False
        if "calories_min" in constraints:
            if pd.isna(recipe_row[COLS["calories"]]):
                return False
            if recipe_row[COLS["calories"]] < constraints["calories_min"]:
                return False
        # duration
        if "duration_max" in constraints:
            if COLS["duration"] not in recipe_row or pd.isna(recipe_row[COLS["duration"]]):
                return False
            if float(recipe_row[COLS["duration"]]) > constraints["duration_max"]:
                return False
        # protein
        if "protein_min" in constraints:
            if pd.isna(recipe_row[COLS["protein"]]):
                return False
            if recipe_row[COLS["protein"]] < constraints["protein_min"]:
                return False
        # sodium
        if "sodium_max" in constraints:
            if pd.isna(recipe_row[COLS["sodium"]]):
                return False
            if recipe_row[COLS["sodium"]] > constraints["sodium_max"]:
                return False
        # ingredients include
        if "ingredients_include" in constraints:
            ing_field = recipe_row.get(COLS["ingredients"], "")
            if not isinstance(ing_field, str):
                return False
            ing_field_low = ing_field.lower()
            for ing in constraints["ingredients_include"]:
                if ing.lower() not in ing_field_low:
                    return False
        # fiber
        if "fiber_min" in constraints:
            if pd.isna(recipe_row[COLS["fiber"]]):
                return False
            if recipe_row[COLS["fiber"]] < constraints["fiber_min"]:
                return False
        # balanced: require numeric calories/protein/total_fat exists; simple heuristics
        if constraints.get("balanced"):
            if any(pd.isna(recipe_row.get(COLS.get(k), None)) for k in ["calories", "protein", "total_fat"]):
                return False
            # require moderate calories <= 700 and protein >= 10
            if recipe_row[COLS["calories"]] > 700:
                return False
            if recipe_row[COLS["protein"]] < 8:
                return False
        return True
    except Exception as e:
        # if any unexpected error, return False
        return False


def find_any_matching_recipe(df: pd.DataFrame, constraints: Dict[str, Any]) -> Optional[pd.Series]:
    """
    Returns first matching recipe row (as Series) that satisfies constraints, or None.
    """
    # quick vectorized filter approach
    mask = pd.Series(True, index=df.index)

    if "calories_max" in constraints and COLS["calories"] in df.columns:
        mask &= df[COLS["calories"]].fillna(np.inf) <= constraints["calories_max"]
    if "calories_min" in constraints and COLS["calories"] in df.columns:
        mask &= df[COLS["calories"]].fillna(-np.inf) >= constraints["calories_min"]
    if "duration_max" in constraints and COLS["duration"] in df.columns:
        mask &= df[COLS["duration"]].fillna(np.inf) <= constraints["duration_max"]
    if "protein_min" in constraints and COLS["protein"] in df.columns:
        mask &= df[COLS["protein"]].fillna(-np.inf) >= constraints["protein_min"]
    if "sodium_max" in constraints and COLS["sodium"] in df.columns:
        mask &= df[COLS["sodium"]].fillna(np.inf) <= constraints["sodium_max"]
    if "fiber_min" in constraints and COLS["fiber"] in df.columns:
        mask &= df[COLS["fiber"]].fillna(-np.inf) >= constraints["fiber_min"]
    if "ingredients_include" in constraints and COLS["ingredients"] in df.columns:
        for ing in constraints["ingredients_include"]:
            mask &= df[COLS["ingredients"]].str.contains(ing.lower(), na=False)

    # balanced: apply heuristic
    if constraints.get("balanced"):
        mask &= df[COLS["calories"]].fillna(np.inf) <= 700
        mask &= df[COLS["protein"]].fillna(-np.inf) >= 8
        mask &= df[COLS["total_fat"]].fillna(-np.inf) > 0

    filtered = df[mask]
    if len(filtered) == 0:
        return None
    return filtered.iloc[0]


# -------------------------
# Main evaluation
# -------------------------
def evaluate(inferenced_data: Dict[str, Any], df: pd.DataFrame, split_key: str = "test"):
    """
    Expects inferenced_data[split_key] to be [predictions_list, references_list, instructions_list]
    Each element in predictions_list corresponds to a model output. In your variant setup, you might
    have predictions as lists of variants per recipe; we attempt to handle either flat lists or nested.
    """
    preds, refs, insts = inferenced_data[split_key]

    # Normalize lists: if preds is a dict or nested list-of-lists, flatten into per-variant instruction objects.
    # We'll assume 'insts' is parallel: each entry may contain 'variants' list (per your earlier generation),
    # or insts may already be the list of generated 'instruction' strings.
    # We'll construct a list of dicts: { "instruction": ..., "output": ... }
    records = []

    # Case A: you saved a structure of variants per example (list of dicts with "instruction" and "output")
    # We'll detect types and normalize.
    if isinstance(preds, list) and len(preds) > 0 and isinstance(preds[0], dict) and 'output' in preds[0]:
        # already normalized predictions as dicts
        normalized = preds
    else:
        # Attempt to pair insts and preds element-wise
        normalized = []
        # If preds entries are lists of outputs per example and insts are lists of variants (list-of-dicts)
        if isinstance(preds, list) and len(preds) == len(insts):
            for p, i in zip(preds, insts):
                # If i is list of variant dicts
                if isinstance(i, list):
                    # each variant has instruction + output but model output might be in p list in same order
                    # Best effort: if p is list with same length, pair them; otherwise assume p is single str
                    if isinstance(p, list) and len(p) == len(i):
                        for out, var in zip(p, i):
                            normalized.append({"instruction": var.get("instruction") if isinstance(var, dict) else str(var),
                                               "output": out})
                    else:
                        # pair each variant instruction with same p (string)
                        for var in i:
                            normalized.append({"instruction": var.get("instruction") if isinstance(var, dict) else str(var),
                                               "output": p if isinstance(p, str) else str(p)})
                else:
                    # i is a single instruction string
                    normalized.append({"instruction": i, "output": p})
        else:
            # fallback: if insts is list of variant dicts
            if isinstance(insts, list):
                for entry in insts:
                    if isinstance(entry, dict) and 'instruction' in entry and 'output' in entry:
                        normalized.append({"instruction": entry['instruction'], "output": entry['output']})
                    elif isinstance(entry, dict) and 'instruction' in entry:
                        normalized.append({"instruction": entry['instruction'], "output": ""})
                    else:
                        # can't interpret; make best-effort
                        normalized.append({"instruction": str(entry), "output": ""})
            else:
                raise ValueError("Unable to normalize predictions/instructions structure. Inspect your inferenced_data format.")

    # Now evaluate each normalized pair
    for item in normalized:
        instruction = item.get("instruction", "")
        output = item.get("output", "")
        title, claims, mentioned_ings = extract_title_and_claims(output)
        constraints = parse_instruction_to_constraints(instruction)

        title_present = False
        recipe_row = None
        if title:
            # try exact or case-insensitive match in df titles
            matches = df[df[COLS["title"]].str.lower() == title.lower()]
            if len(matches) == 0:
                # try substring match
                matches = df[df[COLS["title"]].str.lower().str.contains(re.escape(title.lower()), na=False)]
            if len(matches) > 0:
                title_present = True
                recipe_row = matches.iloc[0]

        # Check if the title's referenced recipe (if present) satisfies constraints
        satisfied_by_returned = False
        if title_present and recipe_row is not None and constraints:
            satisfied_by_returned = recipe_satisfies_constraints(recipe_row, constraints)
        elif title_present and (not constraints):
            # if no constraints parsed, consider it satisfied_by_returned = True (no constraint)
            satisfied_by_returned = True

        # Check whether any recipe in dataset satisfies constraints (for recall/TP/FN)
        any_matching = True if not constraints else (find_any_matching_recipe(df, constraints) is not None)

        # Hallucination: title not found in dataset
        hallucinated = not title_present

        records.append({
            "instruction": instruction,
            "model_output": output,
            "parsed_title": title,
            "title_in_dataset": title_present,
            "satisfied_by_returned": satisfied_by_returned,
            "dataset_has_any_matching": any_matching,
            "hallucinated": hallucinated,
            "constraints": constraints,
            "claims": claims,
            "mentioned_ingredients": mentioned_ings
        })

    # Compute metrics:
    TP = 0  # dataset has a matching recipe AND model returned an example that satisfied constraints
    FP = 0  # model returned a recipe that DOES NOT satisfy constraints OR hallucination where dataset had none
    FN = 0  # dataset has a matching recipe but model either returned non-matching recipe or hallucinated/no answer

    for r in records:
        if r["dataset_has_any_matching"]:
            # ground truth: there exists an example satisfying constraints
            if r["title_in_dataset"] and r["satisfied_by_returned"]:
                TP += 1
            else:
                FN += 1
        else:
            # dataset has no matching recipe
            if r["title_in_dataset"]:
                # model returned a recipe (but dataset actually doesn't have matching recipe!) -> FP
                FP += 1
            else:
                # model did not return any recipe (and none exists): true negative (not used in precision/recall)
                pass

    # Another kind of FP: returned a recipe but it didn't satisfy constraints (even though dataset has matching recipe).
    # The above counts that as FN (because dataset_has_any_matching true and model didn't return matching) â€” acceptable.
    # Also count hallucinated returns (title not present) as FP when dataset_has_any_matching is False or True
    # For clarity compute hallucination rate:
    total_outputs = len(records)
    halluc_count = sum(1 for r in records if r["hallucinated"])
    halluc_rate = halluc_count / total_outputs if total_outputs > 0 else 0.0

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    summary_df = pd.DataFrame(records)
    summary_df.to_csv(OUTPUT_SUMMARY_CSV, index=False)

    metrics = {
        "total_examples": total_outputs,
        "TP": TP,
        "FP": FP,
        "FN": FN,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "hallucination_count": halluc_count,
        "hallucination_rate": halluc_rate,
        "summary_csv": OUTPUT_SUMMARY_CSV
    }

    return summary_df, metrics


# -------------------------
# Entry point
# -------------------------
if __name__ == "__main__":
    # Load files
    print("Loading JSON:", JSON_PATH)
    inferenced = load_inputs(JSON_PATH)
    print("Loading hummus CSV:", CSV_PATH)
    df = load_df(CSV_PATH)

    # Choose split to evaluate - 'test' or 'val' - modify if needed
    split_to_eval = "test"
    if split_to_eval not in inferenced:
        # fallback to 'val' or first key
        if "val" in inferenced:
            split_to_eval = "val"
        else:
            split_to_eval = list(inferenced.keys())[0]

    print(f"Evaluating split: {split_to_eval}")
    summary_df, metrics = evaluate(inferenced, df, split_key=split_to_eval)

    print("\n=== Metrics ===")
    for k, v in metrics.items():
        print(f"{k}: {v}")
    print(f"\nDetailed summary saved to {metrics['summary_csv']}")


Loading JSON: ../results/fine-tuned-inference-r8.json
Loading hummus CSV: ../data/pp_recipes.csv


  df = pd.read_csv(csv_path, index_col=0, low_memory=True)


Evaluating split: test

=== Metrics ===
total_examples: 515
TP: 172
FP: 0
FN: 343
precision: 1.0
recall: 0.3339805825242718
f1: 0.5007278020378456
hallucination_count: 121
hallucination_rate: 0.23495145631067962
summary_csv: constraint_evaluation_summary.csv

Detailed summary saved to constraint_evaluation_summary.csv
