In [23]:
import pandas as pd
import re

# -------- Protein terms --------
protein_terms = [
    "chicken", "turkey", "beef", "lamb", "pork",
    "salmon", "tuna", "shrimp", "cod", "tilapia",
    "egg", "eggs", "egg white",
    "tofu", "tempeh",
    "cottage cheese", "greek yogurt", "skyr",
    "whey", "protein powder"
]

# -------- Carb terms --------
carb_terms = [
    "rice","potato","bread","pasta","spaghetti","tortilla",
    "quinoa","oats","barley","couscous","noodle","ramen",
    "honey","sugar","maple syrup","brown sugar","corn syrup"
]

def detect_terms(lst, keywords):
    found = []
    for ing in lst:
        s = ing.lower()
        for kw in keywords:
            if kw in s:
                found.append(kw)
    return ", ".join(sorted(set(found))) if found else ""

def approx_protein_category(ingredients):
    text = " ".join(ingredients).lower()
    score = sum(1 for kw in protein_terms if kw in text)
    if score >= 2:
        return "high"
    elif score == 1:
        return "moderate"
    else:
        return "low"

def approx_carb_category(ingredients):
    text = " ".join(ingredients).lower()
    score = sum(1 for kw in carb_terms if kw in text)
    if score == 0:
        return "low"
    elif score == 1:
        return "moderate"
    else:
        return "high"

def approx_meets_constraints(protein_cat, carb_cat):
    # simple heuristic: high/moderate protein and not high carb
    return (protein_cat in ["moderate", "high"]) and (carb_cat != "high")

def improve_steps(title):
    return (
        f"Prepare the main protein for {title.lower()} by seasoning and cooking it thoroughly. "
        f"Limit or replace high-carb sides with low-carb options like vegetables. "
        f"Keep the cooking method simple and focused on the protein."
    )

def teacher(row):
    ing = row["ingredients_raw"]  # list of ingredients

    result = {}
    
    # protein source
    result["protein_primary_source"] = detect_terms(ing, protein_terms) or ""
    
    # carbs
    result["carb_sources"] = detect_terms(ing, carb_terms)
    
    # categories (approx based only on ingredients)
    p_cat = approx_protein_category(ing)
    c_cat = approx_carb_category(ing)
    result["protein_category"] = p_cat
    result["carb_category"] = c_cat
    
    # constraints (approx)
    ok = approx_meets_constraints(p_cat, c_cat)
    result["meets_constraints"] = ok
    result["violation_reason"] = "" if ok else "approximate macro balance violates high-protein/low-carb target"
    
    # modifiable: default True, you judge better than this
    result["can_be_modified"] = True
    result["substitution_notes"] = "Replace high-carb items (bread, rice, pasta, sugar) with low-carb vegetables or omit them."
    
    # improved steps
    result["improved_version_steps"] = improve_steps(row["title"])
    
    # complexity: keep it simple
    result["complexity_score"] = 2
    
    return result


In [24]:
import ast

# Load manually annotated batches
batch1 = pd.read_csv("../data/annotated/platelogic_manual_batch1.csv", encoding="latin1")
batch2 = pd.read_csv("../data/annotated/platelogic_manual_batch2.csv", encoding="latin1")

print("Batch1 rows:", len(batch1))
print("Batch2 rows:", len(batch2))
batch1.head(2)


Batch1 rows: 50
Batch2 rows: 50


Unnamed: 0,recipe_id,title,ingredients_raw,protein_primary_source,carb_sources,protein_category,carb_category,meets_constraints,violation_reason,can_be_modified,substitution_notes,improved_version_steps,complexity_score
0,63560,granny is licking her chops,"['pork chops', 'mango chutney', 'granny smith ...",pork chops,"mango chutney, apple",moderate,high,False,carbs >20g from chutney and apple,True,Replace mango chutney with lemon-mustard glaze...,"Season pork chops with salt, pepper, and musta...",2
1,165944,crock pot chicken bbq,"['boneless skinless chicken breasts', 'water',...",chicken breast,barbecue sauce,high,low,True,,True,Use sugar-free barbecue sauce or reduce quantity,"Season chicken breast with cumin, vinegar, and...",2


In [25]:
def to_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except Exception:
            return []
    return []

for df in (batch1, batch2):
    df["ingredients_raw"] = df["ingredients_raw"].apply(to_list)

batch1["ingredients_raw"].iloc[0], type(batch1["ingredients_raw"].iloc[0])


(['pork chops', 'mango chutney', 'granny smith apple', 'sharp cheddar cheese'],
 list)

In [26]:
# Combine both batches into one DataFrame
batches = pd.concat([batch1, batch2], ignore_index=True)
print("Total labeled rows:", len(batches))

# Run teacher on all rows
teacher_outputs = batches.apply(teacher, axis=1, result_type="expand")
teacher_outputs.head(3)


Total labeled rows: 100


Unnamed: 0,protein_primary_source,carb_sources,protein_category,carb_category,meets_constraints,violation_reason,can_be_modified,substitution_notes,improved_version_steps,complexity_score
0,pork,,moderate,low,True,,True,"Replace high-carb items (bread, rice, pasta, s...",Prepare the main protein for granny is licking...,2
1,chicken,,moderate,low,True,,True,"Replace high-carb items (bread, rice, pasta, s...",Prepare the main protein for crock pot chicken...,2
2,"beef, pork",,high,low,True,,True,"Replace high-carb items (bread, rice, pasta, s...",Prepare the main protein for savory smothered ...,2


In [27]:
cols_to_compare = [
    "protein_primary_source",
    "carb_sources",
    "protein_category",
    "carb_category",
    "meets_constraints"
]


In [28]:
results = {}

for col in cols_to_compare:
    human = batches[col].astype(str).str.strip().str.lower()
    auto = teacher_outputs[col].astype(str).str.strip().str.lower()
    accuracy = (human == auto).mean()
    results[col] = accuracy

results


{'protein_primary_source': np.float64(0.16),
 'carb_sources': np.float64(0.06),
 'protein_category': np.float64(0.4),
 'carb_category': np.float64(0.63),
 'meets_constraints': np.float64(0.64)}

In [29]:
disagreements = []

for col in cols_to_compare:
    human = batches[col].astype(str).str.strip().str.lower()
    auto = teacher_outputs[col].astype(str).str.strip().str.lower()
    mask = human != auto
    
    temp = batches.loc[mask, [
        "recipe_id","title","ingredients_raw",
        "protein_primary_source","carb_sources",
        "protein_category","carb_category","meets_constraints"
    ]].copy()
    
    temp[f"teacher_{col}"] = teacher_outputs.loc[mask, col].values
    temp["disagreed_column"] = col
    disagreements.append(temp)

if disagreements:
    qa_report = pd.concat(disagreements, ignore_index=True)

# Convert lists to strings so drop_duplicates works
    for col in qa_report.columns:
        qa_report[col] = qa_report[col].apply(lambda x: str(x) if isinstance(x, list) else x)

    qa_report = qa_report.drop_duplicates().reset_index(drop=True)
else:
    qa_report = pd.DataFrame()

qa_report.to_csv("../data/annotated/platelogic_QA_report.csv", index=False)
print("Saved QA report with rows:", len(qa_report))


Saved QA report with rows: 311


In [30]:
for col in cols_to_compare:
    human = batches[col].astype(str).str.strip().str.lower()
    auto = teacher_outputs[col].astype(str).str.strip().str.lower()
    acc = (human == auto).mean()
    mismatches = (human != auto).sum()
    print(f"{col}: accuracy={acc:.2f}, mismatches={mismatches}")


protein_primary_source: accuracy=0.16, mismatches=84
carb_sources: accuracy=0.06, mismatches=94
protein_category: accuracy=0.40, mismatches=60
carb_category: accuracy=0.63, mismatches=37
meets_constraints: accuracy=0.64, mismatches=36
