In [17]:
import pandas as pd
import ast

# Load pool of 300 candidates
pool = pd.read_csv("../data/cleaned/platelogic_annotation_pool_300.csv")

# Load manual batches
batch1 = pd.read_csv("../data/annotated/platelogic_manual_batch1.csv")
batch2 = pd.read_csv("../data/annotated/platelogic_manual_batch2.csv", encoding="latin1")

manual = pd.concat([batch1, batch2], ignore_index=True)

print("Pool size:", len(pool))
print("Manual size:", len(manual))
manual.head(3)


Pool size: 300
Manual size: 100


Unnamed: 0,recipe_id,title,ingredients_raw,protein_primary_source,carb_sources,protein_category,carb_category,meets_constraints,violation_reason,can_be_modified,substitution_notes,improved_version_steps,complexity_score
0,63560,granny is licking her chops,"['pork chops', 'mango chutney', 'granny smith ...",pork chops,"mango chutney, apple",moderate,high,False,carbs >20g from chutney and apple,True,Replace mango chutney with lemon-mustard glaze...,"Season pork chops with salt, pepper, and musta...",2
1,165944,crock pot chicken bbq,"['boneless skinless chicken breasts', 'water',...",chicken breast,barbecue sauce,high,low,True,,True,Use sugar-free barbecue sauce or reduce quantity,"Season chicken breast with cumin, vinegar, and...",2
2,193587,savory smothered pork chops,"['pork chops', 'salt', 'pepper', 'fresh rosema...",pork chop,shallots,high,low,True,,True,Replace butter with olive oil to lower saturat...,Season pork chops with salt and pepper. Sear i...,2


In [18]:
protein_terms = [
    "chicken","turkey","beef","lamb","pork",
    "salmon","tuna","shrimp","cod","tilapia",
    "egg","eggs","egg white",
    "tofu","tempeh",
    "cottage cheese","greek yogurt","skyr",
    "whey","protein powder"
]

carb_terms = [
    "rice","potato","bread","pasta","spaghetti","tortilla",
    "quinoa","oats","barley","couscous","noodle","ramen",
    "honey","sugar","maple syrup","brown sugar","corn syrup"
]

def to_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except Exception:
            return [x]
    return []

def detect_terms(lst, keywords):
    found = []
    for ing in lst:
        s = str(ing).lower()
        for kw in keywords:
            if kw in s:
                found.append(kw)
    return ", ".join(sorted(set(found))) if found else ""

def protein_category(protein):
    if protein >= 45:
        return "high"
    elif protein >= 30:
        return "moderate"
    return "low"

def carb_category(carbs):
    if carbs <= 8:
        return "low"
    elif carbs <= 15:
        return "moderate"
    return "high"

def meets_constraints(protein, carbs):
    return (protein >= 30) and (carbs <= 20)

def improve_steps(title):
    return (
        f"Season the main protein and cook until done. "
        f"Use low-carb sides and avoid added sugars in this {title.lower()} recipe."
    )

def teacher_for_pool(row):
    ing_list = to_list(row["ingredients"])
    protein = row["protein"]
    carbs = row["carbs"]

    out = {}
    out["recipe_id"] = int(row["id"])
    out["title"] = row["name"]
    out["ingredients_raw"] = ing_list

    out["protein_primary_source"] = detect_terms(ing_list, protein_terms)
    out["carb_sources"] = detect_terms(ing_list, carb_terms)
    out["protein_category"] = protein_category(protein)
    out["carb_category"] = carb_category(carbs)

    ok = meets_constraints(protein, carbs)
    out["meets_constraints"] = ok
    out["violation_reason"] = "" if ok else "carbs/protein ratio violates constraints"

    out["can_be_modified"] = True
    out["substitution_notes"] = "Replace high-carb items with low-carb alternatives where present."
    out["improved_version_steps"] = improve_steps(row["name"])
    out["complexity_score"] = 2

    return out


In [12]:
teacher_rows = pool.apply(teacher_for_pool, axis=1, result_type="expand")

print("Teacher-labeled rows:", len(teacher_rows))
teacher_rows.head(3)


Teacher-labeled rows: 300


Unnamed: 0,recipe_id,title,ingredients_raw,protein_primary_source,carb_sources,protein_category,carb_category,meets_constraints,violation_reason,can_be_modified,substitution_notes,improved_version_steps,complexity_score
0,63560,granny is licking her chops,"[pork chops, mango chutney, granny smith apple...",pork,,high,moderate,True,,True,Replace high-carb items with low-carb alternat...,Season the main protein and cook until done. U...,2
1,165944,crock pot chicken bbq,"[boneless skinless chicken breasts, water, wor...",chicken,,high,high,True,,True,Replace high-carb items with low-carb alternat...,Season the main protein and cook until done. U...,2
2,193587,savory smothered pork chops,"[pork chops, salt, pepper, fresh rosemary, oli...","beef, pork",,moderate,low,True,,True,Replace high-carb items with low-carb alternat...,Season the main protein and cook until done. U...,2


In [19]:
# Ensure recipe_id is integer on both sides
teacher_rows["recipe_id"] = teacher_rows["recipe_id"].astype(int)
manual["recipe_id"] = manual["recipe_id"].astype(int)

print("Unique recipe_ids in teacher:", teacher_rows["recipe_id"].nunique())
print("Unique recipe_ids in manual:", manual["recipe_id"].nunique())


Unique recipe_ids in teacher: 300
Unique recipe_ids in manual: 100


In [20]:
# Merge teacher + manual
combined = teacher_rows.merge(
    manual,
    on="recipe_id",
    how="left",
    suffixes=("_teacher", "_manual")
)

combined.head(3)


Unnamed: 0,recipe_id,title_teacher,ingredients_raw_teacher,protein_primary_source_teacher,carb_sources_teacher,protein_category_teacher,carb_category_teacher,meets_constraints_teacher,violation_reason_teacher,can_be_modified_teacher,...,protein_primary_source_manual,carb_sources_manual,protein_category_manual,carb_category_manual,meets_constraints_manual,violation_reason_manual,can_be_modified_manual,substitution_notes_manual,improved_version_steps_manual,complexity_score_manual
0,63560,granny is licking her chops,"[pork chops, mango chutney, granny smith apple...",pork,,high,moderate,True,,True,...,pork chops,"mango chutney, apple",moderate,high,False,carbs >20g from chutney and apple,True,Replace mango chutney with lemon-mustard glaze...,"Season pork chops with salt, pepper, and musta...",2.0
1,165944,crock pot chicken bbq,"[boneless skinless chicken breasts, water, wor...",chicken,,high,high,True,,True,...,chicken breast,barbecue sauce,high,low,True,,True,Use sugar-free barbecue sauce or reduce quantity,"Season chicken breast with cumin, vinegar, and...",2.0
2,193587,savory smothered pork chops,"[pork chops, salt, pepper, fresh rosemary, oli...","beef, pork",,moderate,low,True,,True,...,pork chop,shallots,high,low,True,,True,Replace butter with olive oil to lower saturat...,Season pork chops with salt and pepper. Sear i...,2.0


In [22]:
final_cols = [
    "recipe_id",
    "title",
    "ingredients_raw",
    "protein_primary_source",
    "carb_sources",
    "protein_category",
    "carb_category",
    "meets_constraints",
    "violation_reason",
    "can_be_modified",
    "substitution_notes",
    "improved_version_steps",
    "complexity_score"
]

final = pd.DataFrame()
final["recipe_id"] = combined["recipe_id"]

for col in final_cols[1:]:
    teacher_col = f"{col}_teacher"
    manual_col = f"{col}_manual"

    # Some fields exist only on teacher side in this notebook (title, ingredients_raw)
    if manual_col in combined.columns:
        teacher_vals = combined[teacher_col] if teacher_col in combined.columns else None
        manual_vals = combined[manual_col]

        if teacher_col in combined.columns:
            final[col] = manual_vals.combine_first(teacher_vals)
        else:
            final[col] = manual_vals
    else:
        # No manual override: just take teacher
        final[col] = combined[teacher_col]

print("Final annotated rows:", len(final))
final.head(5)


Final annotated rows: 300


Unnamed: 0,recipe_id,title,ingredients_raw,protein_primary_source,carb_sources,protein_category,carb_category,meets_constraints,violation_reason,can_be_modified,substitution_notes,improved_version_steps,complexity_score
0,63560,granny is licking her chops,"['pork chops', 'mango chutney', 'granny smith ...",pork chops,"mango chutney, apple",moderate,high,False,carbs >20g from chutney and apple,True,Replace mango chutney with lemon-mustard glaze...,"Season pork chops with salt, pepper, and musta...",2.0
1,165944,crock pot chicken bbq,"['boneless skinless chicken breasts', 'water',...",chicken breast,barbecue sauce,high,low,True,,True,Use sugar-free barbecue sauce or reduce quantity,"Season chicken breast with cumin, vinegar, and...",2.0
2,193587,savory smothered pork chops,"['pork chops', 'salt', 'pepper', 'fresh rosema...",pork chop,shallots,high,low,True,,True,Replace butter with olive oil to lower saturat...,Season pork chops with salt and pepper. Sear i...,2.0
3,292245,hook line and simple,"['salmon fillets', 'salt & freshly ground blac...",salmon,maple syrup,high,low,False,added sugar from maple syrup; carbs exceed limit,True,Remove maple syrup; replace with a tiny amount...,"Season salmon with salt and pepper, whisk sour...",2.0
4,142050,the best parmesan chicken,"['butter', 'dijon mustard', 'worcestershire sa...",chicken breast,breadcrumbs,high,moderate,True,,True,Replace breadcrumbs with almond flour or crush...,"Mix dijon mustard, Worcestershire, and melted ...",2.0


In [25]:
output_path = "../data/annotated/platelogic_annotated_300.csv"
final.to_csv(output_path, index=False)
print("✅ Saved final annotated dataset:", output_path)


✅ Saved final annotated dataset: ../data/annotated/platelogic_annotated_300.csv


In [28]:
import pandas as pd
import json

df = pd.read_csv("../data/annotated/platelogic_annotated_300.csv")
print(len(df))
df.head(3)


300


Unnamed: 0,recipe_id,title,ingredients_raw,protein_primary_source,carb_sources,protein_category,carb_category,meets_constraints,violation_reason,can_be_modified,substitution_notes,improved_version_steps,complexity_score
0,63560,granny is licking her chops,"['pork chops', 'mango chutney', 'granny smith ...",pork chops,"mango chutney, apple",moderate,high,False,carbs >20g from chutney and apple,True,Replace mango chutney with lemon-mustard glaze...,"Season pork chops with salt, pepper, and musta...",2.0
1,165944,crock pot chicken bbq,"['boneless skinless chicken breasts', 'water',...",chicken breast,barbecue sauce,high,low,True,,True,Use sugar-free barbecue sauce or reduce quantity,"Season chicken breast with cumin, vinegar, and...",2.0
2,193587,savory smothered pork chops,"['pork chops', 'salt', 'pepper', 'fresh rosema...",pork chop,shallots,high,low,True,,True,Replace butter with olive oil to lower saturat...,Season pork chops with salt and pepper. Sear i...,2.0


In [29]:
def make_input(row):
    title = row["title"]
    ingredients = row["ingredients_raw"]
    steps = ""  # original steps are not included since the teacher dataset removed them

    # Convert ingredients list to readable string
    if isinstance(ingredients, str):
        ingredients = ingredients.strip()
    else:
        ingredients = ", ".join(ingredients)

    prompt = (
        f"Title: {title}\n"
        f"Ingredients: {ingredients}\n\n"
        f"Goal: Rewrite the cooking steps to produce a high-protein, low-carb version "
        f"while keeping the core flavor.\n"
        f"Constraints: avoid added sugars, avoid starchy carbs, prioritize lean protein.\n\n"
        f"Write the improved version steps."
    )

    return prompt


def make_output(row):
    return row["improved_version_steps"]


In [30]:
training_rows = []

for _, row in df.iterrows():
    item = {
        "input": make_input(row),
        "output": make_output(row)
    }
    training_rows.append(item)

out_path = "../data/annotated/training_pairs.jsonl"

with open(out_path, "w", encoding="utf-8") as f:
    for item in training_rows:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Saved training pairs to {out_path}")


Saved training pairs to ../data/annotated/training_pairs.jsonl


In [32]:
with open("../data/annotated/training_pairs.jsonl", "r", encoding="utf-8") as f:
    lines = f.readlines()

print("Total lines:", len(lines))
print("\nSample line:\n", lines[0][:300], "...")


Total lines: 300

Sample line:
 {"input": "Title: granny is licking her chops\nIngredients: ['pork chops', 'mango chutney', 'granny smith apple', 'sharp cheddar cheese']\n\nGoal: Rewrite the cooking steps to produce a high-protein, low-carb version while keeping the core flavor.\nConstraints: avoid added sugars, avoid starchy carb ...
