In [None]:
import json
import ast
import random
import warnings 
import re
import pandas as pd
from pathlib import Path
from typing import List

In [None]:
# Stage 1 paths: the raw recipe CSV that is read, and the cleaned CSV that is written.
# Later cells rebind IN_PATH / OUT_PATH for their own stage.
IN_PATH  = "full_dataset.csv"
OUT_PATH = "full_dataset_clean.csv"

def collapse_ws(s: str) -> str:
    """Reduce a text field to single-spaced ASCII.

    Applied in order: control characters are replaced by spaces; each
    backslash and the run of non-whitespace characters following it is
    deleted; characters outside ASCII are dropped; runs of whitespace are
    collapsed to one space and the ends are trimmed.
    """
    without_controls = re.sub(r"[\u0000-\u001F\u007F]", " ", s)
    without_escapes = re.sub(r"\\\S+", "", without_controls)
    ascii_only = without_escapes.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"\s+", " ", ascii_only).strip()

def clean_title(s: str) -> str:
    """Clean a recipe title down to letters, spaces, ``&`` and ``-``.

    The title first goes through ``collapse_ws``; every remaining character
    that is not an ASCII letter, space, ampersand or hyphen is then turned
    into a space, and the resulting whitespace is collapsed and trimmed.

    Note: a later cell defines another function with this same name.
    """
    normalized = collapse_ws(s)
    allowed_only = re.sub(r"[^A-Za-z &\-]", " ", normalized)
    return re.sub(r"\s+", " ", allowed_only).strip()

def clean_field(s: str) -> str:
    """Clean a free-text field by delegating to ``collapse_ws``."""
    cleaned = collapse_ws(s)
    return cleaned

# Read every column as text and strip stray whitespace from the header names.
df = pd.read_csv(IN_PATH, dtype=str).rename(columns=str.strip)

# These two columns are not kept in the cleaned output.
df = df.drop(columns=[c for c in ["link", "source"] if c in df.columns], errors="ignore")

kept_cols = ["id","title","ingredients","directions","ner"]

# Guarantee the expected columns exist, filled with missing values if absent.
for c in kept_cols:
    if c not in df.columns:
        df[c] = pd.NA

# Clean only real values. Casting with astype(str) first would turn missing
# cells into the literal text "nan" / "<NA>", which the blank-check and
# dropna below could no longer recognise as missing.
for c in ["ingredients","directions","ner"]:
    df[c] = df[c].map(clean_field, na_action="ignore")

df["title"] = df["title"].map(clean_title, na_action="ignore")

# Treat fields that are empty after cleaning as missing, then drop incomplete rows.
# NOTE(review): "id" is part of kept_cols, so rows without an original id are
# dropped here even though ids are regenerated below - confirm this is intended.
df = df.replace({c: r"^\s*$" for c in kept_cols}, {c: pd.NA for c in kept_cols}, regex=True)
df = df.dropna(subset=kept_cols, how="any")

# Keep only the first row for each distinct cleaned title.
df = df[~df["title"].duplicated(keep="first")].copy()

# Order rows by their original numeric id when at least one id parses as a number.
orig_id = pd.to_numeric(df["id"], errors="coerce")
if orig_id.notna().any():
    df = df.assign(_orig_id=orig_id).sort_values("_orig_id").drop(columns="_orig_id")

# Renumber ids 1..N in the final row order.
df = df.reset_index(drop=True)
df["id"] = df.index + 1

df.to_csv(OUT_PATH, index=False)
print(f"Cleaned file saved as {OUT_PATH} (rows: {len(df)})")

	•	Load raw CSV of recipes.
	•	Clean text fields: remove control characters, backslash escape sequences, and non-ASCII characters; collapse whitespace; and restrict titles to letters, spaces, `&`, and `-`.
	•	Drop unwanted data: remove empty rows and duplicate titles.
	•	Ensure consistent columns: add missing ones if necessary.
	•	Reassign IDs sequentially.
	•	Save cleaned dataset to a new CSV.
	•	Print summary of how many rows remain.

In [None]:
# Stage 2 paths: the cleaned CSV that is read, and the item-deduplicated CSV that is written.
IN_PATH  = "full_dataset_clean.csv"
OUT_PATH = "full_dataset_items_deduplicated.csv"

def dedup_items(list_str: str) -> str:
    """Remove repeated items from a stringified Python list, keeping first occurrences.

    Args:
        list_str: Text of a Python list literal (e.g. ``"['a', 'b', 'a']"``),
            or a missing value.

    Returns:
        ``str()`` of the de-duplicated list. The input is returned unchanged
        when it is missing, cannot be parsed by ``ast.literal_eval``, or does
        not evaluate to a list.
    """
    if pd.isna(list_str):
        return list_str
    try:
        items = ast.literal_eval(list_str)
    except Exception:
        return list_str
    if not isinstance(items, list):
        return list_str

    seen_hashable = set()
    seen_unhashable = []  # nested lists/dicts cannot go into a set
    out = []
    for item in items:
        try:
            if item in seen_hashable:
                continue
            seen_hashable.add(item)
        except TypeError:
            # Unhashable item: fall back to an equality scan instead of crashing.
            if item in seen_unhashable:
                continue
            seen_unhashable.append(item)
        out.append(item)

    return str(out)

def main():
    """Read IN_PATH, de-duplicate the list columns that exist, and write OUT_PATH."""
    frame = pd.read_csv(IN_PATH, dtype=str)

    present = [name for name in ("ingredients", "ner") if name in frame.columns]
    for name in present:
        frame[name] = frame[name].map(dedup_items)

    frame.to_csv(OUT_PATH, index=False)
    print(f"Item-deduplicated dataset saved as {OUT_PATH}")


if __name__ == "__main__":
    main()

	•	Load cleaned dataset from CSV.
	•	Deduplicate items in ingredients and ner lists while keeping original order.
	•	Update dataset so each list only has unique entries.
	•	Save results to a new CSV file.
	•	Print confirmation with the output file name.

In [None]:
# Stage 3 paths: the item-deduplicated CSV that is read, and the short-title CSV that is written.
IN_PATH  = "full_dataset_items_deduplicated.csv"
OUT_PATH = "full_dataset_titleshort.csv"

# dtype=str: read every column as text.
df = pd.read_csv(IN_PATH, dtype=str)

def clean_title(title: str) -> str:
    """Drop noise words from a title.

    A whitespace-separated word is removed when it is a single character
    other than ``&`` or ``-``, or when it contains "serving" in any letter
    case. Non-string input is returned untouched.

    Note: an earlier cell defines another function with this same name.
    """
    if not isinstance(title, str):
        return title

    def keep(word: str) -> bool:
        if len(word) == 1 and word not in {"&", "-"}:
            return False
        return "serving" not in word.lower()

    return " ".join(word for word in title.split() if keep(word)).strip()

df["title"] = df["title"].map(clean_title)

# Build both filters as standalone Series instead of adding a helper column to
# an already-filtered frame; assigning a column onto a boolean-indexed slice is
# what triggers pandas' SettingWithCopyWarning.
has_title = df["title"].str.strip() != ""
title_wordcount = df["title"].str.split().str.len()

# Keep rows whose title is non-blank and at most five words long. A missing
# title has a NaN word count, which fails the <= comparison and is dropped.
df = df[has_title & (title_wordcount <= 5)].copy()

df.to_csv(OUT_PATH, index=False)
print(f"Filtered dataset saved as {OUT_PATH} (rows: {len(df)})")

	•	Load dataset from CSV.
	•	Clean titles by removing single-character words (except `&` and `-`) and any word containing “serving” (case-insensitive).
	•	Filter titles: drop rows with empty titles or titles longer than five words.
	•	Save filtered dataset with shortened titles to a new CSV.
	•	Print summary showing how many rows remain.

In [None]:
# Stage 4 paths: the short-title CSV that is read, and the CSV with cleaned list fields that is written.
IN_PATH  = "full_dataset_titleshort.csv"
OUT_PATH = "full_dataset_titleshort_cleaned.csv"

# Lower-case terms that are discarded from the ingredient lists.
# drop_invalid() removes a list item only when the whole item, lower-cased and
# stripped, equals one of these entries.
INVALID = {
    # function words and connectors
    "and","or","with","to","of","for","the","a","an","on","in","into","plus","minus","about","around","between",

    # size and measuring qualifiers
    "large","small","medium","extra","extra large","jumbo","tiny","mini","heaping","scant","level","packed","lightly","loosely","firmly",
    
    # preparation and state descriptors
    "chopped","sliced","diced","minced","crushed","grated","shredded","peeled","seeded","halved","quartered","cubed","julienned","beaten","whipped",
    "cooked","uncooked","raw","fresh","frozen","ripe","optional","prepared","reserve","divided","needed","taste","as","each","any","all",
    
    # empty token and bare punctuation
    "","-","–","—","&","/","\\","(",")","[","]","{","}","#","%","+","*",".",",",":",";","'","\"","`","…",
    
    # portion and container words
    "portion","portions","serving","servings","slice","slices","piece","pieces","whole","half","third","quarters",
    "container","carton","bag","pack","packet","box","loaf","stick","sticks","head","heads"
}

def drop_invalid(items):
    """Return the items whose lower-cased, stripped text is not in INVALID (order kept)."""
    survivors = []
    for entry in items:
        if entry.lower().strip() in INVALID:
            continue
        survivors.append(entry)
    return survivors

def parse_list(s):
    """Parse a stringified list cell into a Python list.

    ``ast.literal_eval`` is tried first and ``json.loads`` is the fallback
    when it raises.

    Args:
        s: Cell text, or a missing value.

    Returns:
        The parsed list (a tuple is converted to a list). An empty list is
        returned for missing input, unparseable text, or text that parses to
        something other than a list/tuple, so callers can always iterate
        over items rather than over the characters of a string or a scalar.
    """
    if pd.isna(s):
        return []
    for parser in (ast.literal_eval, json.loads):
        try:
            value = parser(s)
        except Exception:
            continue
        # First parser that succeeds decides the outcome.
        return list(value) if isinstance(value, (list, tuple)) else []
    return []

def process_column(series):
    """Parse each cell of ``series`` into a list and strip INVALID items from it.

    Returns a plain Python list with one cleaned item-list per input cell.
    """
    return [drop_invalid(parse_list(cell)) for cell in series]

def main():
    """Clean the list columns of IN_PATH and write the surviving rows to OUT_PATH.

    For each of ``ingredients`` and ``ner`` that exists in the input, items
    are parsed and INVALID entries removed; rows whose list ends up empty are
    dropped; the lists are then serialised as JSON strings.
    """
    df = pd.read_csv(IN_PATH, dtype=str)

    # Decide once which list columns exist so the filter and serialise steps
    # apply the same guard as the cleaning step (previously only the first
    # loop checked, so a missing column raised KeyError further down).
    list_cols = [col for col in ["ingredients", "ner"] if col in df.columns]

    for col in list_cols:
        df[col] = process_column(df[col])

    for col in list_cols:
        df = df[df[col].map(len) > 0]

    # Detach from the filtered slice before assigning columns to it.
    df = df.copy()
    for col in list_cols:
        df[col] = df[col].map(lambda x: json.dumps(x, ensure_ascii=False))

    df.to_csv(OUT_PATH, index=False)
    print(f"Cleaned dataset saved as {OUT_PATH} (rows: {len(df)})")

if __name__ == "__main__":
    main()

	•	Cleans the ingredients and ner list fields.
	•	Removes list items that consist solely of an invalid term (stop words, size and preparation descriptors, punctuation, portion and container words, empty tokens).
	•	Drops rows where either list becomes empty.
	•	Re-encodes the cleaned lists as JSON and saves the dataset to a new CSV.
	•	Reports how many rows remain.

In [None]:
# Stage 5 paths: input CSV, whitelist text file (one term per line), output CSV.
CSV_IN = Path("full_dataset_titleshort_cleaned.csv")
WHITELIST_TXT = Path("ingredient_vocab_whitelist.txt")
CSV_OUT = Path("full_dataset_titleshort_filtered.csv")

# Load the whitelist: blank lines are skipped, every entry is stripped and lower-cased.
WHITELIST = []
with open(WHITELIST_TXT, "r", encoding="utf-8") as f:
    for line in f:
        entry = line.strip()
        if entry:
            WHITELIST.append(entry.lower())

def parse_ner(cell):
    """Turn a ``ner`` cell into a list of normalised, lower-cased tokens.

    The cell is parsed with ``ast.literal_eval``. When that raises, or the
    result is not a list/tuple/set (for example a bare quoted string or a
    number), the cell text is instead stripped of square brackets and split
    on commas.

    Each item is stripped of surrounding whitespace and quotes and
    lower-cased. Items that are blank after that normalisation are dropped;
    filtering before quote stripping let quote-only items through as empty
    tokens.

    Args:
        cell: Raw cell value, or a missing value.

    Returns:
        List of token strings; empty for a missing cell.
    """
    if pd.isna(cell):
        return []
    try:
        items = ast.literal_eval(cell)
    except Exception:
        items = None
    if not isinstance(items, (list, tuple, set)):
        # Not a usable collection: treat the raw text as comma-separated.
        items = [p.strip() for p in str(cell).strip("[]").split(",")]

    tokens = []
    for x in items:
        tok = str(x).strip().strip('"').strip("'").lower()
        if tok.strip():
            tokens.append(tok)
    return tokens

def token_in_whitelist(tok):
    """Return True when ``tok`` occurs as a substring of at least one WHITELIST entry.

    NOTE(review): this is containment, not equality - a short token passes if
    any whitelist entry contains it. Confirm that is the intended matching rule.
    """
    for entry in WHITELIST:
        if tok in entry:
            return True
    return False

def row_ok(tokens):
    """Return True for a non-empty token list in which every token passes the whitelist check."""
    if not tokens:
        return False
    for token in tokens:
        if not token_in_whitelist(token):
            return False
    return True

df = pd.read_csv(CSV_IN)
if "ner" not in df.columns:
    raise ValueError("Input CSV must have a 'ner' column.")

# Tokenise each ner cell, then flag the rows whose tokens all pass the whitelist.
ner_parsed = df["ner"].apply(parse_ner)
keep_mask = ner_parsed.apply(row_ok)
df_filtered = df.loc[keep_mask].copy()

df_filtered.to_csv(CSV_OUT, index=False)

# Row-count summary.
n_input = len(df)
n_kept = len(df_filtered)
print(f"Input rows:   {n_input}")
print(f"Kept rows:    {n_kept}")
print(f"Dropped rows: {n_input - n_kept}")
print(f"Saved to:     {CSV_OUT}")

	•	Filters the dataset to keep only rows where every ner token is contained in (is a substring of) at least one entry of a whitelist of allowed ingredient terms.
	•	Parses the ner column into tokens.
	•	Checks each token against the whitelist.
	•	Drops rows with disallowed tokens.
	•	Saves the filtered dataset to a new CSV.
	•	Reports how many rows were kept or removed.

In [None]:
CSV_IN = Path("full_dataset_titleshort_filtered.csv")  # whitelist-filtered CSV read by main()
TRAIN_OUT = Path("train.jsonl")  # training split, one JSON object per line
VAL_OUT = Path("val.jsonl")  # validation split, one JSON object per line
VAL_RATIO = 0.02  # fraction of examples held out for validation (main() holds out at least 1)
SEED = 42  # passed to random.seed() so the shuffle is reproducible

# Turn a CSV cell into a Python list (accepts JSON, a Python list literal, or comma-separated text)
def parse_list(cell) -> List[str]:
    """Parse ``cell`` into a list.

    Missing values give ``[]``. Text wrapped in square brackets is decoded
    as JSON first and as a Python literal second; whichever succeeds decides
    the result (``[]`` if the decoded value is not a list). Anything else,
    including bracketed text that neither decoder accepts, is split on commas
    with blank pieces discarded.
    """
    if pd.isna(cell):
        return []

    text = str(cell).strip()
    bracketed = text.startswith("[") and text.endswith("]")

    if bracketed:
        try:
            decoded = json.loads(text)
        except Exception:
            pass
        else:
            return decoded if isinstance(decoded, list) else []

        # literal_eval may emit SyntaxWarning while parsing; keep it quiet.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", SyntaxWarning)
            try:
                literal = ast.literal_eval(text)
            except Exception:
                pass
            else:
                return literal if isinstance(literal, list) else []

    pieces = (piece.strip() for piece in text.split(","))
    return [piece for piece in pieces if piece]

# Canonical form of one ingredient: trimmed, unquoted, lower-case, single-spaced
def norm_ing(item: str) -> str:
    """Normalise an ingredient string.

    Outer whitespace is trimmed, then surrounding double quotes and then
    single quotes are stripped, the text is lower-cased, and internal
    whitespace runs are collapsed to single spaces.
    """
    text = str(item).strip()
    for quote in ('"', "'"):
        text = text.strip(quote)
    words = text.lower().split()
    return " ".join(words)

# Break directions text into steps: by line when there are several lines, otherwise by sentence
def re_split_keep(text: str) -> List[str]:
    """Split ``text`` into trimmed, non-empty steps.

    If the text has more than one non-blank line, those lines are the steps.
    Otherwise the text is split at whitespace that follows ``.``, ``!`` or
    ``?``, so the punctuation stays attached to its sentence.
    """
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    if len(lines) > 1:
        return lines
    sentences = re.split(r"\s*(?<=\.|\!|\?)\s+", text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

# Build a formatted output block with title and numbered directions
def build_output(title: str, directions: List[str]) -> str:
    """Format a title and its directions as the assistant-side text.

    Args:
        title: Recipe title.
        directions: Direction steps; blank entries are ignored.

    Returns:
        ``"Title: <title>\\nDirections:\\n1. ...\\n2. ..."``.

    When there is exactly one step, escaped ``\\n`` sequences in it are first
    converted to real newlines; if the result contains a period or a newline
    it is split into several steps with ``re_split_keep``. Checking after the
    conversion means a blob that contains only escaped newlines is split too,
    instead of being emitted as one step with literal ``\\n`` text in it.
    """
    steps = [str(x).strip() for x in directions if str(x).strip()]
    if len(steps) == 1:
        blob = steps[0].replace("\\n", "\n")
        if "." in blob or "\n" in blob:
            # Keep the original single step if splitting yields nothing.
            steps = [p for p in re_split_keep(blob) if p] or steps
    numbered = "\n".join(f"{i+1}. {step}" for i, step in enumerate(steps))
    return f"Title: {title}\nDirections:\n{numbered}"

def to_llama_format_text(instruction: str, ings: List[str], output: str) -> str:
    """Assemble one training example as a ``User:`` / ``Assistant:`` text block.

    The user part is the instruction followed by a comma-separated
    ingredient line; the assistant part is ``output``.
    """
    ingredient_list = ", ".join(ings)
    user = f"{instruction}\nIngredients: {ingredient_list}"
    return f"User:\n{user}\n\nAssistant:\n{output}\n"

def _write_jsonl(path, texts):
    """Write each text to ``path`` as one ``{"text": ...}`` JSON object per line."""
    with path.open("w", encoding="utf-8") as handle:
        for t in texts:
            handle.write(json.dumps({"text": t}, ensure_ascii=False) + "\n")


def main():
    """Build train/validation JSONL files from the filtered recipe CSV.

    Rows are skipped when the title is missing or blank, when no directions
    can be parsed, or when fewer than two unique normalised ingredients
    remain. The surviving examples are shuffled with the fixed SEED and the
    first ``max(1, int(n * VAL_RATIO))`` become the validation set.

    Raises:
        ValueError: If the CSV lacks ``title``, ``directions`` or ``ner``.
    """
    random.seed(SEED)
    df = pd.read_csv(CSV_IN, dtype=str)

    # Ensure required columns exist
    needed = {"title", "directions", "ner"}
    if not needed.issubset(df.columns):
        raise ValueError("CSV missing required columns")

    instruction = "Generate a recipe (title + steps) using the given ingredients."
    examples_text = []

    # Build dataset examples row by row
    for _, row in df.iterrows():
        raw_title = row.get("title")
        # An empty CSV cell is read as float NaN, which is truthy and has no
        # .strip(); treat anything that is not a string as a missing title.
        title = raw_title.strip() if isinstance(raw_title, str) else ""
        if not title:
            continue

        directions = parse_list(row.get("directions"))
        if not directions:
            continue

        ings = [norm_ing(x) for x in parse_list(row.get("ner")) if str(x).strip()]
        ings_unique = list(dict.fromkeys(ings))  # remove duplicates, keep order
        if len(ings_unique) < 2:
            continue

        output = build_output(title, directions)
        examples_text.append(to_llama_format_text(instruction, ings_unique, output))

    # Split into train/val sets
    random.shuffle(examples_text)
    n_val = max(1, int(len(examples_text) * VAL_RATIO))
    val, train = examples_text[:n_val], examples_text[n_val:]

    # Save to JSONL
    _write_jsonl(TRAIN_OUT, train)
    _write_jsonl(VAL_OUT, val)

    print(f"Total examples: {len(examples_text)}")
    print(f"Train: {len(train)}  -> {TRAIN_OUT}")
    print(f"Val:   {len(val)}    -> {VAL_OUT}")

if __name__ == "__main__":
    main()

	•	Load dataset from the filtered CSV and check required columns (title, directions, ner).
	•	Parse list-like fields (directions, ner) safely using JSON or ast.literal_eval, falling back to comma-splitting for plain text.
	•	Normalize ingredients: lowercase, strip spaces/quotes, and remove duplicates while preserving order.
	•	Skip invalid rows: require a non-empty title, non-empty directions, and at least 2 unique ingredients.
	•	Format directions into a clean, numbered step list (splitting long text into sentences/lines if needed).
	•	Build llama-style text with User: (instruction + ingredients) and Assistant: (title + steps).
	•	Shuffle dataset with a fixed random seed for reproducibility.
	•	Split into train/validation sets using VAL_RATIO.
	•	Write outputs to train.jsonl and val.jsonl, one JSON object per line with a "text" field.
	•	Print summary of total, train, and validation example counts.