In [9]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
!pip install -q kaggle tqdm pandas numpy nltk unidecode rapidfuzz

In [11]:
# =============================================================
# 0. Environment Preparation
# =============================================================

# If running in Google Colab, uncomment to mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# !pip install -q kaggle tqdm pandas numpy nltk unidecode rapidfuzz

import os, json, re, unicodedata, warnings
from pathlib import Path
from typing import List
from collections import Counter

import pandas as pd
from tqdm import tqdm
from rapidfuzz import fuzz, process
from unidecode import unidecode

In [12]:
# =============================================================
# 1. Dataset Acquisition & File Detection
# =============================================================

DATA_DIR = Path('/content/drive/MyDrive/recipe1m')
assert DATA_DIR.exists(), f"DATA_DIR does not exist: {DATA_DIR}. Did you unzip the dataset?"

# Candidate filenames used by different Recipe1M mirrors
RECIPE_CANDIDATES = ['recipes.json', 'layer1.json', 'layer1+.json']
INGR_CANDIDATES   = ['ingredients.json', 'det_ingrs.json']

RECIPES_FILE = next((DATA_DIR / f for f in RECIPE_CANDIDATES if (DATA_DIR / f).exists()), None)
INGRS_FILE   = next((DATA_DIR / f for f in INGR_CANDIDATES   if (DATA_DIR / f).exists()), None)

if RECIPES_FILE is None:
    raise FileNotFoundError(
        f"None of {RECIPE_CANDIDATES} found under {DATA_DIR}. "
        "Please verify the extraction path or update DATA_DIR.")
else:
    print(f"✓ Using recipe file: {RECIPES_FILE.name}")

if INGRS_FILE is None:
    warnings.warn(
        f"Ingredients JSON not found (looked for {INGR_CANDIDATES}). "
        "Proceeding without it — only recipe metadata will be parsed.")
else:
    print(f"✓ Using ingredients file: {INGRS_FILE.name}")


✓ Using recipe file: layer1.json
✓ Using ingredients file: det_ingrs.json


In [13]:
# =============================================================
# 2. Load JSON into DataFrame
# =============================================================

def load_recipe1m(json_path: Path) -> pd.DataFrame:
    """Load Recipe1M layer1/recipes JSON into pandas DataFrame, handling both schemas."""
    with open(json_path, 'r', encoding='utf‑8') as f:
        data = json.load(f)

    # Schema A (layer1.json, list of dicts) — official Recipe1M format
    if isinstance(data, list) and 'id' in data[0]:
        records = []
        for r in data:
            records.append({
                'id':            r['id'],
                'title':         r['title'],
                'ingredients':   [i['text'] if isinstance(i, dict) else i for i in r['ingredients']],
                'instructions':  [s['text'] if isinstance(s, dict) else s for s in r['instructions']],
                'partition':     r.get('partition', 'train')
            })
        return pd.DataFrame(records)

    # Schema B (recipes.json, dict keyed by ids)
    if isinstance(data, dict):
        records = []
        for _id, r in data.items():
            records.append({
                'id':            _id,
                'title':         r['title'],
                'ingredients':   [i['text'] if isinstance(i, dict) else i for i in r['ingredients']],
                'instructions':  [s['text'] if isinstance(s, dict) else s for s in r['instructions']],
                'partition':     r.get('partition', 'train')
            })
        return pd.DataFrame(records)

    raise ValueError("Unrecognised JSON schema in recipe file — inspect format manually.")

print("Loading recipes… this may take a minute ⏳")
df = load_recipe1m(RECIPES_FILE)
print(f"Loaded {len(df):,} recipes from {RECIPES_FILE.name}")

Loading recipes… this may take a minute ⏳
Loaded 1,029,720 recipes from layer1.json


In [14]:
# =============================================================
# 3. Knowledge Base
# =============================================================

EXPLICIT_ANIMAL = [
    'chicken','beef','pork','ham','bacon','turkey','duck','lamb','veal',
    'salmon','tuna','shrimp','prawn','crab','lobster','anchovy','sardine',
    'fish','fish sauce','clam','oyster']

DAIRY_EGGS = ['milk','butter','cream','cheese','yogurt','egg','eggs','albumen','whey','casein','ghee']

HIDDEN_ANIMAL = [
    'gelatin','isinglass','carmine','cochineal','lard','tallow','rennet',
    'l‑cysteine','shellac','bone char','anchovy paste']

AMBIGUOUS = ['lecithin','natural flavor','stearic acid','glycerides','mono‑ and diglycerides','emulsifier']

KB = {
    'explicit_animal': [unidecode(t) for t in EXPLICIT_ANIMAL],
    'hidden_animal':   [unidecode(t) for t in HIDDEN_ANIMAL],
    'ambiguous':       [unidecode(t) for t in AMBIGUOUS],
}


In [15]:
# =============================================================
# 4. Normalisation helpers
# =============================================================

def normalise(text: str) -> str:
    text = unidecode(text.lower())
    text = re.sub(r'[^a-z0-9\s\-/]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def ingredient_tokens(ings: List[str]) -> List[str]:
    toks = []
    for ing in ings:
        toks.extend(normalise(ing).split())
    return toks

print("Tokenising ingredients…")
tqdm.pandas()
df['tokens'] = df['ingredients'].progress_apply(ingredient_tokens)

Tokenising ingredients…


100%|██████████| 1029720/1029720 [00:51<00:00, 20172.49it/s]


In [16]:
# =============================================================
# 5. Filtering pipeline (Stages 1–4)
# =============================================================

def stage1_keyword(row) -> bool:
    markers = ['vegetarian','vegan','plant based','meatless']
    title = normalise(row['title'])
    text  = ' '.join(row['tokens'])
    return any(m in title or m in text for m in markers)


def contains_any(tokens: List[str], vocab: List[str]) -> bool:
    return any(tok in vocab for tok in tokens)


def stage2_explicit(row) -> bool:
    return contains_any(row['tokens'], KB['explicit_animal'])


def stage3_hidden(row) -> bool:
    return contains_any(row['tokens'], KB['hidden_animal'])


def stage4_ambiguous(row) -> bool:
    return contains_any(row['tokens'], KB['ambiguous'])

print("Running multi‑stage filter…")
df['stage1']          = df.progress_apply(stage1_keyword, axis=1)
df['exclude_explicit'] = df.progress_apply(stage2_explicit, axis=1)
df['exclude_hidden']   = df.progress_apply(stage3_hidden, axis=1)
df['flag_ambiguous']   = df.progress_apply(stage4_ambiguous, axis=1)

conditions_high   = (~df['exclude_explicit']) & (~df['exclude_hidden']) & (~df['flag_ambiguous'])
conditions_medium = (~df['exclude_explicit']) & (~df['exclude_hidden']) & ( df['flag_ambiguous'])

df_high   = df[conditions_high]
df_medium = df[conditions_medium]

print(f"High‑confidence vegetarian recipes:   {len(df_high):,}")
print(f"Medium‑confidence (ambiguous) recipes: {len(df_medium):,}")

Running multi‑stage filter…


100%|██████████| 1029720/1029720 [00:17<00:00, 59872.27it/s]
100%|██████████| 1029720/1029720 [00:19<00:00, 54156.35it/s]
100%|██████████| 1029720/1029720 [00:16<00:00, 63441.85it/s]
100%|██████████| 1029720/1029720 [00:13<00:00, 79126.51it/s]


High‑confidence vegetarian recipes:   654,664
Medium‑confidence (ambiguous) recipes: 144


In [None]:

SAMPLE_SIZE = 1000  # sample size

# ── ランダムサンプリング ──
df_sample = df_high.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)

# ── 確認表示 ──
print(f"Sampled {len(df_sample)} recipes (SAMPLE_SIZE={SAMPLE_SIZE})")
df_sample.head()

# ── Drive へ保存 ──
import os
out_dir = '/content/drive/MyDrive/veg_recipe1m_subset'
os.makedirs(out_dir, exist_ok=True)
df_sample.to_json(
    f"{out_dir}/veg_recipe1m_sample_{SAMPLE_SIZE}.json",
    orient='records',
    force_ascii=False
)
print(f"Saved sample JSON to {out_dir}/veg_recipe1m_sample_{SAMPLE_SIZE}.json")


In [17]:
# =============================================================
# 6. Save subsets
# =============================================================

OUT_DIR = Path('/content/veg_recipe1m')
OUT_DIR.mkdir(parents=True, exist_ok=True)

df_high.to_json(OUT_DIR / 'veg_recipe1m_high.json',  orient='records', force_ascii=False)
df_medium.to_json(OUT_DIR / 'veg_recipe1m_medium.json', orient='records', force_ascii=False)
print(f"✓ Saved JSON subsets to {OUT_DIR}")


# =============================================================
# 7. Quick EDA helper
# =============================================================

def top_ingredients(df_subset: pd.DataFrame, n: int = 30):
    cnt = Counter()
    for toks in df_subset['tokens']:
        cnt.update(toks)
    return pd.Series(cnt).sort_values(ascending=False).head(n)

print("Top ingredients (high‑confidence set):")
print(top_ingredients(df_high))


✓ Saved JSON subsets to /content/veg_recipe1m
Top ingredients (high‑confidence set):
1              2026776
cup            1225333
2               972128
teaspoon        671851
12              544472
cups            538426
1/2             511233
tablespoons     441493
sugar           403321
salt            354982
3               333507
chopped         327196
or              292041
14              267614
butter          265003
1/4             257257
4               245411
tablespoon      242720
teaspoons       235091
and             234649
pepper          227194
oil             224718
flour           222616
fresh           218938
cheese          182271
ounce           179212
ounces          175728
ground          167009
cream           163190
vanilla         156118
dtype: int64
