In [77]:
!pip install evaluate rouge-score bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [78]:
# === STEP 1: Mount & Config ===
# Colab 以外なら try/except でスキップされます
try:
    from google.colab import drive
    drive.mount("/content/drive")
except Exception:
    pass

# --- PizzaCommonSense 未使用データ（MyDrive直下の val/*.txt） ---
PC_DATA_DIR = "/content/drive/MyDrive/val"   # ← あなたのパスに合わせて

# --- 既存の評価CSV（“Computed/Completed CSV” など） ---
CSV_IN  = "/content/drive/MyDrive/Completed_CSV/gpt4.1_predictions_full.csv"  # 例
CSV_OUT = CSV_IN.replace(".csv", ".filtered.csv")

# --- 判定に使うカラム（厳しめなら ["input"] 推奨） ---
TARGET_COLS = ["input"]

# --- 食材候補の厳しさ（頻度しきい値）大きいほど厳しい ---
FOOD_MIN_COUNT = 12   # 例: 8→やや緩め, 12→中間, 18/25→厳しめ

# --- デバッグ表示件数 ---
PREVIEW_N = 30


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [79]:
# === STEP 2: Imports & Utils ===
import os, re, json, glob
from collections import Counter
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from IPython.display import display
# === PATCH: import bert_score via evaluate ===
import evaluate

# evaluate ライブラリから bert_score をロード
bertscore = evaluate.load("bertscore")


def _extract_items(s: str):
    """'(a; b_c; NA)' などから [a,b_c,na] を抽出（小文字・英字と_）。"""
    if not s:
        return []
    return re.findall(r"[a-z_]+", str(s).lower())

def _extract_parts(s: str):
    """トークンを '_' で分解したパーツ列を返す。"""
    toks = _extract_items(s)
    parts = []
    for t in toks:
        parts += t.split("_")
    return parts


Downloading builder script: 0.00B [00:00, ?B/s]

In [80]:
# === STEP 3: Non-food dictionaries & tool detector (boundary-safe) ===

# 道具の“単語一致”と、複合語の“完全一致”
TOOL_WORDS = {
    "bowl","pan","pot","sheet","tray","spatula","knife","spoon","fork",
    "skillet","microwave","grill","freezer","fridge","refrigerator","lid",
    "rack","glass","plate","board","blender","mixer","saucepan","whisk",
    "colander","peeler","foil","strainer","sieve","tongs","ladle","rolling_pin"
}
EXACT_TOOL_COMPOUNDS = {
    "baking_sheet","baking_pan","cutting_board","measuring_cup",
    "measuring_spoon","parchment_paper","plastic_wrap","aluminum_foil",
    "paper_towel","paper_towels"
}

# 一般語（アンカー含む）
SEED_GENERAL = {"food","foods","ingredient","ingredients","item","items","stuff","mixture","mixtures"}
GENERAL_ANCHORS = {"food","ingredient","ingredients","item","items"}

# 計量・英語ストップ・調理用語（非食材扱い）
MEASURE_WORDS = {
    "cup","cups","tbsp","tablespoon","tablespoons","tsp","teaspoon","teaspoons",
    "oz","ounce","ounces","lb","pound","pounds","kg","g","gram","grams","ml","l",
    "liter","liters","degree","degrees"
}
EN_STOPWORDS = {
    "a","an","the","and","or","of","to","with","in","on","for","by","from",
    "into","as","at","over","under","than","then","so","that","this","these","those"
}
COOKING_TERMS = {
    "mix","mixed","mixing","stir","stirred","stirring","saute","sauteed","boil","boiled",
    "bake","baked","fry","fried","frying","grill","grilled","roast","roasted","toast","toasted",
    "minced","chopped","sliced","diced","ground","beaten","whisked","seasoned","coated"
}
# 状態/形容（非食材扱い）
STATE_WORDS = {
    "partially","lightly","heavily","fully","slightly","evenly","gently","well",
    "finely","coarsely","thinly","thickly","roughly",
    "fresh","frozen","raw","cooked","uncooked",
    "browned","brown","golden","tender","soft","softened","crispy","crisp","creamy","smooth",
    "hot","warm","cold","cool","room","temperature","room_temperature"
}

# 常に KEEP したい代表食材（救済）
SAFE_INGREDIENTS = {
    "salt","pepper","sugar","water","oil","butter","flour","egg","eggs",
    "milk","cream","cheese","onion","garlic","tomato","tomatoes",
    "potato","potatoes","chicken","beef","pork","fish","shrimp","rice","pasta","bread","yeast"
}

def _looks_tool_like(tok: str) -> bool:
    """道具判定：_で分割した語の“完全一致”のみ。部分一致はしない。"""
    if tok in EXACT_TOOL_COMPOUNDS:
        return True
    parts = tok.split("_")
    return any(p in TOOL_WORDS for p in parts)



In [81]:
# === STEP 4: Scan PizzaCommonSense val/*.txt and count 'input' tokens ===
def iter_pc_steps_from_val(dir_path, limit_files=None, limit_steps=None):
    """
    val フォルダの .txt(JSON) を走査し、table 配下のステップ dict を yield。
    各ステップは少なくとも 'input' を持つ想定。
    """
    files = sorted(glob.glob(os.path.join(dir_path, "*.txt")))
    if limit_files:
        files = files[:limit_files]

    n_steps = 0
    for fp in tqdm(files, desc="Scanning val/*.txt"):
        obj = None
        # JSON として読み込み（BOM/1行JSONにもフォールバック）
        for enc in ("utf-8","utf-8-sig"):
            try:
                with open(fp, "r", encoding=enc) as f:
                    txt = f.read().strip()
                obj = json.loads(txt)
                break
            except Exception:
                obj = None
        if obj is None:
            continue

        table = obj.get("table") or obj.get("instructions") or []
        if not isinstance(table, list):
            continue

        for step in table:
            if isinstance(step, dict) and "input" in step:
                yield step
                n_steps += 1
                if limit_steps and n_steps >= limit_steps:
                    return

def collect_token_freq_from_val(dir_path, limit_files=None, limit_steps=None):
    tok_count = Counter()
    total = 0
    for step in iter_pc_steps_from_val(dir_path, limit_files=limit_files, limit_steps=limit_steps):
        items = set(_extract_items(step.get("input","")))
        for t in items:
            tok_count[t] += 1
        total += 1
    df_like = tok_count.most_common()
    print(f"[val] steps scanned={total:,}, uniq_tokens={len(tok_count):,}")
    return df_like

df_like_pc = collect_token_freq_from_val(PC_DATA_DIR)
len(df_like_pc), df_like_pc[:10]


Scanning val/*.txt:   0%|          | 0/84 [00:00<?, ?it/s]

[val] steps scanned=1,225, uniq_tokens=709


(709,
 [('dough', 237),
  ('pizza', 158),
  ('and', 136),
  ('mixture', 121),
  ('na', 108),
  ('crust', 95),
  ('with', 91),
  ('cheese', 79),
  ('oil', 77),
  ('salt', 76)])

In [82]:
# === STEP 5: Build FOOD_LIKELY (freq-based) ===
def build_food_likely(df_like, min_count=12, top_k=10000):
    """
    - 出現頻度が一定以上の語から、非食材群に該当しない“食材候補”を作る。
    - 複合語 a_b は a, b それぞれを評価（食材っぽい部分語を拾う）。
    """
    NON_FOOD_BIG = (SEED_GENERAL | MEASURE_WORDS | EN_STOPWORDS |
                    COOKING_TERMS | STATE_WORDS | TOOL_WORDS | EXACT_TOOL_COMPOUNDS)
    cand_counts = {}

    for tok, c in df_like[:top_k]:
        if c < min_count or tok == "na":
            continue
        parts = tok.split("_")
        for p in parts:
            if (p in NON_FOOD_BIG) or (len(p) <= 1):
                continue
            if not re.fullmatch(r"[a-z]+", p):
                continue
            cand_counts[p] = cand_counts.get(p, 0) + c

    FOOD_LIKELY = {w for w, cnt in cand_counts.items() if (cnt >= min_count and len(w) >= 3)}
    return FOOD_LIKELY

FOOD_LIKELY = build_food_likely(df_like_pc, min_count=FOOD_MIN_COUNT, top_k=10000)
print("[food_likely] size:", len(FOOD_LIKELY))
print("sample:", sorted(list(FOOD_LIKELY))[:30])


[food_likely] size: 47
sample: ['added', 'ball', 'basil', 'beef', 'cheese', 'chicken', 'covered', 'crescent', 'crust', 'dough', 'dry', 'egg', 'flour', 'garlic', 'honey', 'kneaded', 'lamb', 'meat', 'milk', 'mozzarella', 'oil', 'olive', 'onion', 'optional', 'oregano', 'pasta', 'pepper', 'pepperoni', 'pizza', 'potato']


In [83]:

# === STEP 5 (optional): manual tweaks ===
# 例: 誤って non-food に入ってしまった食材を戻す
# non_food_terms_pc.discard("chocolate")

# 例: 一般語に追加したい語
# general_terms_pc.add("mixture")

In [84]:
# === STEP 6: Strict v2 filter with general-anchor rule ===
def is_text_excludable_strict_v2(
    text: str,
    FOOD_LIKELY: set,
    general_terms: set = SEED_GENERAL,
    non_food_terms: set = (TOOL_WORDS | EXACT_TOOL_COMPOUNDS),
    extra_nonfood: set = (MEASURE_WORDS | EN_STOPWORDS | COOKING_TERMS | STATE_WORDS),
    safe_ingredients: set = SAFE_INGREDIENTS,
) -> bool:
    """
    厳しめ版:
      - 'NA' 単独 → KEEP
      - SAFE_INGREDIENTS or FOOD_LIKELY が1語でも含まれたら → KEEP
      - ★ generalアンカー（food, ingredient, item...）があり、
        その他が非食材語のみ → DROP（"partially cooked food" など）
      - 上記以外で“食材候補ゼロ”なら → DROP
    """
    if text is None:
        return True
    t = str(text).strip().lower()
    if t.strip("() ") == "na":
        return False

    parts = _extract_parts(t)
    if not parts:
        return True

    # KEEP 条件（強め）
    if any(p in safe_ingredients for p in parts):
        return False
    if any(p in FOOD_LIKELY for p in parts):
        return False

    NON_FOOD_BIG = (general_terms | non_food_terms | extra_nonfood)

    # general アンカーが含まれ、他がすべて非食材なら DROP
    if any(p in GENERAL_ANCHORS for p in parts):
        others = [p for p in parts if p not in GENERAL_ANCHORS]
        if not others:  # "food" しかない等
            return True
        if all(p in NON_FOOD_BIG for p in others):
            return True

    # 食材候補が無ければ DROP（未知語のみ等も落とす）
    return True

def filter_csv_rows_strict_v2(df: pd.DataFrame, target_cols, FOOD_LIKELY):
    def row_excludable(row):
        flags = []
        for col in target_cols:
            val = row.get(col, "")
            flags.append(is_text_excludable_strict_v2(val, FOOD_LIKELY))
        # どれか1つでも DROP なら行を落とす（厳しめ）
        return any(flags) if flags else False

    mask = df.apply(row_excludable, axis=1)
    kept = df.loc[~mask].copy()
    print(f"[filter-v2] total={len(df):,}  kept={len(kept):,}  dropped={int(mask.sum()):,}")
    return kept


In [85]:
# === STEP 7: Apply to CSV & save ===
df = pd.read_csv(CSV_IN)
print("[csv] loaded:", CSV_IN, "shape=", df.shape)

df_filt = filter_csv_rows_strict_v2(df, TARGET_COLS, FOOD_LIKELY)
df_filt.to_csv(CSV_OUT, index=False)
print("[csv] saved:", CSV_OUT, "shape=", df_filt.shape)

display(df_filt.head(10))


[csv] loaded: /content/drive/MyDrive/Completed_CSV/gpt4.1_predictions_full.csv shape= (3069, 7)
[filter-v2] total=3,069  kept=2,316  dropped=753
[csv] saved: /content/drive/MyDrive/Completed_CSV/gpt4.1_predictions_full.filtered.csv shape= (2316, 7)


Unnamed: 0,instructions,actions,input,output,pred_input,pred_output,response
1,heat the oil in a large non stick frying pan,heat,oil,heated_oil,oil,heated oil,Input: oil\nOutput: heated oil
2,"add the onion , pepper and zucchini",add,(onion; pepper; zucchini; heated_oil),"onion, pepper and zucchini added to heated oil",onion;pepper;zucchini,onion;pepper;zucchini (added),Input: onion;pepper;zucchini\nOutput: onion;pe...
3,saute over a medium heat for 4 5mins .,saute,vegetables in heated_oil,sauteed vegetable mixture,,,Input: NA\nOutput: NA
4,add the herbs,add,(herbs; sauteed vegetable mixture),herbs added to sauteed vegetable mixture,herbs; dish being prepared,dish with herbs added,Input: herbs; dish being prepared\nOutput: dis...
5,season with black_pepper .,season,(seasoned sauteed vegetables; black pepper),seasoned sauteed vegetables,black_pepper,seasoned_black_pepper,Input: black_pepper\nOutput: seasoned_black_pe...
8,place the pizza bases on a baking tray,place,pizza base,pizza base,pizza bases,pizza bases on baking tray,Input: pizza bases\nOutput: pizza bases on bak...
9,spread half the fresh salsa over each base .,spread,(pizza base; salsa),salsa spread over pizza base,fresh salsa;base,base topped with fresh salsa,Input: fresh salsa;base\nOutput: base topped w...
10,scatter the vegetables over the pizza,scatter,(cooked vegetables; pizza base with salsa),cooked vegetables scattered on pizza base with...,vegetables; pizza,pizza topped with vegetables,Input: vegetables; pizza\nOutput: pizza topped...
11,crumble over the goat_cheese .,crumble,(goat_cheese; pizza base with salsa and cooked...,goat_cheese crumbled on pizza base with salsa ...,goat_cheese,crumbled goat_cheese,Input: goat_cheese\nOutput: crumbled goat_cheese
12,season with black_pepper,season,(black pepper; uncooked vegetable_and_goat_che...,uncooked vegetable_and_goat_cheese_pizza seaso...,black_pepper;food_to_be_seasoned,seasoned_food,Input: black_pepper;food_to_be_seasoned\nOutpu...


In [86]:
# ⛏️ PATCH 3: stricter exclusion rule
def _extract_items(s: str):
    return re.findall(r"[a-z_]+", (s or "").lower())

def is_text_excludable_strict(text: str,
                              general_terms=SEED_GENERAL,
                              non_food_terms=(TOOL_WORDS | EXACT_TOOL_COMPOUNDS),
                              food_likely=FOOD_LIKELY) -> bool:
    """
    厳しめルール:
      - 'NA' 単独は KEEP
      - トークンなし → DROP
      - _ で分割した部分語のどれかが FOOD_LIKELY にあれば KEEP
      - それ以外で、全トークンが (general ∪ non-food ∪ measure ∪ stopword ∪ cooking) に含まれる → DROP
      - 上記どれでもない未知語のみ → DROP（※ 以前より厳しい）
    """
    if text is None:
        return True
    t = str(text).strip().lower()
    if t.strip("() ") == "na":
        return False

    toks = _extract_items(t)
    if not toks:
        return True

    # ★ 1) 食材候補（頻度ベース）に該当があれば KEEP
    parts = set()
    for tok in toks:
        parts |= set(tok.split("_"))
    if any(p in food_likely for p in parts):
        return False

    NON_FOOD_BIG = (general_terms | non_food_terms | MEASURE_WORDS | EN_STOPWORDS | COOKING_TERMS)

    # ★ 2) 全トークンが非食材セットに含まれるなら DROP
    if all((tok in NON_FOOD_BIG) or all(p in NON_FOOD_BIG for p in tok.split("_")) for tok in toks):
        return True

    # ★ 3) それ以外の未知語のみ → DROP（= 厳格化）
    return True

def filter_csv_rows_strict(df, target_cols):
    def row_excludable(row):
        flags = []
        for col in target_cols:
            val = row.get(col, "")
            flags.append(is_text_excludable_strict(val))
        # すべての対象カラムで DROP 判定なら落とす
        return all(flags) if flags else False

    mask = df.apply(row_excludable, axis=1)
    kept = df.loc[~mask].copy()
    print(f"[filter-strict] total={len(df):,}  kept={len(kept):,}  dropped={int(mask.sum()):,}")
    return kept


In [87]:
# 必要ライブラリのインストール（足りない時だけ）
import sys, importlib

def _pip_install(pkgs):
    cmd = [sys.executable, "-m", "pip", "install", "-U"] + pkgs
    print("Installing:", " ".join(pkgs))
    import subprocess, sys as _sys
    subprocess.check_call(cmd)

# evaluate が無ければ入れる（rouge-score も必要）
if importlib.util.find_spec("evaluate") is None:
    _pip_install(["evaluate", "rouge-score"])

# インポート & rougeハンドラ作成
import evaluate
rouge = evaluate.load("rouge")

# （任意）動作チェック
# _ = rouge.compute(predictions=["hello world"], references=["hello world"])
# print("ROUGE ready!")

In [88]:
def bert_score_f1(preds, refs):
    """BERTScoreを計算（空欄は"empty"に置き換える）"""

    # 空の予測/正解を"empty"という文字列に置き換える
    clean_preds = [p if p else "empty" for p in preds]
    clean_refs = [r if r else "empty" for r in refs]

    # フィルタリングされたデータでBERTScoreを計算
    results = bertscore.compute(predictions=clean_preds, references=clean_refs, lang="en")

    return np.mean(results['f1'])


def ema(preds, refs):
    """Exact Match Accuracy (完全一致率) を計算する関数"""
    correct_count = 0
    for p, r in zip(preds, refs):
        if p == r:
            correct_count += 1
    return correct_count / len(preds) if len(preds) > 0 else 0

def rouge_l(preds, refs):
    """前のセルでロードしたrougeオブジェクトを使ってRouge-Lスコアを計算する関数"""
    # evaluateライブラリはスコアを辞書形式で返す
    results = rouge.compute(predictions=preds, references=refs)
    return results['rougeL']

# --- 正規化（ingredient_cleaner 使用） -----------------
def canon(txt:str)->str:
    return normalise(txt)

def _to_set(txt):
    return {t.strip() for t in re.split(r"[;,]", txt) if t.strip()}

def set_f1(preds, refs):
    f1=[]
    for p,r in zip(preds, refs):
        P,R = _to_set(canon(p)), _to_set(canon(r))
        if not P and not R: f1.append(1);  continue
        if not P or not R:  f1.append(0);  continue
        inter = len(P & R)
        prec  = inter/len(P);  rec = inter/len(R)
        f1.append(0 if prec+rec==0 else 2*prec*rec/(prec+rec))
    return np.mean(f1)

# --- 非食材ペナルティ -----------------------------
BLACK = {"oven","bowl","pan","tray","sheet","mixer",
         "fork","knife","spoon","plate","°f","°c","minutes","timer"}
def non_food(preds):
    return np.mean([
        1 if any(tok in BLACK for tok in re.findall(r"\w+", p.lower())) else 0
        for p in preds
    ])  # 1=悪, 0=良

# --- ネットワーク距離 (任意) ----------------------
def graph_distance_score(preds, refs):
    if G is None: return np.nan
    dists=[]
    for p,r in zip(preds, refs):
        P,R = _to_set(canon(p)), _to_set(canon(r))
        if not P or not R: continue
        # 平均最短経路長 (5=遠い / 未接続)
        sub=[]
        for a in P:
            for b in R:
                try: sub.append(nx.shortest_path_length(G,a,b))
                except: sub.append(5)
        dists.append(np.mean(sub))
    return np.mean(dists) if dists else np.nan  # 低い程近い

# --- Embedding 類似 (任意) ------------------------
def w2v_sim(preds, refs):
    if w2v is None: return np.nan
    sims=[]
    for p,r in zip(preds, refs):
        P,R = [t for t in _to_set(canon(p)) if t in w2v], \
              [t for t in _to_set(canon(r)) if t in w2v]
        if not P or not R: continue
        sims.append(np.mean([w2v.similarity(a,b) for a in P for b in R]))
    return np.mean(sims) if sims else np.nan



In [89]:
# --- Cell 6: `evaluate`関数と`StrictScore`の計算式を更新 ---

def evaluate(df):
    p_in, r_in  = df.pred_input.fillna(""),  df.input.fillna("")
    p_out,r_out = df.pred_output.fillna(""), df.output.fillna("")

    metrics = {
        "EMA_in"      : ema(p_in, r_in),
        "EMA_out"     : ema(p_out, r_out),
        "RougeL_in"   : rouge_l(p_in, r_in),
        "RougeL_out"  : rouge_l(p_out, r_out),
        "SetF1_in"    : set_f1(p_in, r_in),
        "NonFood_in"  : non_food(p_in),
        # BERTScoreを追加
        "BERTScore_F1": bert_score_f1(p_out, r_out),
    }

    # === 新しいStrictScoreの計算式 (BERTScore中心) ===
    # セマンティックな類似度を測るBERTScoreの重みを最大にする
    metrics["StrictScore"] = (
        0.70 * metrics["BERTScore_F1"]  # BERTScoreの重みを70%に
      + 0.15 * metrics["SetF1_in"]       # SetF1の重みを15%に
      + 0.10 * metrics["RougeL_in"]      # RougeLの重みを10%に
      - 0.05 * metrics["NonFood_in"]     # NonFoodペナルティは少し残す
    )
    # EMAは厳しすぎるので計算式からは除外

    return metrics


In [90]:
# === PATCH: define `normalise` used by `canon` ===
import re

def normalise(txt):
    """
    軽量クリーナー：
    - 小文字化
    - 記号の簡易正規化（/,- をスペース扱い）
    - 英数字とアンダースコア以外はスペースに
    - 連続空白の縮約
    ※ アンダースコアは保持（red_onion 等）
    """
    if txt is None:
        return ""
    s = str(txt).lower()
    s = s.replace("&", " and ")
    s = s.replace("/", " ").replace("-", " ")
    s = re.sub(r"[^\w\s]", " ", s)     # 文字・数字・_ 以外をスペースに
    s = re.sub(r"\s+", " ", s).strip()
    return s

# もし `canon` が別セルにあるなら、そこはこのままでOK：
# def canon(txt: str) -> str:
#     return normalise(txt)


In [91]:
res = evaluate(df)

print("====== Evaluation Summary ======")
for k,v in res.items():
    print(f"{k:15}: {v:.3f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EMA_in         : 0.176
EMA_out        : 0.123
RougeL_in      : 0.339
RougeL_out     : 0.297
SetF1_in       : 0.218
NonFood_in     : 0.006
BERTScore_F1   : 0.869
StrictScore    : 0.675


In [92]:
# === Save evaluation results to JSON ===
import json, os

# where to save
OUT_JSON = os.path.splitext(CSV_OUT)[0] + "_eval.json"

with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(res, f, indent=2, ensure_ascii=False)

print(f"[saved] evaluation results -> {OUT_JSON}")



[saved] evaluation results -> /content/drive/MyDrive/Completed_CSV/gpt4.1_predictions_full.filtered_eval.json
