# 🍕 PizzaCommonSense × ReAct Mini Eval
Evaluate the **ReAct (Reason + Act)** framework on the first 50 samples of the PizzaCommonSense dataset.

*Run the cells top-to-bottom and enter your OpenAI API key when prompted.*


In [25]:
# 🔧 Install latest LangChain split packages + FAISS
!pip -q install langchain langchain-openai langchain-community faiss-cpu tiktoken openai




In [26]:
# Mount Google Drive at /content/drive so the dataset directory configured
# later in the notebook (under /content/drive/MyDrive) can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# 🔑 Environment variables
import os, getpass, json

# Prompt for the key interactively and expose it through the environment.
os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key:')  # ← enter your own key

# ▼ Set this to the directory where the PizzaCommonSense data is stored
DATA_PATH = "/content/drive/MyDrive/train"  # e.g. drive/MyDrive/…
N_SAMPLES = 50  # Only 50 files for the demo; set to None to load the whole dataset


OpenAI API key:··········


In [28]:
!ls -l /content/drive/MyDrive/train/

total 3581
-rw------- 1 root root  4194 Jul  1 13:33  10_minute_white_pizza_recipe.txt
-rw------- 1 root root  5283 Jul  1 13:33 "3_c_'s_and_a_p_pizza.txt"
-rw------- 1 root root  2510 Jul  1 13:33  4_ingredient_pizza_hot_dish.txt
-rw------- 1 root root  4873 Jul  1 13:33  4_layer_pizza_dip.txt
-rw------- 1 root root  3669 Jul  1 13:33 '``_a_bit_different_'\'''\''_breakfast_pizza.txt'
-rw------- 1 root root  4764 Jul  1 13:33  accidental_veggie_tortilla_pizza.txt
-rw------- 1 root root  2585 Jul  1 13:33  after-school_parmesan_pizza_dip.txt
-rw------- 1 root root  3634 Jul  1 13:33 "alea_'s_eggplant_pizza_bites.txt"
-rw------- 1 root root  5277 Jul  1 13:33 "alea_'s_pizza_grilled_cheese_sandwiches.txt"
-rw------- 1 root root  5589 Jul  1 13:33  alfredo_deep_dish_pizza.txt
-rw------- 1 root root  3660 Jul  1 13:33  amazing_bbq_chicken_pizza.txt
-rw------- 1 root root  5896 Jul  1 13:33  apple_dessert_pizza_recipe.txt
-rw------- 1 root root  3616 Jul  1 13:33  apple_feta_pizza.txt
-rw---

In [29]:
# 📂 Load first N_SAMPLES files
import glob, json, pandas as pd, textwrap

def load_pizzacommonsense(path, limit=None):
    """Load PizzaCommonSense recipe files as parsed JSON objects.

    Recursively collects every ``*.txt`` file under ``path``, sorts the paths
    so the selection is deterministic, and parses each file as JSON.

    Args:
        path: Root directory to search.
        limit: Maximum number of files to load. ``None`` loads everything;
            ``0`` loads nothing.

    Returns:
        A list with one parsed JSON object per file, in sorted path order.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    files = sorted(glob.glob(f"{path}/**/*.txt", recursive=True))
    # Compare against None explicitly: a plain truthiness test would treat
    # limit=0 as "no limit" and load the entire dataset.
    if limit is not None:
        files = files[:limit]
    samples = []
    for fp in files:
        # Pin the encoding so parsing does not depend on the platform locale.
        with open(fp, encoding="utf-8") as f:
            samples.append(json.load(f))
    return samples

# Load up to N_SAMPLES files from DATA_PATH and preview the first three records as a table.
dataset = load_pizzacommonsense(DATA_PATH, N_SAMPLES)
print(f"Loaded {len(dataset)} samples")
pd.DataFrame(dataset[:3])


Loaded 50 samples


Unnamed: 0,title,id,table,ingredients,recipe
0,10_minute_white_pizza_recipe,23078eb133,[{'instructions': 'preheat oven to 500 degrees...,[{'text': '1 pkt thin crust prepared pizza she...,"[{'text': 'Preheat oven to 500 degrees.'}, {'t..."
1,3_c_'s_and_a_p_pizza,a38a305a1b,[{'instructions': 'preheat oven to 400 degrees...,[{'text': '1 (11 ounce) packagepillsbury thin-...,"[{'text': 'Preheat oven to 400 degrees.'}, {'t..."
2,4_ingredient_pizza_hot_dish,661a92397f,"[{'instructions': 'brown ground beef .', 'inpu...","[{'text': '8 ounces crescent dinner rolls'}, {...","[{'text': 'Brown gound beef.'}, {'text': 'Spre..."


In [30]:
# 🔎 Build a toy knowledge index (robust to top-level "table")

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.document import Document

def extract_rows(js_obj):
    """Return the step dicts held by one loaded JSON object.

    An object with a top-level "table" key yields that value unchanged;
    anything else is treated as a single step and wrapped in a one-item list.
    """
    if "table" not in js_obj:
        # Already a single step rather than a whole recipe.
        return [js_obj]
    return js_obj["table"]

def step_to_text(step):
    """Render one step dict as an 'ingredient | action → effect' line.

    The ingredient comes from "input" (falling back to "ingredient"), with
    surrounding parentheses removed and only the part before the first ";"
    kept. The action comes from "action" (falling back to "actions"), and the
    effect from "output" with surrounding parentheses removed. Any missing
    field is rendered as "NA".
    """
    fallback_input = step.get("ingredient", "NA")
    raw_input = step.get("input", fallback_input)
    first_ingredient = raw_input.strip("()").split(";")[0]

    fallback_action = step.get("actions", "NA")
    action = step.get("action", fallback_action)

    effect = step.get("output", "NA").strip("()")
    return "{} | {} → {}".format(first_ingredient, action, effect)

# -- flatten all steps out of the first N_SAMPLES recipes --
rows = [step for recipe in dataset for step in extract_rows(recipe)]

# One document per step, embedded with OpenAIEmbeddings and indexed with FAISS.
docs = [Document(page_content=step_to_text(r)) for r in rows]
vectorstore = FAISS.from_documents(docs, OpenAIEmbeddings())
# The number of hits has to be passed through `search_kwargs`; a bare `k=3`
# keyword is not a search setting and does not limit results to three.
retriever   = vectorstore.as_retriever(search_kwargs={"k": 3})

print(f"Indexed {len(docs)} step-level triples")


Indexed 576 step-level triples


In [31]:
# 🧠 ReAct prompt helpers
import openai, re, json, time
# Copy the key from the environment variable onto the openai module.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Shared client for the chat-completion calls in this notebook. It is built
# with no explicit arguments (presumably it reads OPENAI_API_KEY itself — confirm).
client = openai.OpenAI()

# 🧠 ── ReAct SYSTEM prompt  ──────────────────────────────────────────
# Instruction text placed at the very start of every prompt. It describes the
# Thought / Action / Observation format, the single Search("…") tool, and the
# "Final Answer:" line whose presence ends the reasoning loop.
SYSTEM = """
You are a PizzaCommonSense assistant.

• Follow the ReAct format:
  Thought: (your reasoning)
  Action:  Search("…")           # optional
  Observation: (tool result)
  Repeat Thought / Action / Observation as needed.

• When you have the result, output **exactly one line**:
  Final Answer: <state_token>

<state_token> rules
1. If the instruction produces a food-state result
     – Use the snake_case token that matches the dataset
       e.g. oil_drizzled_on_pizza_shell
2. If the instruction is equipment-only (preheating, timer set, etc.)
     – Output the literal token **NA** (all caps), nothing else.
3. Do NOT add explanations or extra punctuation after the token.
"""

# 🧩 1-shot demo (keep short and clear) ───────────────────────────────
# Single worked example that follows SYSTEM in every prompt.
# NOTE: DEMO used to be assigned twice in this cell; the later assignment
# silently replaced an earlier two-example variant (which also showed an "NA"
# case). Only the value that was actually in effect is kept, so prompts are
# unchanged.
DEMO = """
Question: drizzle shell with oil
Thought: I know drizzle means pour a thin stream of oil over the shell.
Final Answer: oil_drizzled_on_pizza_shell
"""

# Stop sequence passed to the LLM so generation ends before an "Observation:"
# line; observations are supplied by the loop, not by the model.
STOP_TOKENS = ["Observation:"]
# Marker that introduces the final state token in a model reply.
FINAL_TAG   = "Final Answer:"


def assemble_prompt(q, scratch):
    """Build the full text prompt for one ReAct turn.

    Layout: SYSTEM, DEMO, the question, then the scratchpad accumulated so
    far. The question comes *before* the scratchpad so the model continues the
    existing Thought / Action / Observation trace for that question, matching
    the Question → Thought → … order shown in DEMO. (Appending the question
    after the scratchpad makes the prompt end on a fresh "Question:" line, as
    if no reasoning had happened yet.)

    Args:
        q: The instruction being asked about.
        scratch: List of earlier model replies and "Observation: …" lines for
            this question; may be empty.

    Returns:
        The prompt string, always terminated by a newline.
    """
    parts = [SYSTEM, "\n", DEMO, "\n", f"Question: {q}\n"]
    if scratch:
        parts.append("\n".join(scratch) + "\n")
    return "".join(parts)


def llm_chat(prompt, stop=None, temp=0.2, max_t=256, model="gpt-4.1"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as one "user" message.
        stop: Optional list of stop sequences forwarded to the API.
        temp: Sampling temperature.
        max_t: Maximum number of tokens to generate.
        model: Chat model name; defaults to the model this notebook was
            written against.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or "" when the API returns no text content.
    """
    res = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temp,
        max_tokens=max_t,
        stop=stop,
    )
    # `content` is optional in the response; normalise a missing value to ""
    # so callers always receive a string instead of hitting AttributeError.
    content = res.choices[0].message.content
    return (content or "").strip()



In [32]:
# Smoke test: one minimal chat request to confirm the client and model name
# work before running the evaluation.
test = client.chat.completions.create(
    model="gpt-4.1",
    messages=[{"role": "user", "content": "Say hi"}],
)
print(test.choices[0].message.content)


Hi! 😊 How can I help you today?


In [33]:
# 🚀 Run ReAct on a single question
def _parse_search_query(action):
    """Pull the bare query text out of an action such as ``Search("sauté garlic")``."""
    inner = action[len("Search("):]
    close = inner.rfind(")")
    if close != -1:  # tolerate a missing ")" or trailing text after it
        inner = inner[:close]
    # Drop surrounding quotes so they are not searched as part of the query.
    return inner.strip().strip("\"'“”").strip()


def run_react(question, max_steps=5):
    """Run the Thought / Action / Observation loop for one question.

    Each iteration sends the prompt (question plus scratchpad) to the LLM.
    A reply containing the final-answer marker ends the loop; otherwise the
    reply's ``Action:`` line is executed and its result is appended to the
    scratchpad as an ``Observation:`` line.

    Args:
        question: Instruction text to ask about.
        max_steps: Maximum number of LLM calls before giving up.

    Returns:
        ``(answer, trace)``. ``answer`` is the text after the last
        final-answer marker, or "N/A" if none was produced within
        ``max_steps``. ``trace`` is the ordered list of model replies and
        observations.
    """
    scratch = []
    for _ in range(max_steps):
        prompt = assemble_prompt(question, scratch)
        resp   = llm_chat(prompt, stop=STOP_TOKENS)

        if FINAL_TAG in resp:
            return resp.split(FINAL_TAG)[-1].strip(), scratch + [resp]

        # --- Extract & execute Action ---
        action_match = re.search(r"Action:\s*(.+)", resp)
        if action_match:
            action = action_match.group(1).strip()
            if action.startswith("Search("):                 # e.g. Search("sauté garlic")
                query = _parse_search_query(action)
                if query:
                    # `invoke` is the current retriever entry point; it
                    # replaces the deprecated `get_relevant_documents`.
                    hits = retriever.invoke(query)
                    obs  = "; ".join(d.page_content for d in hits) or "No results"
                else:
                    # Nothing to search for; skip the retriever call.
                    obs = "No results"
            else:
                obs = "Unsupported action"
        else:
            obs = "No action"

        scratch += [resp, f"Observation: {obs}"]

    return "N/A", scratch



In [34]:
# 📊 Evaluate loop (step-level)
from tqdm import notebook
import pandas as pd

# --- ❶ Expand recipes → steps -----------------------------------------
# Uses the extract_rows() helper defined in the index-building cell; the
# identical copy that was redefined here has been dropped so the notebook has
# a single definition.
step_rows = [step for rec in dataset for step in extract_rows(rec)]   # ← 50 recipes

print("Total steps:", len(step_rows))     # e.g. 50 recipes × ~8 steps on average

# --- ❷ Run ReAct on each step's instruction ---------------------------
results = []
for step in notebook.tqdm(step_rows):
    q = step["instructions"]                # question text
    gt = step["output"]                     # ground truth (effect)

    answer, trace = run_react(q)            # run ReAct

    results.append({
        "question":      q,
        "react_answer":  answer,
        "ground_truth":  gt,
        "trace":         "\n".join(trace)
    })

df = pd.DataFrame(results)
df.head()



Total steps: 576


  0%|          | 0/576 [00:00<?, ?it/s]

Unnamed: 0,question,react_answer,ground_truth,trace
0,preheat oven to 500 degrees .,,,Final Answer: NA
1,put the pizza_shell or possibly focaccia onto ...,pizza_shell_placed_on_pan,pizza_shell,Final Answer: pizza_shell_placed_on_pan
2,drizzle shell with oil,oil_drizzled_on_pizza_shell,oil_drizzled_on pizza_shell,Final Answer: oil_drizzled_on_pizza_shell
3,scatter chopped_garlic over shell to edges .,chopped_garlic_scattered_on_pizza_shell,chopped_garlic_scattered_on_pizza shell,Final Answer: chopped_garlic_scattered_on_pizz...
4,cover the pie with a layer of all 3 cheeses co...,cheese_layered_on_pizza,pizza_shell_covered_with_cheeses,Final Answer: cheese_layered_on_pizza


In [35]:
# 💾 Save results to a timestamped CSV under /content (no download link is created in this cell)
import pandas as pd, datetime, pathlib, IPython

# A timestamp in the filename keeps earlier runs from being overwritten.
timestamp   = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
csv_path    = f"/content/react_eval_{timestamp}.csv"

# index=False: write only the result columns, not the DataFrame index.
df.to_csv(csv_path, index=False)
print(f"Saved → {csv_path}")




Saved → /content/react_eval_20250802_165826.csv
