In [91]:
# === セル1: Google Drive のマウント & ファイルパス設定 ===
from google.colab import drive
# 初回のみ実行後、ブラウザの認証に従ってログインしてください
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [92]:
# txt ファイルは、Google Drive の下記パスに配置してください。
# 例: /MyDrive/pizza_recipes/artichoke_pizza.txt
FILE_PATH = '/content/drive/MyDrive/pizza_recipes/artichoke_pizza.txt'
print(f"Reading recipe from: {FILE_PATH}")

Reading recipe from: /content/drive/MyDrive/pizza_recipes/artichoke_pizza.txt


In [93]:

# === セル2: ライブラリ読み込み & OpenAI クライアント初期化 ===
import json
import re
from getpass import getpass
from openai import OpenAI

# 安全のため、Notebook 実行時にキーを入力する方式を推奨します
api_key = getpass('Enter your OpenAI API key: ')
client = OpenAI(api_key=api_key)

Enter your OpenAI API key: ··········


In [94]:
# === セル3: レシピデータ読み込み ===
with open(FILE_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

print("Title:", data.get("title"))
print("Number of steps:", len(data.get("table", [])))

Title: artichoke_pizza
Number of steps: 4


In [95]:
def cot_prompt(instruction, action, prev_out="NA"):
    # Few-shot 例を先頭に
    shot = """
Example:
Instruction: mix flour and water
Action: mix
<in>: (flour; water)
<out>: dough

Instruction: spread tomato_sauce on dough
Action: spread
<in>: (dough; tomato_sauce)
<out>: sauced dough

"""
    prev = f"Previous output: {prev_out}\n" if prev_out!="NA" else ""
    prompt = (
        shot +
        f"Instruction: {instruction}\n"
        f"Action: {action}\n"
        f"{prev}"
        "Please think step by step, but OUTPUT ONLY TWO NOUN PHRASES:\n"
        "  – <in>: a parenthesized semicolon-separated list of ingredients/intermediates\n"
        "  – <out>: a single noun phrase (no verbs, no full sentences)\n"
        "If this step does not involve any food items, output <in>: NA and <out>: NA.\n\n"
        "<in>:"
    )
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo", temperature=0.0, max_tokens=150,
        messages=[
            {"role":"system","content":"You are a concise cooking assistant."},
            {"role":"user","content":prompt}
        ]
    )
    return resp.choices[0].message.content


In [96]:
# === セル5: 全ステップに適用して結果取得 & 表示 ===
results = []
for i, row in enumerate(data.get("table", []), start=1):
    inst = row.get("instructions")
    act  = row.get("actions")
    print(f"\n=== Step {i}: {inst} ===")
    cot_output = run_cot_step(inst, act)
    print(cot_output)
    results.append({"step": i, "cot_output": cot_output})


=== Step 1: heat up your oven as hot as it gets . ===
<in>: NA</in>
<out>: NA</out>

=== Step 2: spread tomato_sauce evenly to the both pizzas . ===
<in>: dough; tomato_sauce</in>
<out>: sauced dough</out>

=== Step 3: spread the mushrooms , artichokes and olives on the pizza . ===
<in>: mushrooms; artichokes; olives</in>
<out>: topped dough</out>

=== Step 4: bake in the oven for about 10 minutes or longer , depending on how hot your oven is . ===
<in>: topped dough</in>
<out>: baked pizza</out>


In [97]:
# === セル6: <in> / <out> の自動抽出（正規表現更新） ===
# クローズタグ </in> または誤記ケース <\in> に対応
print("\n=== Parsed <in> and <out> from CoT outputs ===")
for r in results:
    text = r['cot_output']
    # <in>タグの抽出: </in> or <\in>
    in_m  = re.search(r"<in>\s*(.*?)\s*(?:</in>|<\\in>)", text)
    out_m = re.search(r"<out>\s*(.*?)\s*(?:</out>|<\\out>)", text)
    inp   = in_m.group(1).strip() if in_m else ''
    outp  = out_m.group(1).strip() if out_m else ''
    print(f"Step {r['step']}  Input: {inp}\n           Output: {outp}\n")


=== Parsed <in> and <out> from CoT outputs ===
Step 1  Input: : NA
           Output: : NA

Step 2  Input: : dough; tomato_sauce
           Output: : sauced dough

Step 3  Input: : mushrooms; artichokes; olives
           Output: : topped dough

Step 4  Input: : topped dough
           Output: : baked pizza



In [98]:
!pip install bert-score



In [99]:
# === Cell 10: Evaluation Metrics（修正版） ===
# !pip install bert-score   # 初回のみ

from bert_score import score
import numpy as np

# gold と pred のリストを準備
gold_ins  = [step['input']  for step in data['table']]
gold_outs = [step['output'] for step in data['table']]

# corr_in があれば使い、なければ pred_in を使う
pred_ins  = [r.get('corr_in', r.get('pred_in', 'NA'))  for r in results]
pred_outs = [r.get('corr_out', r.get('pred_out','NA')) for r in results]

# 1) Exact Match Accuracy
acc_in  = np.mean([g==p for g,p in zip(gold_ins,  pred_ins)])
acc_out = np.mean([g==p for g,p in zip(gold_outs, pred_outs)])
print(f"Exact Match Input:  {acc_in:.2%}")
print(f"Exact Match Output: {acc_out:.2%}")

# 2) Token-level F1
def token_f1(gold, pred):
    gset = set(gold.split(';')) if gold!='NA' else set()
    pset = set(pred.split(';')) if pred!='NA' else set()
    if not gset and not pset: return 1.0
    tp = len(gset & pset)
    prec = tp/len(pset) if pset else 0.0
    rec  = tp/len(gset) if gset else 0.0
    return 2*prec*rec/(prec+rec+1e-12)

f1_in  = np.mean([token_f1(g,p) for g,p in zip(gold_ins,  pred_ins)])
f1_out = np.mean([token_f1(g,p) for g,p in zip(gold_outs, pred_outs)])
print(f"Token F1 Input:  {f1_in:.2%}")
print(f"Token F1 Output: {f1_out:.2%}")

# 3) BERTScore (Output だけで OK)
P, R, F1 = score(pred_outs, gold_outs, lang='en', rescale_with_baseline=True)
print(f"BERTScore Output F1: {F1.mean():.2%}")



Exact Match Input:  25.00%
Exact Match Output: 25.00%
Token F1 Input:  25.00%
Token F1 Output: 25.00%


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Output F1: 15.03%


In [100]:
# === Cell A1: Install & login via CLI ===
!pip install huggingface-hub
!huggingface-cli login

# When prompted, paste your HF access token.



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write

In [101]:
# === Cell A3: Direct model loading ===
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model     = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

inputs = tokenizer("Hello!", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Hello! I'm interested in the [insert location] location of [insert store type, e.g. Starbucks


In [102]:
# === Cell 12′: CoT helper with richer Few-Shot examples ===
import torch, re

device = model.device

def cot_hf_direct(instruction, action, prev_out="NA"):
    # 1) 豊富な Few-Shot を先頭に
    shot = """
Example 1:
Instruction: preheat oven to 350°F
Action: preheat
<in>: NA
<out>: NA

Example 2:
Instruction: mix flour and water
Action: mix
<in>: (flour; water)
<out>: dough

Example 3:
Instruction: spread tomato_sauce on dough
Action: spread
<in>: (dough; tomato_sauce)
<out>: sauced dough

Example 4:
Instruction: spread mushrooms, artichokes and olives on sauced dough
Action: spread
<in>: (dough; tomato_sauce; mushrooms; artichokes; olives)
<out>: topped dough

Example 5:
Instruction: bake topped dough at 220°C for 10 minutes
Action: bake
<in>: (topped dough)
<out>: baked pizza

"""

    # 2) 実際のプロンプト生成
    prompt = shot
    prompt += f"Now, follow this pattern.\n\nInstruction: {instruction}\n"
    prompt += f"Action: {action}\n"
    if prev_out != "NA":
        prompt += f"Previous output: {prev_out}\n"
    prompt += (
        "Think step by step, but OUTPUT ONLY two noun phrases:\n"
        "  – <in>: (...)  \n"
        "  – <out>: noun_phrase\n\n"
        "<in>:"
    )

    prompt += (     "Think step by step, but OUTPUT EXACTLY one <in> and one <out> pair, "
      "and then STOP. Do not generate anything beyond the <out> line.\n\n"
     "<in>:"   )

    # 3) Tokenize
    inputs = tokenizer(prompt, return_tensors="pt",
                       padding=False, truncation=False).to(device)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    # 4) Generate
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # 5) 生成部分だけデコード
    gen = outputs[0][input_ids.size(1):]
    text = tokenizer.decode(gen, skip_special_tokens=True)
    return text.strip()


In [103]:
# === Cell 13′: Run CoT with the direct‐model helper ===
results_hf = []
prev = "NA"

for i, row in enumerate(data["table"], 1):
    instr = row["instructions"]
    act   = row.get("actions", row.get("action"))
    cot_out = cot_hf_direct(instr, act, prev)
    # parse
    ins = re.findall(r"<in>\s*(.*?)\s*</in>",  raw)
    outs= re.findall(r"<out>\s*(.*?)\s*</out>", raw)
    pred_in  = ins[0].strip()  if ins  else "NA"
    pred_out = outs[0].strip() if outs else "NA"
    results_hf.append({"pred_in":pred_in, "pred_out":pred_out})
    prev = pred_out
    print(f"Step {i}: in=({pred_in})  out=({pred_out})")


Step 1: in=(NA)  out=(NA)
Step 2: in=(NA)  out=(NA)
Step 3: in=(NA)  out=(NA)
Step 4: in=(NA)  out=(NA)


In [104]:
# === Cell 13_debug: Raw CoT 出力のデバッグ確認 ===
# ※cot_hf_direct が定義されている Cell 12′ の次に実行してください

import re

# Step 1 のみ raw 出力を確認

instr = data["table"][0]["instructions"]
act   = data["table"][0].get("actions", data["table"][0].get("action"))
raw = cot_hf_direct(instr, act, prev_out="NA")

print("--- RAW OUTPUT for Step 1 ---")
print(raw)

instr = data["table"][1]["instructions"]
act   = data["table"][1].get("actions", data["table"][1].get("action"))
raw = cot_hf_direct(instr, act, prev_out="NA")

print("--- RAW OUTPUT for Step 2 ---")
print(raw)

instr = data["table"][2]["instructions"]
act   = data["table"][2].get("actions", data["table"][2].get("action"))
raw = cot_hf_direct(instr, act, prev_out="NA")

print("--- RAW OUTPUT for Step 3 ---")
print(raw)

instr = data["table"][3]["instructions"]
act   = data["table"][3].get("actions", data["table"][3].get("action"))
raw = cot_hf_direct(instr, act, prev_out="NA")

print("--- RAW OUTPUT for Step 4 ---")
print(raw)

--- RAW OUTPUT for Step 1 ---
oven
<out>: hot oven

<in>: hot oven
<out>: extremely hot oven

<in>: extremely hot oven
<out>: scorching oven

<in>: scorching oven
<out>: blazing oven

<in>: blazing oven
<out>: fiery oven

<in>: fiery oven
<out>: extremely hot oven

<in>: extremely hot oven
<out>: oven

<in>: oven
<out>: hot oven

<in
--- RAW OUTPUT for Step 2 ---
(topped_pizza; topped_pizza)
<out>: both_pizzas

The final answer is: both_pizzas
--- RAW OUTPUT for Step 3 ---
(pizza; mushrooms, artichokes and olives)
<out>: topped pizza
--- RAW OUTPUT for Step 4 ---
oven
<out>: baking oven

Note: The instruction is not to bake the oven, but to bake in the oven. Hence, the noun phrase is "baking oven". 

The instruction is not to bake the oven, but to bake in the oven. Hence, the noun phrase is "baking oven". 

The instruction is not to bake the oven, but to bake in the oven. Hence, the noun phrase is "baking oven". 

The instruction is not to bake


In [105]:
# === Cell 14: Evaluate HF vs GPT-4.1 (and any other) ===

from bert_score import score
import numpy as np
import pandas as pd

# Gold の入出力
gold_ins  = [step["input"]  for step in data["table"]]
gold_outs = [step["output"] for step in data["table"]]

# 既存の GPT-4.1 結果（results 変数内の corr_*）を抽出
g4_ins  = [r.get("corr_in", r.get("pred_in", "NA"))  for r in results]
g4_outs = [r.get("corr_out", r.get("pred_out","NA")) for r in results]

# HF モデルの予測
hf_ins  = [r["pred_in"]  for r in results_hf]
hf_outs = [r["pred_out"] for r in results_hf]

def exact_match(gold, pred):
    return np.mean([g==p for g,p in zip(gold, pred)])

def token_f1(gold, pred):
    gset = set(gold.split(";")) if gold!="NA" else set()
    pset = set(pred.split(";")) if pred!="NA" else set()
    if not gset and not pset: return 1.0
    tp   = len(gset & pset)
    prec = tp/len(pset) if pset else 0.0
    rec  = tp/len(gset) if gset else 0.0
    return 2*prec*rec/(prec+rec+1e-12)

# 計算
records = []
for name, (pred_i, pred_o) in [
    ("gpt-4o", (g4_ins,  g4_outs)),
    ("llama-3.2-3B",       (hf_ins, hf_outs))
]:
    em_in  = exact_match(gold_ins,  pred_i)
    em_out = exact_match(gold_outs, pred_o)
    f1_in  = np.mean([token_f1(g,p) for g,p in zip(gold_ins,  pred_i)])
    f1_out = np.mean([token_f1(g,p) for g,p in zip(gold_outs, pred_o)])
    # BERTScore only on outputs
    P,R,F1 = score(pred_o, gold_outs, lang="en", rescale_with_baseline=True)
    records.append({
        "model":      name,
        "EM_in":      f"{em_in:.2%}",
        "EM_out":     f"{em_out:.2%}",
        "TokenF1_in": f"{f1_in:.2%}",
        "TokenF1_out":f"{f1_out:.2%}",
        "BERT_F1":    f"{F1.mean():.2%}"
    })

df_comp = pd.DataFrame(records).set_index("model")
df_comp


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0_level_0,EM_in,EM_out,TokenF1_in,TokenF1_out,BERT_F1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gpt-4o,25.00%,25.00%,25.00%,25.00%,15.03%
llama-3.2-3B,25.00%,25.00%,25.00%,25.00%,15.03%


In [106]:
# === Cell Q1: Load Qwen tokenizer & model ===
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

MODEL_NAME = "Qwen/Qwen3-8B"

tokenizer_q = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)
model_q     = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Qwen も pad_token を eos_token に合わせる
tokenizer_q.pad_token    = tokenizer_q.eos_token
model_q.config.pad_token_id = tokenizer_q.eos_token_id

device_q = model_q.device
print(f"Qwen model loaded on {device_q}")


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Qwen model loaded on cuda:0


In [127]:
def cot_qwen_direct(instruction, action, prev_out="NA"):
    # --- 強化版 Few-Shot + 厳格フォーマット指示 ---
    few_shot = """
Example 1:
Instruction: preheat oven to 350°F
Action: preheat
<in>: NA
<out>: NA

Example 2:
Instruction: mix flour and water
Action: mix
<in>: (flour; water)
<out>: dough

Example 3:
Instruction: spread tomato_sauce on dough
Action: spread
<in>: (dough; tomato_sauce)
<out>: sauced dough

Example 4:
Instruction: spread mushrooms, artichokes and olives on sauced dough
Action: spread
<in>: (dough; tomato_sauce; mushrooms; artichokes; olives)
<out>: topped dough

Example 5:
Instruction: bake topped dough at 220°C for 10 minutes
Action: bake
<in>: (topped dough)
<out>: baked pizza
"""

    # ↓↓↓ ここをガッチリ書き換え ↓↓↓
    prompt = few_shot + "\n"
    prompt += f"Now complete **only** the next <in> and <out> lines and then STOP. Do NOT generate any other text.\n\n"
    prompt += f"Instruction: {instruction}\n"
    prompt += f"Action: {action}\n"
    if prev_out!="NA":
        prompt += f"Previous output: {prev_out}\n"
    prompt += "<in>:"  # モデルにここから始めさせる

    inputs = tokenizer_q(prompt, return_tensors="pt", padding=False, truncation=False).to(device_q)
    outputs = model_q.generate(
        **inputs,
        generation_config=GenerationConfig(
            max_new_tokens=20,   # フォーマット行だけ返せば十分
            do_sample=False,      # Greedy にして確実に一度だけ
            eos_token_id=tokenizer_q.eos_token_id,
            pad_token_id=tokenizer_q.eos_token_id,
        ),
    )
    # プロンプト部を除いた生テキスト
    gen_tokens = outputs[0][ inputs["input_ids"].shape[1] : ]
    raw_text = tokenizer_q.decode(gen_tokens, skip_special_tokens=True)
    print(raw_text)

    if "</out>" in raw_text:
        # splitして最初の部分＋閉じタグを再付加
        raw_text = raw_text.split("</out>", 1)[0] + "</out>"

    # 最初の１組だけパース
    ins = re.findall(r"<in>\s*(.*?)\s*</in>",  text)
    outs= re.findall(r"<out>\s*(.*?)\s*</out>", text)
    return (ins[0].strip() if ins else "NA",
            outs[0].strip() if outs else "NA")


In [128]:
# === Cell Q3: Run CoT on all steps with Qwen ===
results_qwen = []
prev = "NA"

for i, row in enumerate(data["table"], 1):
    instr = row["instructions"]
    act   = row.get("actions", row.get("action"))

    # Qwen で CoT 推論
    pi, po = cot_qwen_direct(instr, act, prev)
    print(pi)
    print(po)
    results_qwen.append({"pred_in":pi, "pred_out":po})

    # 次ステップへ受け渡し
    prev = po

# 確認出力
for i, r in enumerate(results_qwen, 1):
    print(f"Step {i}: in=({r['pred_in']})  out=({r['pred_out']})")


 (oven)
<out>: hot oven
Okay, let me try to figure this out. The
: topped dough
: baked pizza
 (baked pizza; tomato_sauce)
<out>: sauced pizza
Okay, let's
: topped dough
: baked pizza
 (baked pizza; mushrooms; artichokes; olives)
<out>: topped pizza

: topped dough
: baked pizza
 (baked pizza)
<out>: done pizza
Okay, let me try to figure this out
: topped dough
: baked pizza
Step 1: in=(: topped dough)  out=(: baked pizza)
Step 2: in=(: topped dough)  out=(: baked pizza)
Step 3: in=(: topped dough)  out=(: baked pizza)
Step 4: in=(: topped dough)  out=(: baked pizza)


In [120]:
# === Cell Q2_debug: Full raw-output helper for Qwen CoT ===

from transformers import GenerationConfig
import re, torch

# 1) あらかじめ一度だけ生成設定を作成しておく
qwen_gen_cfg = GenerationConfig(
    max_new_tokens=100,
    do_sample=True,
    temperature=0.2,
    top_p=0.9,
    top_k=50,
    eos_token_id=tokenizer_q.eos_token_id,
    pad_token_id=tokenizer_q.eos_token_id,
)

def cot_qwen_direct_raw(instruction, action, prev_out="NA"):
    """
    QwenモデルでCoTを実行し、rawに生成されたテキストをそのまま返します。
    デバッグ用途に使って、何が出力されているかを丸ごと確認できます。
    """
    # --- Few-Shot Examples ---
    few_shot = """
Example 1:
Instruction: preheat oven to 350°F
Action: preheat
<in>: NA
<out>: NA

Example 2:
Instruction: mix flour and water
Action: mix
<in>: (flour; water)
<out>: dough

Example 3:
Instruction: spread tomato_sauce on dough
Action: spread
<in>: (dough; tomato_sauce)
<out>: sauced dough

Example 4:
Instruction: spread mushrooms, artichokes and olives on sauced dough
Action: spread
<in>: (dough; tomato_sauce; mushrooms; artichokes; olives)
<out>: topped dough

Example 5:
Instruction: bake topped dough at 220°C for 10 minutes
Action: bake
<in>: (topped dough)
<out>: baked pizza

"""

    # --- プロンプト組み立て ---
    prompt  = few_shot
    prompt += "Now complete for this step, then STOP:\n\n"
    prompt += f"Instruction: {instruction}\n"
    prompt += f"Action: {action}\n"
    if prev_out != "NA":
        prompt += f"Previous output: {prev_out}\n"
    prompt += "\n<in>:"

    # --- トークナイズ ---
    inputs = tokenizer_q(
        prompt,
        return_tensors="pt",
        padding=False,
        truncation=False
    ).to(device_q)

    # --- 生成実行 ---
    outputs = model_q.generate(
        **inputs,
        generation_config=qwen_gen_cfg,
    )

    # --- プロンプト部分を切り落とし、生成部分だけデコード ---
    gen_tokens = outputs[0][ inputs["input_ids"].shape[1] : ]
    raw_text   = tokenizer_q.decode(gen_tokens, skip_special_tokens=False)
    cut = raw_text.split("</out>", 1)[0] + "</out>"

    return cut

# --- デバッグ例の呼び出し ---
instr = data["table"][0]["instructions"]
act   = data["table"][0].get("actions", data["table"][0].get("action"))
raw = cot_qwen_direct_raw(instr, act, prev_out="NA")
print("=== FULL RAW QWEN OUTPUT ===")
print(raw)

instr = data["table"][1]["instructions"]
act   = data["table"][1].get("actions", data["table"][0].get("action"))
raw = cot_qwen_direct_raw(instr, act, prev_out="NA")
print("=== FULL RAW QWEN OUTPUT ===")
print(raw)

instr = data["table"][2]["instructions"]
act   = data["table"][2].get("actions", data["table"][0].get("action"))
raw = cot_qwen_direct_raw(instr, act, prev_out="NA")
print("=== FULL RAW QWEN OUTPUT ===")
print(raw)

instr = data["table"][3]["instructions"]
act   = data["table"][3].get("actions", data["table"][0].get("action"))
raw = cot_qwen_direct_raw(instr, act, prev_out="NA")
print("=== FULL RAW QWEN OUTPUT ===")
print(raw)

=== FULL RAW QWEN OUTPUT ===
 (oven)
<out>: hot oven
Okay, let me try to figure this out. The user provided some examples of how to process cooking instructions into actions with inputs and outputs. Now, the current instruction is "heat up your oven as hot as it gets." 

First, I need to identify the action. The examples show that actions are usually verbs like preheat, mix, spread, bake. Here, the main verb is "heat up," so the action should be "heat."

</out>
=== FULL RAW QWEN OUTPUT ===
 (topped dough; topped dough)
<out>: ?

Okay, let's see. The user wants me to figure out the correct action and output for the instruction "spread tomato_sauce evenly to the both pizzas." 

First, looking at the previous examples. In Example 2, mixing flour and water gives dough. Then spreading tomato sauce on dough becomes sauced dough. Example 4 adds multiple toppings, resulting in topped dough. Then baking that becomes baked pizza.

But here, the instruction</out>
=== FULL RAW QWEN OUTPUT ===
 (ba

In [117]:
# === Cell Q3: Run CoT on all steps with Qwen ===
results_qwen = []
prev = "NA"

for i, row in enumerate(data["table"], 1):
    instr = row["instructions"]
    act   = row.get("actions", row.get("action"))

    # Qwen で CoT 推論
    pi, po = cot_qwen_direct_raw(instr, act, prev)
    results_qwen.append({"pred_in":pi, "pred_out":po})

    # 次ステップへ受け渡し
    prev = po

# 確認出力
for i, r in enumerate(results_qwen, 1):
    print(f"Step {i}: in=({r['pred_in']})  out=({r['pred_out']})")

ValueError: too many values to unpack (expected 2)

In [113]:
# === Cell Q3_debug: Inspect raw Qwen outputs ===
# Make sure this runs after your cot_qwen_direct definition (Cell Q2).

# Pick just the first step to sanity-check
first_step = data["table"][0]
instr = first_step["instructions"]
act   = first_step.get("actions", first_step.get("action"))

# Call the helper and capture its raw decode
raw_in, raw_out = cot_qwen_direct(instr, act, prev_out="NA")

print("=== QWEN RAW IN ===")
print(raw_in)
print("=== QWEN RAW OUT ===")
print(raw_out)

first_step = data["table"][1]
instr = first_step["instructions"]
act   = first_step.get("actions", first_step.get("action"))

# Call the helper and capture its raw decode
raw_in, raw_out = cot_qwen_direct(instr, act, prev_out="NA")

print("=== QWEN RAW IN ===")
print(raw_in)
print("=== QWEN RAW OUT ===")
print(raw_out)

first_step = data["table"][2]
instr = first_step["instructions"]
act   = first_step.get("actions", first_step.get("action"))

# Call the helper and capture its raw decode
raw_in, raw_out = cot_qwen_direct(instr, act, prev_out="NA")

print("=== QWEN RAW IN ===")
print(raw_in)
print("=== QWEN RAW OUT ===")
print(raw_out)

first_step = data["table"][3]
instr = first_step["instructions"]
act   = first_step.get("actions", first_step.get("action"))

# Call the helper and capture its raw decode
raw_in, raw_out = cot_qwen_direct(instr, act, prev_out="NA")

print("=== QWEN RAW IN ===")
print(raw_in)
print("=== QWEN RAW OUT ===")
print(raw_out)


=== QWEN RAW IN ===
NA
=== QWEN RAW OUT ===
NA
=== QWEN RAW IN ===
NA
=== QWEN RAW OUT ===
NA
=== QWEN RAW IN ===
NA
=== QWEN RAW OUT ===
NA
=== QWEN RAW IN ===
NA
=== QWEN RAW OUT ===
NA
