In [1]:
# --- Reinstall libraries ---
# Remove whatever openai build is already present first, then install the pinned
# 0.27.2 release: later cells call openai.ChatCompletion and openai.error.
!pip uninstall -y openai -q        # ← いったん削除
!pip -q install openai==0.27.2 tqdm rouge-score evaluate pandas tiktoken



In [2]:
# Print the installed openai version to confirm the pin from the install cell took effect.
!pip show openai | grep Version


Version: 0.27.2


In [3]:
# Import required modules
import os
import json
import time
import re
import getpass
from pathlib import Path
from typing import Dict, List, Iterator, Tuple

import pandas as pd
import numpy as np
from tqdm import tqdm
import openai
import evaluate

# Reaching this line means every import in this cell succeeded.
print("✅ All libraries imported successfully!")


✅ All libraries imported successfully!


In [4]:
# Secure API key input: getpass keeps the key out of the notebook source and output.
print("🔑 Please enter your OpenAI API key:")
# Strip surrounding whitespace so a stray space or newline picked up while
# copy-pasting is not stored as part of the key; a whitespace-only entry then
# counts as "not set" instead of being reported as configured.
os.environ["OPENAI_API_KEY"] = getpass.getpass("API Key: ").strip()

# Verify API key is set (an empty string is falsy)
if os.environ.get("OPENAI_API_KEY"):
    print("✅ API key configured successfully!")
else:
    print("❌ API key not set. Please run this cell again.")


🔑 Please enter your OpenAI API key:
API Key: ··········
✅ API key configured successfully!


In [5]:
# Mount Google Drive so the dataset folder under /content/drive/MyDrive is readable.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): this lists the whole Drive root, and the listing (personal file
# names included) is stored in the notebook output — consider clearing that
# output or listing only the dataset folder before sharing the notebook.
!ls /content/drive/MyDrive


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
'2019世界各国官製地図発表（3E17関淵之） (1).pptx'
 2019世界各国官製地図発表（3E17関淵之）.pptx
 2019世界各国官製地図発表aa（3E17関淵之）.pptx
'Activity Diagram.drawio.png'
 adult.zip
 bc_01-17.zip
 bc_18-23.zip
 bc_24-27.zip
 bc_28-33.zip
 BGMshuu
 blind_test
'Career Event by FAST OFFER.gform'
'Class Diagram.drawio.png'
'Class Diagram_White.drawio.png'
 Colab
'Colab Notebooks'
 Component_sht.drawio.png
'Coronavirus (COVID-19) records (1).pdf'
 CV_Fuchiyuki.docx
 data_weights
'Emailing Offer Letter View.pdf'
'Extracting Actionable Knowledge from Cooking Recipes: An LLM Approach to Commonsense Reasoning and Graphical RepresentationI.gdoc'
'family mart in oomiya.csv'
'File to replace '
'Fuchiyuki Seki.pdf'
 GIS発表1A18関淵之.pptx
'Google AI Studio'
 hihihi
 IMG_1826.MOV
 IMG_1827.MOV
 IMG_9927.jpeg
 japan_ver80_prefecture.shp
'Khash Bonus'
'Level_3_Week_14_Free_Lesson_copy (1).gslides'
 Level_3_Week_14_Free_Les

In [6]:
from pathlib import Path
import json
import numpy as np

class PizzaDataLoader:
    """Folder traversal and JSON parsing for the PizzaCommonSense dataset.

    Every ``*.txt`` file directly inside ``data_path`` is parsed as JSON. A file
    contributes the entries of its top-level ``"table"`` list that are dicts
    carrying all of ``REQUIRED_KEYS``.

    The default ``data_path`` is the ``MyDrive/train`` folder of a mounted
    Google Drive.
    """

    # Keys a step must provide to be yielded by iter_tables().
    REQUIRED_KEYS = ('instructions', 'actions', 'input', 'output')

    def __init__(self, data_path: str = "/content/drive/MyDrive/train"):
        """Remember ``data_path`` and fail early when it does not exist.

        Raises:
            FileNotFoundError: if ``data_path`` is missing.
        """
        self.data_path = Path(data_path)
        if not self.data_path.exists():
            raise FileNotFoundError(f"Data path {data_path} does not exist")
        print(f"✅ Using data path: {self.data_path}")

    def iter_tables(self, split: str = None) -> Iterator[dict]:
        """Yield every valid recipe step found in the data folder.

        Args:
            split: Accepted for call-site compatibility and ignored; the loader
                always reads the folder given to the constructor.

        Yields:
            dict: one step at a time, in sorted file-name order and then in
            table order.
        """
        # glob() gives no ordering guarantee; sort so that "the first N steps"
        # (used by the sampling cells) is the same set on every run.
        txt_files = sorted(self.data_path.glob("*.txt"))
        print(f"Found {len(txt_files)} recipe files in {self.data_path.name}")

        for file_path in txt_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both JSON syntax errors and bad UTF-8.
                print(f"⚠️  Skipping {file_path.name}: {e}")
                continue

            # Files whose JSON is not an object with a list under "table"
            # contribute nothing (same silent skip as before).
            table = data.get('table') if isinstance(data, dict) else None
            if not isinstance(table, list):
                continue

            for step in table:
                if self._validate_step(step):
                    yield step

    def _validate_step(self, step: dict) -> bool:
        """Return True when ``step`` is a dict holding every required key."""
        # The isinstance check matters: `key in "some string"` is a substring
        # test and would let malformed string entries through.
        return isinstance(step, dict) and all(k in step for k in self.REQUIRED_KEYS)

    def get_split_stats(self) -> dict:
        """Summarize the dataset.

        Returns:
            dict with ``total_steps``, ``unique_actions`` and
            ``avg_instruction_length`` (mean character count of the
            instruction text; 0.0 when there are no steps).
        """
        steps = list(self.iter_tables())
        lengths = [len(s['instructions']) for s in steps]
        return {
            'total_steps': len(steps),
            'unique_actions': len({s['actions'] for s in steps}),
            # np.mean([]) is NaN plus a RuntimeWarning; report 0.0 instead.
            'avg_instruction_length': float(np.mean(lengths)) if lengths else 0.0
        }


In [7]:
# Run this after the Drive mount cell
data_loader = PizzaDataLoader()  # defaults to /content/drive/MyDrive/train
stats = data_loader.get_split_stats()
print("📊 Dataset stats:", stats)

# Show the first three steps as a sample
for i, step in enumerate(data_loader.iter_tables()):
    print(f"\nStep {i+1}")
    print(" Instructions:", step['instructions'])
    print(" Actions:     ", step['actions'])
    print(" Input:       ", step['input'])
    print(" Output:      ", step['output'])
    if i >= 2:  # i is zero-based, so this stops after the third step
        break


✅ Using data path: /content/drive/MyDrive/train
Found 744 recipe files in train
📊 Dataset stats: {'total_steps': 3069, 'unique_actions': 168, 'avg_instruction_length': np.float64(40.33919843597263)}
Found 744 recipe files in train

Step 1
 Instructions: preheat the oven to 400f .
 Actions:      preheat
 Input:        NA
 Output:       NA

Step 2
 Instructions: heat the oil in a large non stick frying pan
 Actions:      heat
 Input:        oil
 Output:       heated_oil

Step 3
 Instructions: add the onion , pepper and zucchini
 Actions:      add
 Input:        (onion; pepper; zucchini; heated_oil)
 Output:       onion, pepper and zucchini added to heated oil


In [8]:
class CoTPromptGenerator:
    """Chain-of-Thought prompt generator for PizzaCommonSense.

    Holds a fixed system message describing the input/output prediction task
    and builds one user prompt per recipe step.
    """

    # Unicode hyphen look-alikes mapped to ASCII "-": U+2010 hyphen,
    # U+2011 non-breaking hyphen, U+2012 figure dash, U+2013 en dash.
    _DASH_TABLE = {codepoint: "-" for codepoint in range(0x2010, 0x2014)}

    def __init__(self):
        # State the task rules explicitly.
        self.system_message = (
            "You are an expert cooking‑reasoning assistant.\n"
            "For every recipe step you must predict\n"
            "  • the *input* comestibles/items that go into the action（原材料がない場合は NA）\n"
            "  • the *output* comestible/result that comes out （結果が食材でない／道具のみの場合は NA）\n\n"
            "Rules:\n"
            "1. If the step does NOT consume or transform any food, write exactly 'NA' for BOTH input and output.\n"
            "   ‑ e.g. pre‑heating an oven, washing a utensil, setting a timer.\n"
            "2. Otherwise list the food items succinctly; join multiple items with semicolons.\n"
            "3. Respond **only** with the two lines:\n"
            "     Input: <your prediction>\n"
            "     Output: <your prediction>\n"
        )

    def make_prompt(self, step: dict) -> str:
        """Create a deterministic CoT prompt for one recipe step.

        Args:
            step: Mapping with an ``"instructions"`` entry and either an
                ``"action"`` or an ``"actions"`` entry (``"action"`` wins when
                both are present).

        Returns:
            The user prompt text.

        Raises:
            KeyError: if ``"instructions"`` or both action keys are missing.
        """
        instr  = self._clean(step["instructions"])
        action = self._clean(step["action"  ] if "action" in step else step["actions"])

        return (
            f"Instruction: {instr}\n"
            f"Action: {action}\n\n"
            # Nudge the model into step-by-step reasoning.
            "Let's reason step by step.\n"
            "1️⃣ Identify whether any food or edible item is being used or produced.\n"
            "2️⃣ If none, decide Input=NA and Output=NA immediately.\n"
            "3️⃣ Otherwise, list the food that goes *into* the action (Input) and "
            "the food/result that comes *out of* the action (Output).\n\n"
            "Remember the required format:\n"
            "Input: <prediction or NA>\n"
            "Output: <prediction or NA>"
        )

    # ---------- helpers ----------
    @staticmethod
    def _clean(text: str) -> str:
        """Stringify, trim, collapse whitespace runs and normalize hyphens.

        The previous version replaced only U+2012, so the non-breaking hyphen
        (U+2011) it was meant to handle was left untouched.
        """
        text = str(text).strip()
        text = re.sub(r"\s+", " ", text)
        return text.translate(CoTPromptGenerator._DASH_TABLE)

    def get_system_message(self) -> str:
        """Return the system message built in ``__init__``."""
        return self.system_message

# Initialize prompt generator (module-level instance used by the later cells)
prompt_generator = CoTPromptGenerator()
print("✅ CoT prompt generator initialized successfully!")



✅ CoT prompt generator initialized successfully!


In [9]:
# Test prompt generation with sample data
print("🧪 Testing Prompt Generation:\n" + "="*50)
print("SYSTEM MESSAGE:")
print("="*50)
print(prompt_generator.get_system_message())

# Get a sample step for testing
# NOTE(review): PizzaDataLoader.iter_tables ignores its split argument, so "val"
# has no effect here — the step comes from the folder the loader was built with.
sample_step = next(data_loader.iter_tables("val"))
test_prompt = prompt_generator.make_prompt(sample_step)

print("\n" + "="*50)
print("SAMPLE PROMPT:")
print("="*50)
print(test_prompt)

# Reference answer stored with the step, shown for comparison with the prompt
print("\n" + "="*50)
print("GROUND TRUTH:")
print("="*50)
print(f"Input: {sample_step['input']}")
print(f"Output: {sample_step['output']}")


🧪 Testing Prompt Generation:
SYSTEM MESSAGE:
You are an expert cooking‑reasoning assistant.
For every recipe step you must predict
  • the *input* comestibles/items that go into the action（原材料がない場合は NA）
  • the *output* comestible/result that comes out （結果が食材でない／道具のみの場合は NA）

Rules:
1. If the step does NOT consume or transform any food, write exactly 'NA' for BOTH input and output.
   ‑ e.g. pre‑heating an oven, washing a utensil, setting a timer.
2. Otherwise list the food items succinctly; join multiple items with semicolons.
3. Respond **only** with the two lines:
     Input: <your prediction>
     Output: <your prediction>

Found 744 recipe files in train

SAMPLE PROMPT:
Instruction: preheat the oven to 400f .
Action: preheat

Let's reason step by step.
1️⃣ Identify whether any food or edible item is being used or produced.
2️⃣ If none, decide Input=NA and Output=NA immediately.
3️⃣ Otherwise, list the food that goes *into* the action (Input) and the food/result that comes *out of*

In [10]:
# Re-check the installed openai version before using the 0.27-style API below.
!pip show openai | grep Version


Version: 0.27.2


In [11]:
import time, re, os, openai

# In the 0.27 series the key is passed through a module-level variable
openai.api_key = os.getenv("OPENAI_API_KEY")

class GPT4Predictor:
    """Chat-completion caller written for the openai 0.27.x module-level API.

    Wraps ``openai.ChatCompletion.create`` with a bounded retry loop and
    provides a parser for the two-line ``Input:`` / ``Output:`` answer format.
    """

    # Field labels; the colon may be ASCII or full-width.
    _INPUT_LABEL = re.compile(r"\bInput\s*[:：]", re.I)
    _OUTPUT_LABEL = re.compile(r"\bOutput\s*[:：]", re.I)

    def __init__(self, model: str = "gpt-4.1-mini",
                 max_retries: int = 3, base_delay: float = 0.5):
        """Store the settings and send a one-token request as a connection test.

        Args:
            model: Model name passed to every request.
            max_retries: Number of attempts ``predict`` makes per prompt.
            base_delay: Base sleep in seconds (retry back-off and ``add_delay``).

        Any error raised by the connection test propagates to the caller.
        """
        self.model = model
        self.max_retries = max_retries
        self.base_delay  = base_delay

        # Connection test
        openai.ChatCompletion.create(
            model=self.model,
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=1,
            temperature=0
        )
        print(f"✅ API connection successful with model: {self.model}")

    # ---------- inference ----------
    def predict(self, prompt: str, system_message: str) -> str:
        """Return the stripped text of the first choice for one prompt.

        Rate-limit errors wait ``base_delay * 2**attempt`` seconds before the
        next attempt; other OpenAI errors wait ``base_delay`` seconds.

        Raises:
            RuntimeError: when all ``max_retries`` attempts failed; the last
                API error is attached as ``__cause__``.
        """
        last_error = None
        for i in range(self.max_retries):
            try:
                resp = openai.ChatCompletion.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": system_message},
                        {"role": "user",   "content": prompt}
                    ],
                    temperature=0,
                    max_tokens=200
                )
                # Guard against a missing/None content field.
                return (resp.choices[0].message["content"] or "").strip()

            except openai.error.RateLimitError as e:
                last_error = e
                wait = self.base_delay * (2 ** i)
                print(f"⏳ Rate‑limit, retrying in {wait}s …")

            except openai.error.OpenAIError as e:
                last_error = e
                wait = self.base_delay
                print(f"❌ API error on attempt {i+1}: {e}")

            # Sleeping after the final attempt would only delay the failure.
            if i < self.max_retries - 1:
                time.sleep(wait)

        raise RuntimeError("Failed to get response after retries") from last_error

    # ---------- response parsing ----------
    @staticmethod
    def parse_io(text: str) -> tuple[str, str]:
        """Extract the ``Input:`` and ``Output:`` values from a model response.

        For each label the first occurrence is used and the value is the first
        non-empty line that follows it. The input value additionally ends where
        the next ``Output:`` label begins, so both fields may share one line.
        Surrounding whitespace and markdown ``*`` markers are removed.

        Compared with the earlier regex pair this fixes two cases: an empty
        input no longer swallows the following ``Output: ...`` line, and an
        input that merely contains the word "output" is no longer truncated
        (only a label followed by a colon ends the input).

        Returns:
            ``(input_value, output_value)``; a missing or empty field is ``""``.
        """
        text = text or ""

        def first_value(segment: str) -> str:
            # First non-empty line of the segment, without markdown emphasis.
            for line in segment.splitlines():
                value = line.strip().strip("*").strip()
                if value:
                    return value
            return ""

        pred_in = pred_out = ""

        in_match = GPT4Predictor._INPUT_LABEL.search(text)
        if in_match:
            stop = GPT4Predictor._OUTPUT_LABEL.search(text, in_match.end())
            end = stop.start() if stop else len(text)
            pred_in = first_value(text[in_match.end():end])

        out_match = GPT4Predictor._OUTPUT_LABEL.search(text)
        if out_match:
            pred_out = first_value(text[out_match.end():])

        return pred_in, pred_out

    def add_delay(self):
        """Sleep ``base_delay`` seconds (pacing between consecutive requests)."""
        time.sleep(self.base_delay)

# -------- Test --------
# Constructing the predictor sends the one-token "ping" request from __init__.
predictor = GPT4Predictor()
print("✅ GPT4Predictor is ready!")



✅ API connection successful with model: gpt-4.1-mini
✅ GPT4Predictor is ready!


In [12]:
# Test GPT-4 prediction with sample data
print("🧪 Testing GPT-4 Prediction:")

# Use the same sample step from before
# (sample_step is defined in the prompt-generation test cell, which must run first)
test_prompt = prompt_generator.make_prompt(sample_step)
system_msg = prompt_generator.get_system_message()

print("\n⏳ Generating prediction...")
response = predictor.predict(test_prompt, system_msg)

print("\n" + "="*50)
print("MODEL RESPONSE:")
print("="*50)
print(response)

# Parse the response
pred_input, pred_output = predictor.parse_io(response)

print("\n" + "="*50)
print("PARSED PREDICTIONS:")
print("="*50)
print(f"Predicted Input: '{pred_input}'")
print(f"Predicted Output: '{pred_output}'")

print("\n" + "="*50)
print("GROUND TRUTH COMPARISON:")
print("="*50)
print(f"Ground Truth Input:  '{sample_step['input']}'")
print(f"Ground Truth Output: '{sample_step['output']}'")

# Check exact matches (case-insensitive, surrounding whitespace ignored)
input_match = pred_input.lower().strip() == sample_step['input'].lower().strip()
output_match = pred_output.lower().strip() == sample_step['output'].lower().strip()

print(f"\n📊 Exact Match Results:")
print(f"Input Match: {'✅' if input_match else '❌'}")
print(f"Output Match: {'✅' if output_match else '❌'}")


🧪 Testing GPT-4 Prediction:

⏳ Generating prediction...

MODEL RESPONSE:
Input: NA
Output: NA

PARSED PREDICTIONS:
Predicted Input: 'NA'
Predicted Output: 'NA'

GROUND TRUTH COMPARISON:
Ground Truth Input:  'NA'
Ground Truth Output: 'NA'

📊 Exact Match Results:
Input Match: ✅
Output Match: ✅


In [13]:
def run_batch_predictions(split: str = "val", max_samples: int = None, save_interval: int = 50):
    """Run batch predictions on the dataset with progress tracking.

    Uses the notebook-level ``data_loader``, ``prompt_generator`` and
    ``predictor`` objects. Every step produces exactly one result row; a step
    whose prompt, prediction or parsing raises gets empty predictions and an
    ``ERROR: ...`` response so rows stay aligned with the input steps.

    Args:
        split: Forwarded to ``data_loader.iter_tables`` and shown in the log.
            (The ``PizzaDataLoader`` in this notebook ignores it.)
        max_samples: Maximum number of samples to process (None for all,
            0 for none).
        save_interval: Write ``temp_predictions_<n>.csv`` every N samples;
            0 or None disables the periodic save.

    Returns:
        pd.DataFrame: Results with predictions and ground truth
    """
    print(f"🚀 Starting batch prediction on {split} split...")

    # Collect all steps first to get total count
    all_steps = list(data_loader.iter_tables(split))

    # `is not None` so that max_samples=0 means "no samples", not "all samples".
    if max_samples is not None:
        all_steps = all_steps[:max_samples]

    print(f"📊 Processing {len(all_steps)} recipe steps")

    truth_keys = ('instructions', 'actions', 'input', 'output')
    # Fixed column list so an empty run still yields a frame with the schema.
    columns = list(truth_keys) + ['pred_input', 'pred_output', 'response']

    results = []
    system_msg = prompt_generator.get_system_message()

    # Process with progress bar
    for i, step in enumerate(tqdm(all_steps, desc="Generating predictions")):
        result = {key: step[key] for key in truth_keys}

        # Only the model interaction is guarded. Keeping the bookkeeping below
        # outside this try prevents a late failure (e.g. while checkpointing)
        # from adding a second, placeholder row for a step already recorded.
        try:
            prompt = prompt_generator.make_prompt(step)
            response = predictor.predict(prompt, system_msg)
            pred_input, pred_output = predictor.parse_io(response)
            succeeded = True
        except Exception as e:
            print(f"❌ Error processing step {i+1}: {e}")
            pred_input, pred_output = '', ''
            response = f'ERROR: {str(e)}'
            succeeded = False

        result['pred_input'] = pred_input
        result['pred_output'] = pred_output
        result['response'] = response  # Keep full response for debugging
        results.append(result)

        # Add delay for rate limiting (a failed step already waited in predict)
        if succeeded:
            predictor.add_delay()

        # Periodic saving; best-effort, a failed write must not lose the run.
        if save_interval and (i + 1) % save_interval == 0:
            try:
                pd.DataFrame(results, columns=columns).to_csv(
                    f"temp_predictions_{i+1}.csv", index=False)
                print(f"💾 Saved temporary results at step {i+1}")
            except OSError as e:
                print(f"⚠️ Could not save temporary results at step {i+1}: {e}")

    # Convert to DataFrame
    df = pd.DataFrame(results, columns=columns)

    n_ok = int((df['pred_input'] != '').sum())
    ok_rate = n_ok / len(df) if len(df) else 0.0

    print(f"✅ Batch processing complete! Processed {len(df)} steps")
    print(f"📊 Success rate: {n_ok}/{len(df)} ({ok_rate:.1%})")

    return df

# Confirmation only: the function is defined here, nothing has been run yet.
print("✅ Batch processing function ready!")


✅ Batch processing function ready!


In [14]:
# Run a small test first (10 samples)
# NOTE(review): the split argument is forwarded to PizzaDataLoader.iter_tables,
# which ignores it, so these are the first 10 steps of the loader's folder.
print("🧪 Running small test with 10 samples...")
test_df = run_batch_predictions(split="val", max_samples=10)

# Display sample results
print("\n📋 Sample Results:")
display_cols = ['instructions', 'actions', 'input', 'output', 'pred_input', 'pred_output']
print(test_df[display_cols].head())

# Quick evaluation: exact match after lower-casing and trimming both sides
input_matches = (test_df['input'].str.lower().str.strip() ==
                test_df['pred_input'].str.lower().str.strip()).sum()
output_matches = (test_df['output'].str.lower().str.strip() ==
                 test_df['pred_output'].str.lower().str.strip()).sum()

# The ratios below divide by len(test_df), so they assume at least one row.
print(f"\n📊 Quick Test Results:")
print(f"Input EMA: {input_matches}/{len(test_df)} ({input_matches/len(test_df):.1%})")
print(f"Output EMA: {output_matches}/{len(test_df)} ({output_matches/len(test_df):.1%})")
print(f"Average EMA: {(input_matches + output_matches)/(2*len(test_df)):.1%}")


🧪 Running small test with 10 samples...
🚀 Starting batch prediction on val split...
Found 744 recipe files in train
📊 Processing 10 recipe steps


Generating predictions: 100%|██████████| 10/10 [00:09<00:00,  1.00it/s]

✅ Batch processing complete! Processed 10 steps
📊 Success rate: 10/10 (100.0%)

📋 Sample Results:
                                   instructions  actions  \
0                    preheat the oven to 400f .  preheat   
1  heat the oil in a large non stick frying pan     heat   
2           add the onion , pepper and zucchini      add   
3        saute over a medium heat for 4 5mins .    saute   
4                                 add the herbs      add   

                                   input  \
0                                     NA   
1                                    oil   
2  (onion; pepper; zucchini; heated_oil)   
3               vegetables in heated_oil   
4     (herbs; sauteed vegetable mixture)   

                                           output               pred_input  \
0                                              NA                       NA   
1                                      heated_oil                      oil   
2  onion, pepper and zucchini added to hea




In [15]:
# Additional cell: install bert_score, transformers and torch
# (presumably needed by the "bertscore" metric loaded in the next cell — confirm)
!pip -q install bert_score transformers torch


In [16]:
# ------------------------------------------------------------
# Run a small test first (10 samples)
# ------------------------------------------------------------
print("🧪 Running small test with 10 samples...")
test_df = run_batch_predictions(split="val", max_samples=10)

# ------------- Result preview -------------
print("\n📋 Sample Results:")
display_cols = ['instructions', 'actions', 'input', 'output',
                'pred_input', 'pred_output']
print(test_df[display_cols].head())

# ------------- Exact-Match Accuracy (EMA) -------------
# Exact match after lower-casing and trimming both sides.
input_matches = (
    test_df['input'].str.lower().str.strip() ==
    test_df['pred_input'].str.lower().str.strip()
).sum()

output_matches = (
    test_df['output'].str.lower().str.strip() ==
    test_df['pred_output'].str.lower().str.strip()
).sum()

# ------------- BERTScore (computed directly) -------------
import evaluate, numpy as np
bertscore = evaluate.load("bertscore")

# Replace missing values with empty strings before scoring
pred_inputs  = test_df['pred_input'].fillna("").tolist()
true_inputs  = test_df['input'].fillna("").tolist()
pred_outputs = test_df['pred_output'].fillna("").tolist()
true_outputs = test_df['output'].fillna("").tolist()

bert_in  = bertscore.compute(predictions=pred_inputs,  references=true_inputs,  lang="en")['f1']
bert_out = bertscore.compute(predictions=pred_outputs, references=true_outputs, lang="en")['f1']
# (A stray `!pip -q install bert_score transformers torch` used to sit here,
# after the metric had already been loaded and used. The dedicated install
# cell above covers it, so it is not repeated in this cell.)

# ------------- Summary -------------
print(f"\n📊 Quick Test Results:")
print(f"Input EMA:   {input_matches}/{len(test_df)} "
      f"({input_matches/len(test_df):.1%})")
print(f"Output EMA:  {output_matches}/{len(test_df)} "
      f"({output_matches/len(test_df):.1%})")
print(f"Average EMA: {(input_matches + output_matches)/(2*len(test_df)):.1%}")
print(f"BERTScore Input F1:  {np.mean(bert_in):.3f}")
print(f"BERTScore Output F1: {np.mean(bert_out):.3f}")


🧪 Running small test with 10 samples...
🚀 Starting batch prediction on val split...
Found 744 recipe files in train
📊 Processing 10 recipe steps


Generating predictions: 100%|██████████| 10/10 [00:09<00:00,  1.05it/s]


✅ Batch processing complete! Processed 10 steps
📊 Success rate: 10/10 (100.0%)

📋 Sample Results:
                                   instructions  actions  \
0                    preheat the oven to 400f .  preheat   
1  heat the oil in a large non stick frying pan     heat   
2           add the onion , pepper and zucchini      add   
3        saute over a medium heat for 4 5mins .    saute   
4                                 add the herbs      add   

                                   input  \
0                                     NA   
1                                    oil   
2  (onion; pepper; zucchini; heated_oil)   
3               vegetables in heated_oil   
4     (herbs; sauteed vegetable mixture)   

                                           output               pred_input  \
0                                              NA                       NA   
1                                      heated_oil                      oil   
2  onion, pepper and zucchini added to hea

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📊 Quick Test Results:
Input EMA:   2/10 (20.0%)
Output EMA:  1/10 (10.0%)
Average EMA: 15.0%
BERTScore Input F1:  0.884
BERTScore Output F1: 0.864


In [18]:
# Run full batch processing.
# WARNING: This processes every step the loader returns and may take 30-60 minutes
# and cost several dollars in API calls
# NOTE(review): the print() lines below still tell the reader to uncomment the
# run, but the run at the bottom of this cell is already active — executing the
# cell starts it. The printed text should be updated or the run put behind a flag.

print("⚠️  Ready to run full batch processing on validation split")
print("💰 Estimated cost: $3-10 depending on dataset size")
print("⏱️  Estimated time: 30-60 minutes")
print("\n🔧 To run full processing, uncomment the lines below:")
print("# full_df = run_batch_predictions(split='val')")
print("# full_df.to_csv('gpt4_predictions_full.csv', index=False)")
print("# print('💾 Full results saved to gpt4_predictions_full.csv')")

# Full experiment run (active): predict every step and save the results to CSV.
full_df = run_batch_predictions(split="val")
full_df.to_csv("gpt4_predictions_full.csv", index=False)
print("💾 Full results saved to gpt4_predictions_full.csv")


⚠️  Ready to run full batch processing on validation split
💰 Estimated cost: $3-10 depending on dataset size
⏱️  Estimated time: 30-60 minutes

🔧 To run full processing, uncomment the lines below:
# full_df = run_batch_predictions(split='val')
# full_df.to_csv('gpt4_predictions_full.csv', index=False)
# print('💾 Full results saved to gpt4_predictions_full.csv')
🚀 Starting batch prediction on val split...
Found 744 recipe files in train
📊 Processing 3069 recipe steps


Generating predictions:   2%|▏         | 50/3069 [00:47<46:11,  1.09it/s]

💾 Saved temporary results at step 50


Generating predictions:   3%|▎         | 100/3069 [01:48<1:39:08,  2.00s/it]

💾 Saved temporary results at step 100


Generating predictions:   5%|▍         | 150/3069 [02:39<50:37,  1.04s/it]

💾 Saved temporary results at step 150


Generating predictions:   7%|▋         | 200/3069 [03:27<46:47,  1.02it/s]

💾 Saved temporary results at step 200


Generating predictions:   8%|▊         | 250/3069 [04:18<47:01,  1.00s/it]

💾 Saved temporary results at step 250


Generating predictions:  10%|▉         | 300/3069 [05:09<47:42,  1.03s/it]

💾 Saved temporary results at step 300


Generating predictions:  11%|█▏        | 350/3069 [05:59<43:00,  1.05it/s]

💾 Saved temporary results at step 350


Generating predictions:  13%|█▎        | 400/3069 [06:53<59:27,  1.34s/it]  

💾 Saved temporary results at step 400


Generating predictions:  15%|█▍        | 450/3069 [07:41<42:45,  1.02it/s]

💾 Saved temporary results at step 450


Generating predictions:  16%|█▋        | 500/3069 [08:34<39:06,  1.09it/s]

💾 Saved temporary results at step 500


Generating predictions:  18%|█▊        | 550/3069 [09:24<38:12,  1.10it/s]

💾 Saved temporary results at step 550


Generating predictions:  20%|█▉        | 600/3069 [10:09<38:46,  1.06it/s]

💾 Saved temporary results at step 600


Generating predictions:  21%|██        | 650/3069 [11:02<37:10,  1.08it/s]

💾 Saved temporary results at step 650


Generating predictions:  23%|██▎       | 700/3069 [11:53<34:58,  1.13it/s]

💾 Saved temporary results at step 700


Generating predictions:  24%|██▍       | 750/3069 [12:44<42:30,  1.10s/it]

💾 Saved temporary results at step 750


Generating predictions:  26%|██▌       | 800/3069 [13:37<47:39,  1.26s/it]

💾 Saved temporary results at step 800


Generating predictions:  28%|██▊       | 850/3069 [14:29<36:44,  1.01it/s]

💾 Saved temporary results at step 850


Generating predictions:  29%|██▉       | 900/3069 [15:25<35:42,  1.01it/s]

💾 Saved temporary results at step 900


Generating predictions:  31%|███       | 950/3069 [16:22<36:09,  1.02s/it]

💾 Saved temporary results at step 950


Generating predictions:  33%|███▎      | 1000/3069 [17:14<30:21,  1.14it/s]

💾 Saved temporary results at step 1000


Generating predictions:  34%|███▍      | 1050/3069 [18:07<41:45,  1.24s/it]

💾 Saved temporary results at step 1050


Generating predictions:  36%|███▌      | 1100/3069 [18:58<36:42,  1.12s/it]

💾 Saved temporary results at step 1100


Generating predictions:  37%|███▋      | 1150/3069 [19:46<29:38,  1.08it/s]

💾 Saved temporary results at step 1150


Generating predictions:  39%|███▉      | 1200/3069 [20:41<29:51,  1.04it/s]

💾 Saved temporary results at step 1200


Generating predictions:  41%|████      | 1250/3069 [21:41<27:13,  1.11it/s]

💾 Saved temporary results at step 1250


Generating predictions:  42%|████▏     | 1300/3069 [22:29<24:26,  1.21it/s]

💾 Saved temporary results at step 1300


Generating predictions:  44%|████▍     | 1350/3069 [23:17<28:18,  1.01it/s]

💾 Saved temporary results at step 1350


Generating predictions:  46%|████▌     | 1400/3069 [24:19<31:00,  1.12s/it]

💾 Saved temporary results at step 1400


Generating predictions:  47%|████▋     | 1450/3069 [25:07<29:03,  1.08s/it]

💾 Saved temporary results at step 1450


Generating predictions:  49%|████▉     | 1500/3069 [25:59<25:14,  1.04it/s]

💾 Saved temporary results at step 1500


Generating predictions:  51%|█████     | 1550/3069 [26:49<24:40,  1.03it/s]

💾 Saved temporary results at step 1550


Generating predictions:  52%|█████▏    | 1600/3069 [27:43<28:24,  1.16s/it]

💾 Saved temporary results at step 1600


Generating predictions:  54%|█████▍    | 1650/3069 [28:35<22:13,  1.06it/s]

💾 Saved temporary results at step 1650


Generating predictions:  55%|█████▌    | 1700/3069 [29:24<22:48,  1.00it/s]

💾 Saved temporary results at step 1700


Generating predictions:  57%|█████▋    | 1750/3069 [30:16<23:15,  1.06s/it]

💾 Saved temporary results at step 1750


Generating predictions:  59%|█████▊    | 1800/3069 [31:05<20:27,  1.03it/s]

💾 Saved temporary results at step 1800


Generating predictions:  60%|██████    | 1850/3069 [31:55<23:29,  1.16s/it]

💾 Saved temporary results at step 1850


Generating predictions:  62%|██████▏   | 1900/3069 [32:45<18:35,  1.05it/s]

💾 Saved temporary results at step 1900


Generating predictions:  64%|██████▎   | 1950/3069 [33:39<17:24,  1.07it/s]

💾 Saved temporary results at step 1950


Generating predictions:  65%|██████▌   | 2000/3069 [34:30<16:04,  1.11it/s]

💾 Saved temporary results at step 2000


Generating predictions:  67%|██████▋   | 2050/3069 [35:19<16:10,  1.05it/s]

💾 Saved temporary results at step 2050


Generating predictions:  68%|██████▊   | 2100/3069 [36:07<17:37,  1.09s/it]

💾 Saved temporary results at step 2100


Generating predictions:  70%|███████   | 2150/3069 [36:59<16:05,  1.05s/it]

💾 Saved temporary results at step 2150


Generating predictions:  72%|███████▏  | 2200/3069 [37:51<12:58,  1.12it/s]

💾 Saved temporary results at step 2200


Generating predictions:  73%|███████▎  | 2250/3069 [38:39<13:00,  1.05it/s]

💾 Saved temporary results at step 2250


Generating predictions:  75%|███████▍  | 2300/3069 [39:31<12:04,  1.06it/s]

💾 Saved temporary results at step 2300


Generating predictions:  77%|███████▋  | 2350/3069 [40:20<12:53,  1.08s/it]

💾 Saved temporary results at step 2350


Generating predictions:  78%|███████▊  | 2400/3069 [41:14<11:18,  1.01s/it]

💾 Saved temporary results at step 2400


Generating predictions:  80%|███████▉  | 2450/3069 [42:06<19:44,  1.91s/it]

💾 Saved temporary results at step 2450


Generating predictions:  81%|████████▏ | 2500/3069 [42:55<09:19,  1.02it/s]

💾 Saved temporary results at step 2500


Generating predictions:  83%|████████▎ | 2550/3069 [43:44<08:01,  1.08it/s]

💾 Saved temporary results at step 2550


Generating predictions:  85%|████████▍ | 2600/3069 [44:35<06:55,  1.13it/s]

💾 Saved temporary results at step 2600


Generating predictions:  86%|████████▋ | 2650/3069 [45:22<06:20,  1.10it/s]

💾 Saved temporary results at step 2650


Generating predictions:  88%|████████▊ | 2700/3069 [46:08<05:35,  1.10it/s]

💾 Saved temporary results at step 2700


Generating predictions:  90%|████████▉ | 2750/3069 [47:00<04:48,  1.11it/s]

💾 Saved temporary results at step 2750


Generating predictions:  91%|█████████ | 2800/3069 [47:53<04:10,  1.07it/s]

💾 Saved temporary results at step 2800


Generating predictions:  93%|█████████▎| 2850/3069 [48:43<03:15,  1.12it/s]

💾 Saved temporary results at step 2850


Generating predictions:  94%|█████████▍| 2900/3069 [49:36<02:40,  1.05it/s]

💾 Saved temporary results at step 2900


Generating predictions:  96%|█████████▌| 2950/3069 [50:26<02:08,  1.08s/it]

💾 Saved temporary results at step 2950


Generating predictions:  98%|█████████▊| 3000/3069 [51:15<01:08,  1.01it/s]

💾 Saved temporary results at step 3000


Generating predictions:  99%|█████████▉| 3050/3069 [52:05<00:19,  1.05s/it]

💾 Saved temporary results at step 3050


Generating predictions: 100%|██████████| 3069/3069 [52:25<00:00,  1.03s/it]

✅ Batch processing complete! Processed 3069 steps
📊 Success rate: 3069/3069 (100.0%)
💾 Full results saved to gpt4_predictions_full.csv





In [19]:
class MetricsCalculator:
    """Computes evaluation metrics (exact match, Rouge-L, BERTScore) for model predictions.

    Every metric first coerces its inputs to text with ``_to_text``, so missing
    values (``None``, NaN, ``pd.NA``) and non-string scalars are handled the same
    way everywhere instead of reaching the metric backends as raw objects.
    """

    # Normalized strings that count as "no value" when comparing for exact match
    _EMPTY_TOKENS = frozenset({'na', 'n/a', 'none', ''})

    def __init__(self):
        # Load the `evaluate` metrics once so they are reused across calls
        self.rouge = evaluate.load("rouge")
        self.bertscore = evaluate.load("bertscore")

    @staticmethod
    def _to_text(value) -> str:
        """Coerce one prediction/reference to text.

        Args:
            value: A string, a missing value, or any other object

        Returns:
            str: ``value`` unchanged if it is already a string, ``''`` for
            missing scalars (``None``, NaN, ``pd.NA``), otherwise ``str(value)``
        """
        if isinstance(value, str):
            return value
        # Only ask pd.isna about scalars; for containers it returns an array
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    def calculate_ema(self, predictions: List[str], references: List[str]) -> float:
        """Calculate Exact Match Accuracy with normalization.

        Args:
            predictions: List of predicted strings
            references: List of ground truth strings

        Returns:
            float: EMA score (0.0 to 1.0); 0.0 when both lists are empty

        Raises:
            ValueError: If the two lists differ in length
        """
        if len(predictions) != len(references):
            raise ValueError("Predictions and references must have same length")

        if not predictions:
            return 0.0

        # A pair matches when both sides are equal after normalization
        matches = sum(
            self._normalize_string(pred) == self._normalize_string(ref)
            for pred, ref in zip(predictions, references)
        )
        return matches / len(predictions)

    def calculate_rouge_l(self, predictions: List[str], references: List[str]) -> float:
        """Calculate Rouge-L score.

        Args:
            predictions: List of predicted strings
            references: List of ground truth strings

        Returns:
            float: The ``rougeL`` value reported by the rouge metric, or 0.0 if
            either list is empty or the metric computation raises
        """
        if not predictions or not references:
            return 0.0

        # Missing values become '' and non-strings are stringified, so a stray
        # NaN cannot make the whole computation fail and collapse to 0.0
        clean_predictions = [self._to_text(pred) for pred in predictions]
        clean_references = [self._to_text(ref) for ref in references]

        try:
            results = self.rouge.compute(
                predictions=clean_predictions,
                references=clean_references
            )
            return float(results["rougeL"])
        except Exception as e:
            # Best-effort: report the problem and fall back to a zero score
            print(f"⚠️ Rouge-L calculation error: {e}")
            return 0.0

    def calculate_bertscore(self, predictions: List[str], references: List[str]) -> float:
        """Calculate BERTScore F1.

        Args:
            predictions: List of predicted strings
            references: List of ground truth strings

        Returns:
            float: Mean of the per-sample ``f1`` values, or 0.0 if either list
            is empty or the metric computation raises
        """
        if not predictions or not references:
            return 0.0

        # Empty/missing entries are replaced by the placeholder word "empty"
        clean_predictions = [self._to_text(pred) or "empty" for pred in predictions]
        clean_references = [self._to_text(ref) or "empty" for ref in references]

        try:
            results = self.bertscore.compute(
                predictions=clean_predictions,
                references=clean_references,
                lang="en"
            )
            # float() keeps the declared return type (np.mean yields a NumPy scalar)
            return float(np.mean(results["f1"]))
        except Exception as e:
            # Best-effort: report the problem and fall back to a zero score
            print(f"⚠️ BERTScore calculation error: {e}")
            return 0.0

    def _normalize_string(self, text: str) -> str:
        """Normalize a value for exact-match comparison.

        Lowercases, strips, and collapses whitespace runs to a single space.
        Missing values and the tokens 'na', 'n/a', 'none' all normalize to ''.
        """
        text = self._to_text(text).lower().strip()

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        if text in self._EMPTY_TOKENS:
            return ''

        return text

    def evaluate_predictions(self, df: pd.DataFrame) -> Dict:
        """Comprehensive evaluation of predictions.

        Args:
            df: DataFrame with the columns 'input', 'output', 'pred_input'
                and 'pred_output'

        Returns:
            Dict: EMA (input, output, average), Rouge-L (input, output),
            BERTScore F1 on outputs, and the number of rows evaluated
        """
        print("📊 Calculating evaluation metrics...")

        # Extract predictions and references
        pred_inputs = df['pred_input'].tolist()
        true_inputs = df['input'].tolist()
        pred_outputs = df['pred_output'].tolist()
        true_outputs = df['output'].tolist()

        # Calculate EMA
        ema_input = self.calculate_ema(pred_inputs, true_inputs)
        ema_output = self.calculate_ema(pred_outputs, true_outputs)
        ema_avg = (ema_input + ema_output) / 2

        # Calculate Rouge-L
        rouge_input = self.calculate_rouge_l(pred_inputs, true_inputs)
        rouge_output = self.calculate_rouge_l(pred_outputs, true_outputs)

        # Calculate BERTScore (focus on outputs as they're more complex)
        bertscore_output = self.calculate_bertscore(pred_outputs, true_outputs)

        metrics = {
            'ema_input': ema_input,
            'ema_output': ema_output,
            'ema_average': ema_avg,
            'rouge_l_input': rouge_input,
            'rouge_l_output': rouge_output,
            'bertscore_f1': bertscore_output,
            'total_samples': len(df)
        }

        return metrics

# Initialize the shared metrics calculator used by the evaluation cells below.
# Construction loads the "rouge" and "bertscore" metrics via evaluate.load.
metrics_calc = MetricsCalculator()
print("✅ Metrics calculator initialized successfully!")


✅ Metrics calculator initialized successfully!


In [20]:
# Evaluate the small test run and compare it with the paper's reported numbers.
print("📊 Evaluating test results...")
test_metrics = metrics_calc.evaluate_predictions(test_df)

# Report the real sample count instead of a hard-coded number, so the text
# stays correct if test_df is regenerated with a different size.
n_test_samples = test_metrics['total_samples']

# Display results
print("\n" + "="*60)
print(f"TEST RESULTS ({n_test_samples} samples)")
print("="*60)
print(f"EMA Input:        {test_metrics['ema_input']:.1%}")
print(f"EMA Output:       {test_metrics['ema_output']:.1%}")
print(f"EMA Average:      {test_metrics['ema_average']:.1%}")
print(f"Rouge-L Input:    {test_metrics['rouge_l_input']:.3f}")
print(f"Rouge-L Output:   {test_metrics['rouge_l_output']:.3f}")
print(f"BERTScore F1:     {test_metrics['bertscore_f1']:.3f}")
print(f"Total Samples:    {n_test_samples}")

# Compare with paper benchmarks (keys match the entries of test_metrics)
paper_benchmarks = {
    'ema_average': 0.267,  # 26.7%
    'rouge_l_input': 0.514,  # 51.4
    'rouge_l_output': 0.523  # 52.3
}

print("\n" + "="*60)
print("COMPARISON WITH PAPER BENCHMARKS")
print("="*60)
print(f"{'Metric':<20} {'Test':<10} {'Paper':<10} {'Status':<10}")
print("-" * 60)

for metric, paper_value in paper_benchmarks.items():
    test_value = test_metrics[metric]
    # Matching the paper exactly counts as meeting the benchmark
    status = "✅ BETTER" if test_value >= paper_value else "❌ LOWER"
    print(f"{metric:<20} {test_value:<10.3f} {paper_value:<10.3f} {status}")

print(f"\n⚠️  Note: These are results on only {n_test_samples} test samples.")
print("📈 Run full evaluation for meaningful comparison with paper.")


📊 Evaluating test results...
📊 Calculating evaluation metrics...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



TEST RESULTS (10 samples)
EMA Input:        20.0%
EMA Output:       10.0%
EMA Average:      15.0%
Rouge-L Input:    0.457
Rouge-L Output:   0.378
BERTScore F1:     0.864
Total Samples:    10

COMPARISON WITH PAPER BENCHMARKS
Metric               Test       Paper      Status    
------------------------------------------------------------
ema_average          0.150      0.267      ❌ LOWER
rouge_l_input        0.457      0.514      ❌ LOWER
rouge_l_output       0.378      0.523      ❌ LOWER

⚠️  Note: These are results on only 10 test samples.
📈 Run full evaluation for meaningful comparison with paper.


In [21]:
def evaluate_full_results(csv_path: str = "gpt4_predictions_full.csv"):
    """Score a saved prediction file and print a comparison against the paper.

    Reads the CSV, computes metrics through the module-level ``metrics_calc``,
    prints a report, and writes the metrics to ``final_metrics.json``.

    Args:
        csv_path: Path to the CSV file with full predictions

    Returns:
        The metrics dict on success; ``None`` if the file is missing or any
        other error occurs (the error is printed, not raised).
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    try:
        # keep_default_na=False: literal answers such as "NA" stay as text
        results_df = pd.read_csv(csv_path, keep_default_na=False)
        print(f"📁 Loaded {len(results_df)} predictions from {csv_path}")

        text_cols = ['input', 'output', 'pred_input', 'pred_output']
        results_df[text_cols] = results_df[text_cols].fillna('').astype(str)

        metrics = metrics_calc.evaluate_predictions(results_df)

        # (label, our value, paper value or None, format spec) per report row
        comparison_rows = [
            ('EMA Average', metrics['ema_average'], 0.267, '.1%'),
            ('Rouge-L Input', metrics['rouge_l_input'], 0.514, '.3f'),
            ('Rouge-L Output', metrics['rouge_l_output'], 0.523, '.3f'),
            ('BERTScore F1', metrics['bertscore_f1'], None, '.3f'),
        ]

        # --- Report header ---
        print("\n" + heavy_rule)
        print("🎯 FINAL RESULTS - PAPER REPRODUCTION")
        print(heavy_rule)

        print(f"📊 Dataset: PizzaCommonSense validation split ({metrics['total_samples']} samples)")
        print("🤖 Model: GPT-4 Turbo (gpt-4o-mini) with Chain-of-Thought")
        print(f"📅 Evaluation Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}")

        print("\n" + light_rule)
        print(f"{'Metric':<20} {'Our Result':<15} {'Paper Benchmark':<18} {'Status':<15} {'Difference'}")
        print(light_rule)

        # --- One line per metric; count those that reach the paper value ---
        reached_count = 0
        comparable_count = 0

        for label, ours, paper, fmt in comparison_rows:
            if paper is None:
                # No published number to compare against
                verdict = "📊 NEW METRIC"
                ours_text = f"{ours:{fmt}}"
                paper_text = "N/A"
                delta_text = "N/A"
            else:
                comparable_count += 1
                delta = ours - paper
                reached = ours >= paper
                verdict = "✅ BETTER" if reached else "❌ LOWER"
                if reached:
                    reached_count += 1

                ours_text = f"{ours:{fmt}}"
                paper_text = f"{paper:{fmt}}"
                delta_text = f"{delta:+{fmt}}"

            print(f"{label:<20} {ours_text:<15} {paper_text:<18} {verdict:<15} {delta_text}")

        # --- Summary ---
        print("\n" + heavy_rule)
        print("🏆 REPRODUCTION SUCCESS SUMMARY")
        print(heavy_rule)

        if comparable_count > 0:
            success_rate = reached_count / comparable_count
        else:
            success_rate = 0
        print(f"✅ Metrics meeting/exceeding paper: {reached_count}/{comparable_count} ({success_rate:.1%})")

        if reached_count > 0:
            print("🎉 SUCCESS: At least one metric meets the paper benchmark!")
            print("📈 Reproduction experiment successful!")
        else:
            print("⚠️  No metrics exceed paper benchmarks")
            print("🔍 Consider adjusting prompts or trying different techniques")

        # --- Per-metric breakdown ---
        print("\n" + light_rule)
        print("📋 DETAILED BREAKDOWN")
        print(light_rule)
        print(f"EMA Input:        {metrics['ema_input']:.1%}")
        print(f"EMA Output:       {metrics['ema_output']:.1%}")
        print(f"EMA Average:      {metrics['ema_average']:.1%} (Target: 26.7%)")
        print(f"Rouge-L Input:    {metrics['rouge_l_input']:.3f} (Target: 0.514)")
        print(f"Rouge-L Output:   {metrics['rouge_l_output']:.3f} (Target: 0.523)")
        print(f"BERTScore F1:     {metrics['bertscore_f1']:.3f} (New metric)")

        # --- Persist the metrics next to the notebook ---
        metrics_file = "final_metrics.json"
        with open(metrics_file, 'w') as out_file:
            json.dump(metrics, out_file, indent=2)
        print(f"\n💾 Detailed metrics saved to {metrics_file}")

        return metrics

    except FileNotFoundError:
        print(f"❌ File {csv_path} not found. Run full batch processing first.")
        return None
    except Exception as e:
        # Best-effort cell: report any other failure instead of raising
        print(f"❌ Error evaluating results: {e}")
        return None

# Confirmation that the cell ran and evaluate_full_results is now defined
print("✅ Full evaluation function ready!")


✅ Full evaluation function ready!


In [22]:
# Run the full evaluation on the predictions written by the batch-processing step.
# evaluate_full_results prints its own report and returns None when the CSV is
# missing or evaluation fails, so no extra error handling is needed here.
print("🎯 Evaluating full results...")
final_metrics = evaluate_full_results("gpt4_predictions_full.csv")


🎯 Ready to evaluate full results

🔧 To run full evaluation, uncomment the line below:
# final_metrics = evaluate_full_results('gpt4_predictions_full.csv')
📁 Loaded 3069 predictions from gpt4_predictions_full.csv
📊 Calculating evaluation metrics...

🎯 FINAL RESULTS - PAPER REPRODUCTION
📊 Dataset: PizzaCommonSense validation split (3069 samples)
🤖 Model: GPT-4 Turbo (gpt-4o-mini) with Chain-of-Thought
📅 Evaluation Date: 2025-07-21 17:54

--------------------------------------------------------------------------------
Metric               Our Result      Paper Benchmark    Status          Difference
--------------------------------------------------------------------------------
EMA Average          16.6%           26.7%              ❌ LOWER         -10.1%
Rouge-L Input        0.446           0.514              ❌ LOWER         -0.068
Rouge-L Output       0.377           0.523              ❌ LOWER         -0.146
BERTScore F1         0.867           N/A                📊 NEW METRIC    N/A

🏆