In [None]:
!pip install datasets
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... done
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=2ead40fcd1683c6075313a9769af311ba4f0939bffac4811565c006d8e15721a
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import os
import time

import numpy as np
import openai
from datasets import load_dataset
from rouge_score import rouge_scorer

# OpenAI API key: read from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook. The placeholder is only used when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your-openai-api-key")

# Initialize OpenAI client
client = openai.Client(api_key=OPENAI_API_KEY)

# Load the test split of the evaluation dataset
dataset_name = "lamini/taylor_swift"
test_dataset = load_dataset(dataset_name, split="test")

# Instruction text that is prepended to every user prompt
# (sent as part of the user message since `o1-preview` does not support system messages)
SYSTEM_PROMPT = """You are a Taylor Swift expert. Answer CORRECTLY and CONCISELY questions about Taylor Swift's life, achievements, songs, and more."""

# Initialize ROUGE scorer (ROUGE-L only, with stemming enabled)
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Function to generate responses using OpenAI's chat completions API (default: `o1-preview`)
def generate_response_openai(prompt, model="o1-preview"):
    """Generate an answer for ``prompt`` and measure how long the request took.

    The module-level ``SYSTEM_PROMPT`` is prepended to ``prompt`` and the result is
    sent as a single user message through the module-level ``client``.

    Args:
        prompt: Question text to send to the model.
        model: Name of the chat-completions model to query. Defaults to
            ``"o1-preview"``, the model this notebook evaluates.

    Returns:
        A ``(generated_text, inference_time)`` tuple. ``generated_text`` is the
        stripped message content, or ``""`` when the response carried no content
        or the request raised. ``inference_time`` is the measured elapsed time of
        the call in seconds; it is a real measurement for failed calls too.
    """
    full_prompt = f"{SYSTEM_PROMPT}\n\n{prompt}"  # Concatenate system message to user input

    # perf_counter is monotonic, so the interval is not affected by system clock changes.
    start_time = time.perf_counter()
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": full_prompt}],
        )
        # The message content is optional in the API response; treat a missing
        # value as an empty answer instead of failing on `None.strip()`.
        content = response.choices[0].message.content
        generated_text = (content or "").strip()
    except Exception as e:
        # Best-effort evaluation: report the error and carry on with an empty answer.
        print(f"Error generating response: {e}")
        generated_text = ""

    # Report the time actually spent rather than a made-up placeholder on failure.
    return generated_text, time.perf_counter() - start_time

# Evaluate on test set
total_rouge_l = []      # ROUGE-L F1 for every sample, in dataset order
inference_times = []    # time reported by generate_response_openai for every sample
failed_samples = []     # indices of samples for which no answer text came back

for i, sample in enumerate(test_dataset):
    prompt = sample["question"]
    reference_answer = sample["answer"]

    generated_response, inference_time = generate_response_openai(prompt)

    # generate_response_openai returns an empty string when the request failed;
    # remember those samples so their timings can be kept out of the latency average.
    if not generated_response:
        failed_samples.append(i)

    # Compute ROUGE-L score (empty answers are still scored like any other response)
    rouge_scores = scorer.score(reference_answer, generated_response)
    rouge_l_f1 = rouge_scores['rougeL'].fmeasure

    # Store scores and time
    total_rouge_l.append(rouge_l_f1)
    inference_times.append(inference_time)

    # Print sample results every 10 examples
    if i % 10 == 0:
        print(f"\nSample {i}:")
        print(f"Q: {prompt}")
        print(f"Reference: {reference_answer}")
        print(f"Response: {generated_response}")
        print(f"ROUGE-L F1: {rouge_l_f1:.4f}")
        print(f"Inference Time: {inference_time:.4f} sec\n")

# Compute final metrics
# Only samples that produced an answer contribute to the latency average: the time
# attached to a failed call does not describe how long the model takes to respond.
failed_lookup = set(failed_samples)
successful_times = [t for idx, t in enumerate(inference_times) if idx not in failed_lookup]

# Guard against empty inputs so np.mean is never called on an empty list.
average_rouge_l = float(np.mean(total_rouge_l)) if total_rouge_l else float("nan")
average_inference_time = float(np.mean(successful_times)) if successful_times else float("nan")

print("\n===== Final Results =====")
print(f"Average ROUGE-L F1 score: {average_rouge_l:.4f}")
print(f"Average inference time per response: {average_inference_time:.4f} sec")
if failed_samples:
    print(f"Samples without a response (excluded from timing): {len(failed_samples)}")


Sample 0:
Q: Has Taylor Swift written songs for other artists?
Reference: Yes, Taylor Swift has written songs for other artists. Some notable examples include This Is What You Came For by Calvin Harris featuring Rihanna, Better Man by Little Big Town, and You'll Always Find Your Way Back Home by Miley Cyrus.
Response: Yes, Taylor Swift has written songs for other artists. Notably, she co-wrote "This Is What You Came For" by Calvin Harris featuring Rihanna under the pseudonym Nils Sjöberg. She also wrote "Better Man" for Little Big Town and "Babe" for Sugarland.
ROUGE-L F1: 0.6353
Inference Time: 5.8229 sec


Sample 10:
Q: What is the opinion of Sasha Frere-Jones of The New Yorker about Taylor Swift's performance?
Reference: Sasha Frere-Jones of The New Yorker called Taylor Swift a "preternaturally skilled" performer with a vibrant stage presence.
Response: Sasha Frere-Jones, a former music critic for *The New Yorker*, has offered mixed opinions about Taylor Swift's performances. While

In [None]:
===== Final Results =====
Average ROUGE-L F1 score: 0.3295
Average inference time per response: 15.4833 sec

''