In [11]:
! pip install datasets transformers torch pandas numpy tqdm openpyxl ipywidgets huggingface_hub openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting openai
  Downloading openai-1.62.0-py3-none-any.whl (464 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.8/464.8 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting jiter<1,>=0.4.0
  Downloading jiter-0.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (345 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.0/345.0 KB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0
  Downloading httpx-0.28.1-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 KB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collecting sniffio
  Downloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Collecting typing-extensions>=3.7.4.3
  Downloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Collecting anyio<5,>=3.5.0
  Downloading anyio-4.8.0-py3-none-any.whl (96 kB)
[2K     [90m━━

In [2]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from IPython.display import Markdown, display
from tqdm import tqdm 
import torch
from typing import List, Tuple
import random
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [3]:
import pandas as pd

# Login using e.g. `huggingface-cli login` to access this dataset
df = pd.read_csv("Data/gpqa_diamond.csv")

## Load Checkpoint Model

In [35]:
# Replace 'path/to/your/output_dir' with the actual path (e.g., args.output_dir)
model_path = "../ckpts/s1_20250213_023116"

# Load the model and tokenizer from the directory where you saved them
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

## Evaluate Responses

In [36]:
def get_question(row: pd.Series) -> Tuple[str, List[str]]:
        question = row['Question']
        correct_answer = row['Correct Answer']
        incorrect_answers = [
            row['Incorrect Answer 1'],
            row['Incorrect Answer 2'],
            row['Incorrect Answer 3']
        ]
        options = [correct_answer] + incorrect_answers
        shuffled_options = list(enumerate(options))
        random.shuffle(shuffled_options)
        
        formatted_question = f"{question}\n\nOptions:\n"
        for i, option in shuffled_options:
            formatted_question += f"- {option}\n"
        
        return formatted_question, [option for _, option in shuffled_options]

def get_correct_answer(row: pd.Series) -> str:
        return row['Correct Answer']

def check_answer(model_answer: str, correct_answer: str) -> bool:
        try:
            return model_answer.strip().lower() == correct_answer
        except (ValueError, IndexError):
            return False

def query_qwen2_5_think(user_message: str, model, tokenizer) -> str:
    """
    Queries the Qwen2.5-14-Instruct model with the provided user_message
    and returns the assistant's response.

    Args:
        user_message (str): The user query.
        model: The loaded Qwen2.5 model.
        tokenizer: The Qwen2.5 tokenizer.

    Returns:
        str: The assistant's reply.
    """
    # Define the system message
    system_message = "You are a helpful AI assistant. Succinctly answer the provided question."
    system_message2 = "Think"
    
    # Additional query instructing the assistant to think before answering
    additional_instruction = ("Reply only with the correct option.")
    
    # Format the prompt using ChatML format with an extra user message
    formatted_prompt = (
        f"<|im_start|>system\n{system_message}<|im_end|>\n"
        f"<|im_start|>user\n{user_message}<|im_end|>\n"
        f"<|im_start|>system\n{system_message2}<|im_end|>\n"
        f"<|im_start|>system\n{additional_instruction}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    # Set the device and prepare inputs
    device = "cuda"
    encoded_inputs = tokenizer(formatted_prompt,
                               return_tensors="pt",
                               padding=False)
    inputs = encoded_inputs["input_ids"].to(device)
    attention_mask = encoded_inputs["attention_mask"].to(device)
    model = model.to(device)

    # Generate the model output with a sufficient token budget and proper EOS handling
    outputs = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=32768,
        eos_token_id=tokenizer.convert_tokens_to_ids("<|im_end|>")
    )

    # Decode the output (keeping the special tokens for extraction)
    raw_output = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Extract the assistant's response from the output
    assistant_part = raw_output.split("<|im_start|>assistant")[-1]
    assistant_response = assistant_part.split("<|im_end|>")[0].strip()

    return assistant_response


In [None]:
from tqdm import tqdm 

for index, row in tqdm(df.iterrows(), total=len(df), desc="Generating Responses"):
    if index > 100: 
        break
    question = get_question(row)
    model_answer = query_qwen2_5_think(question[0], model, tokenizer)
    df.loc[index, "Adjusted_Model_Answer"] = model_answer
    df.to_excel("Results_Checkpoint_Model_Short.xlsx")



Generating Responses:   0%|          | 0/198 [00:00<?, ?it/s]

## Load Original Model

In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

2025-02-13 16:18:15.316197: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739463495.330035  139865 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739463495.333897  139865 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [33]:
def query_qwen2_5(user_message: str, model, tokenizer) -> str:
    """
    Queries the Qwen2.5-14-Instruct model with the provided user_message
    and returns the assistant's response.

    Args:
        user_message (str): The user query.
        model: The loaded Qwen2.5 model.
        tokenizer: The Qwen2.5 tokenizer.

    Returns:
        str: The assistant's reply.
    """
    # Define the system message
    system_message = "You are a helpful AI assistant. Succinctly answer the provided question."
    system_message2 = "Think"
    
    # Additional query instructing the assistant to think before answering
    additional_instruction = ("Reply only with the correct option.")
    
    # Format the prompt using ChatML format with an extra user message
    formatted_prompt = (
        f"<|im_start|>system\n{system_message}<|im_end|>\n"
        f"<|im_start|>user\n{user_message}<|im_end|>\n"
        f"<|im_start|>system\n{system_message2}<|im_end|>\n"
        f"<|im_start|>system\n{additional_instruction}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    # Set the device and prepare inputs
    device = "cuda"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    model = model.to(device)

    # Generate the model output with a sufficient token budget and proper EOS handling
    outputs = model.generate(
        **inputs,
        max_new_tokens=32768,
        eos_token_id=tokenizer.convert_tokens_to_ids("<|im_end|>")
    )

    # Decode the output (keeping the special tokens for extraction)
    raw_output = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Extract the assistant's response from the output
    assistant_part = raw_output.split("<|im_start|>assistant")[-1]
    assistant_response = assistant_part.split("<|im_end|>")[0].strip()

    return assistant_response

In [34]:
from tqdm import tqdm 

for index, row in tqdm(df.iterrows(), total=len(df), desc="Generating Responses"):
    if index > 100: 
        break
    question = get_question(row)
    model_answer = query_qwen2_5(question[0], model, tokenizer)
    df.loc[index, "Original_Model_Answer"] = model_answer
    df.to_excel("Results_Original_Model_Short.xlsx")

Generating Responses:  51%|█████     | 101/198 [01:54<01:49,  1.13s/it]


## OpenAI GPT Model

In [9]:
import openai
from openai import OpenAI

client = OpenAI(api_key=openai_api_key)

completion = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "developer", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
  ]
)

print(completion.choices[0].message)


ChatCompletionMessage(content='Hello! How can I assist you today?', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


In [31]:
def query_openai(question):
    client = OpenAI(api_key=openai_api_key)
    completion = client.chat.completions.create(
          model="gpt-4o",
          temperature=0,
          messages=[
            {"role": "developer", "content": "You are a helpful assistant."},
            {"role": "user", "content": question},
          ]
        )

    return completion.choices[0].message.content


from tqdm import tqdm 

for index, row in tqdm(df.iterrows(), total=len(df), desc="Generating Responses"):
    if index > 100: 
        break
    question = get_question(row)
    model_answer = query_openai(question[0])
    df.loc[index, "GPT4o_Model_Answer"] = model_answer
    df.to_excel("Results_GPT4o.xlsx")

Generating Responses:  51%|█████     | 101/198 [25:11<24:11, 14.97s/it] 
