In [1]:
import torch
from torch import nn
import math
import numpy as np
from torch.nn import functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import json
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
folder_path = '/content/drive/My Drive/NLP/Project/reports'
files = os.listdir(folder_path) #.json files generated from the Quiplash game
print(files)

['HHWI3.json', 'CKHQ6.json', 'IRBZ.json', 'IRBZ2.json', 'HHWI2.json', 'CKHQ.json', 'CKHQ5.json', 'HHWI.json', 'CKHQ7.json', 'CKHQ3.json', 'CKHQ2.json', 'CKHQ4.json', 'CKHQ8.json']


In [4]:
data = []
for file in files:
  with open(os.path.join(folder_path, file), 'r') as f:
    raw_data = json.load(f)
    data += list(raw_data.items())[3:]
len(data)

52

In [5]:
data[10]

('Invent a holiday that you think everyone would enjoy',
 {'free icecream day': {'submitter': 'yuri', 'votes': ['yun', 'cr']},
  'holiday month': {'submitter': 'lily', 'votes': []},
  'National Nap Day: snooze your worries away': {'submitter': 'AI (gpt)',
   'votes': [],
   'aiVotes': ['yun', 'cr']},
  'winningAnswer': 'free icecream day',
  'mostLikelyAiAnswer': 'National Nap Day: snooze your worries away'})

In [6]:
aiSubmitters = []
inputs = []
responses = []
winningAnswers = []
for input, subdict in data:
    inputs.append(input)
    responses.append(list(subdict.keys())[:-2])
    winningAnswers.append(subdict['winningAnswer'])
    idx = 0
    for sub_key, value in subdict.items():
        if type(value) == dict:
          if "AI" in value['submitter']:
            aiSubmitters.append(str(idx + 1))
          idx += 1

In [7]:
import numpy as np

# Function to find indices using boolean mask
def find_indices_with_mask(responses, target):
    indices = []

    for idx, item in enumerate(target):
        mask = np.isin(responses[idx], item)
        indices.append(str(1 + np.where(mask)[0][0]))

    return indices

In [8]:
winningAnswerIdx = find_indices_with_mask(responses, winningAnswers)
print(winningAnswerIdx[10])

1


## Prompts

In [9]:
funny_task = ["You are a judge tasked with selecting the funniest response based on creativity, wordplay, and overall comedic impact. \
              The responses were generated based on this input: ", " Which response is the funniest? 1, 2, or 3."]

general_task = " Your answer should be either 1, 2, or 3." #, and explain."

ai_task = ["You are a judge tasked with choosing which response to an input was generated by a large language model (LLM). \
           The responses were generated based on this input: ", " Which response was generated by a LLM? 1, 2, or 3."]


In [10]:
def generate_system_task(task, prompt):
  return task[0] + "\""+ prompt + "\"."+ task[1] + general_task

In [11]:
def generate_user_prompt(responses, prompt):
  user_prompt = "The three responses to the input: \"" + prompt + "\" are: "
  for i, r in enumerate(responses):
    user_prompt += "Response " + str(i+1) + " =\"" + r + "\" "
  user_prompt += "Please respond with either 1, 2, or 3." # and explain."
  return user_prompt

# Messages

In [12]:
def generate_messages(task, inputs, responses):
     messages = []
     for idx, input in enumerate(inputs):
          message = [{"role": "system", "content":generate_system_task(task, input)},
                     {"role": "user", "content": generate_user_prompt(responses[idx], input)}]
          messages.append(message)
     return messages

messages = generate_messages(ai_task, inputs, responses)
messages

[[{'role': 'system',
   'content': 'You are a judge tasked with choosing which response to an input was generated by a large language model (LLM).            The responses were generated based on this input: "Odd new shampoo instructions: “Lather, Rinse, _________, Repeat.”". Which response was generated by a LLM? 1, 2, or 3. Your answer should be either 1, 2, or 3.'},
  {'role': 'user',
   'content': 'The three responses to the input: "Odd new shampoo instructions: “Lather, Rinse, _________, Repeat.”" are: Response 1 ="Sing baby from Justin beiver" Response 2 ="Dance" Response 3 ="Lather, Rinse, SCREAM INTO THE VOID, Repeat" Please respond with either 1, 2, or 3.'}],
 [{'role': 'system',
   'content': 'You are a judge tasked with choosing which response to an input was generated by a large language model (LLM).            The responses were generated based on this input: "What’s really in a camel’s hump?". Which response was generated by a LLM? 1, 2, or 3. Your answer should be either

# GPT

In [13]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.5
    Uninstalling openai-1.54.5:
      Successfully uninstalled openai-1.54.5
Successfully installed openai-0.28.0


In [None]:
import openai

# Set your OpenAI API key
openai.api_key = 'add_your_key'

In [15]:
def evaluate_gpt(inputs, responses, task):

    messages = generate_messages(task, inputs, responses)

    outputs = []

    for msg in messages:
        response = openai.ChatCompletion.create(
                      model="gpt-3.5-turbo",  # Or use "gpt-3.5-turbo" for GPT-3.5
                      messages=msg,
                      max_tokens=100
                  )
        outputs.append(response['choices'][0]['message']['content'].strip())

    return outputs

In [16]:
funny_outputs = evaluate_gpt(inputs, responses, funny_task)
funny_outputs

['I would select Response 3 as the funniest.',
 'I choose response 3.',
 'I believe response 1 is the funniest.',
 'I choose response 3: "Eau de Golden Girl".',
 'I choose Response 3: "pineapple on pizza is a war crime"',
 'As a judge, I select Response 1: "a skunk" as the funniest response.',
 '3',
 'I choose response 3: "Bring your mom."',
 '3',
 'I choose Response 3 ="barkAtTheMoon666"',
 'I choose response 3.',
 '2',
 'I choose response 3: "unicorn riding lessons."',
 'I choose response 2 = "see other people\'s poop".',
 'I choose response 3: "making your wallet disappear".',
 'I choose response 3: "yeet meat into the cheap seats"',
 '2',
 'I choose response 2.',
 '3',
 'I choose response 3: "My Hamster Wrote This."',
 'I choose response 3.',
 'I choose response 2.',
 'I choose response 1.',
 'I choose response 1.',
 '2',
 'I choose Response 3: "a really long nap"',
 'I choose response 2 = "pec dance"',
 'I choose response 3.',
 'I choose Response 3 ="kale and existential dread jui

In [17]:
ai_outputs = evaluate_gpt(inputs, responses, ai_task)
ai_outputs

['The response generated by a large language model (LLM) is Response 3 ="Lather, Rinse, SCREAM INTO THE VOID, Repeat." Therefore, the answer is 3.',
 'The response generated by a LLM is: 2',
 'The response generated by a large language model is Response 3 = "your family tree is from Chernobyl". Therefore, the answer is 3.',
 '2',
 'The response generated by a large language model (LLM) is Response 3: "pineapple on pizza is a war crime."',
 '2',
 '3',
 'The response generated by a large language model is 3.',
 '2',
 '2',
 'The response generated by a large language model (LLM) is Response 3 = "National Nap Day: snooze your worries away."',
 'The response generated by a LLM is: 3',
 'The response generated by a large language model (LLM) is Response 3: "unicorn riding lessons." \n\nTherefore, the answer is 3.',
 'The response generated by a large language model is: 3',
 'The response generated by a large language model is: Response 3 = "making your wallet disappear" \n\nTherefore, my ans

In [18]:
def evaluateMetric(output, humanTarget):
    count_match = 0
    problematic_outputs_idx = []
    for idx, (target, out) in enumerate(zip(humanTarget, output)):
        if target in out:
            count_match += 1
        if target not in ["1", "2", "3"]:
            problematic_outputs_idx.append(idx)

    return count_match, problematic_outputs_idx

In [19]:
def listProblematicOutputs(problematic_outputs_idx, outputs, answers):
  if len(problematic_outputs_idx) > 0:
    mask = np.array(problematic_outputs_idx)
    problematic_outputs = np.array(outputs)[mask]
    problematic_answers = np.array(answers)[mask]

    for idx, out, answer in zip(problematic_outputs_idx, problematic_outputs, problematic_answers):
      print(idx, out, answer)

In [23]:
n = 20

In [24]:
total_count = 0
for i in range(n):
    funny_outputs = evaluate_gpt(inputs, responses, funny_task)
    count_funny, problematic_funny_idx = evaluateMetric(funny_outputs, winningAnswerIdx)
    print("count correct: ",count_funny)
    total_count += count_funny
    listProblematicOutputs(problematic_funny_idx, funny_outputs, winningAnswerIdx)
print(total_count, total_count / n)

count correct:  31
count correct:  27
count correct:  28
count correct:  26
count correct:  26
count correct:  28
count correct:  29
count correct:  26
count correct:  26
count correct:  29
count correct:  27
count correct:  26
count correct:  27
count correct:  30
count correct:  29
count correct:  28
count correct:  27
count correct:  25
count correct:  24
count correct:  23
542 27.1


In [25]:
total_ai_count = 0
for i in range(n):
    ai_outputs = evaluate_gpt(inputs, responses, ai_task)
    count_aiSubmitters, problematic_ai_idx = evaluateMetric(ai_outputs, aiSubmitters)
    print("count correct: ",count_aiSubmitters)
    total_ai_count += count_aiSubmitters
    listProblematicOutputs(problematic_ai_idx, ai_outputs, count_aiSubmitters)
print(total_ai_count, total_ai_count / n)

count correct:  30
count correct:  25
count correct:  27
count correct:  28
count correct:  25
count correct:  30
count correct:  26
count correct:  27
count correct:  26
count correct:  25
count correct:  26
count correct:  27
count correct:  24
count correct:  29
count correct:  25
count correct:  29
count correct:  27
count correct:  28
count correct:  29
count correct:  26
539 26.95


https://lmarena.ai/

# Llama

In [None]:
from huggingface_hub import notebook_login
notebook_login("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")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# TODO: Load the model using the appropriate parameters using AutoModelForCausalLM
# Ensure torch_dtype is set to torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# TODO: Initialize the tokenizer using AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
def run_model(model, tokenizer, messages, max_new_tokens=5, verbose=False):
    # TODO: Prepare the input text using the tokenizer's apply_chat_template (Do not tokenize the text yet)
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)

    if verbose: print("\n###input_text:###\n", input_text)
    # TODO: Tokenize the input text and transfer it to the appropriate device
    input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].to(device)

    if verbose: print("\n###input_ids:###\n", input_ids)
    # TODO: Generate a response using the model. Ensure do_sample is False.
    output = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        temperature = 1.0,
        top_p = 1.0,
        pad_token_id=tokenizer.eos_token_id
    )

    # TODO: Decode the output and return the response without special tokens
    # https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
    response = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)    # output[0][input_ids.shape[-1]:] REMOVES the inputs from response
    assistant_response = response.split("assistant")[1].strip() if "assistant" in response else response

    if verbose: print("\n###response:###\n", response)
    return assistant_response

In [None]:
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

def run_llama(model, tokenizer, inputs, responses, task):

    messages = generate_messages(task, inputs, responses)

    outputs = []

    for msg in messages:
        output = run_model(model, tokenizer, msg, max_new_tokens=5, verbose=False).lower().strip()
        outputs.append(output)

    return outputs

In [None]:
funny_outputs = run_llama(model, tokenizer, inputs, responses, funny_task)
funny_outputs

['response',
 'response',
 'response',
 'based',
 'response',
 'response',
 'based',
 'based',
 'based',
 'response',
 'after',
 'based',
 'based',
 'based',
 'response',
 'based',
 'response',
 'after',
 'i',
 'based',
 'based',
 'based',
 'based',
 'based',
 'response',
 'based',
 'response',
 'based',
 'based',
 'based',
 'based',
 'response',
 'response',
 'response',
 'based',
 'based',
 'after',
 'response',
 'based',
 'based',
 'response',
 'based',
 'after',
 'based',
 'response',
 'response',
 'based',
 'based',
 'after',
 'based',
 'response',
 'response']

In [None]:
ai_outputs = run_llama(model, tokenizer,  inputs, responses, ai_task)
ai_outputs

['the',
 'the',
 'based',
 'response',
 'based',
 'the',
 'response',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'response',
 'based',
 'i',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based',
 'the',
 'response',
 'based',
 'based',
 'based',
 'based',
 'based',
 'the',
 'based',
 'based',
 'based',
 'based',
 'response',
 'based',
 'based',
 'based',
 'based',
 'based',
 'based']

In [None]:
total_count_llama = 0
for i in range(n):
    funnyllama_outputs = evaluate_gpt(inputs, responses, funny_task)
    count_funny, problematic_funnyllama_idx = evaluateMetric(funnyllama_outputs, winningAnswerIdx)
    print("count correct: ",count_funny)
    total_count_llama += count_funny
    listProblematicOutputs(problematic_funnyllama_idx, funnyllama_outputs, winningAnswerIdx)
print(total_count_llama, total_count_llama / n)

In [None]:
total_aicount_llama = 0
for i in range(n):
    ai_llama_outputs = evaluate_gpt(inputs, responses, ai_task)
    count_aiSubmitters, problematic_ai_idx_llama = evaluateMetric(ai_llama_outputs, aiSubmitters)
    print("count correct: ",count_aiSubmitters)
    total_aicount_llama += count_aiSubmitters
    listProblematicOutputs(problematic_ai_idx_llama, ai_llama_outputs, count_aiSubmitters)
print(total_aicount_llama, total_aicount_llama / n)