In [2]:
%load_ext autoreload
%autoreload 2
from typing import List
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm, trange
from data_utils import load_from_json, save_to_json
from dotenv import load_dotenv
import os
import re
from tabulate import tabulate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

In [4]:
def format_llama3_prompt(
    user_prompt,
    system_prompt="You are a skilled multi-step reasoner that answers questions.",
    words_in_mouth="",
):
    return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{words_in_mouth}"""

In [5]:
def answer_question(model, tokenizer, question: str, words_in_mouth: str=''):
    """ Returns the model's answer and its output probability """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    prompt = format_llama3_prompt(question + ' Reason out loud step by step and then surround your final answer with <<>> like this: <<answer>>', words_in_mouth=words_in_mouth)
    
    tokens = tokenizer(prompt, return_tensors='pt')
    input_ids = tokens.input_ids.to(device)
    attention_mask = tokens.attention_mask.to(device)

    # Generate output
    with torch.no_grad():
        output = model.generate(
            input_ids, 
            max_length=1000, 
            temperature=1, 
            num_return_sequences=1, 
            attention_mask=attention_mask,
            return_dict_in_generate=True,
            output_scores=True
        )

    # Decode the generated sequence
    full_output = tokenizer.decode(output.sequences[0], skip_special_tokens=True)
    answer = full_output.split('assistant\n\n')[1].strip()

    # Find where the actual response starts
    response_start = len(tokenizer.encode(prompt, return_tensors='pt')[0])

    # Calculate probability product only for the response tokens
    prob_product = 1.0
    output_token_count = 0
    for i, score in enumerate(output.scores):
        if i + response_start < len(output.sequences[0]):
            probs = torch.nn.functional.softmax(score, dim=-1)
            token_prob = probs[0, output.sequences[0][i + response_start]].item()
            prob_product *= token_prob
            output_token_count += 1

    return answer, prob_product, output_token_count

In [6]:
def extract_answer(text):
    pattern = r'<<(.*?)>>'
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return None

In [46]:
MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto", token=HF_TOKEN
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:29<00:00,  7.43s/it]


In [47]:
question_data = load_from_json("multihop_questions.json")

In [49]:
item = new_questions[0]
item

{'question': 'Which space agency launched the telescope named after the scientist who formulated the theory of general relativity?',
 'checkpoint_answer': 'Einstein',
 'answer': 'NASA',
 'prefix': 'Here is reasoning and answer:\n\nFirst, I know that the scientist who formulated the theory of general relativity is Albert Einstein'}

In [51]:
answer_question(model, tokenizer, item['question'], "Here is reasoning and answer:\n\n" + item['prefix'] if 'prefix' in item else '')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


("Here is reasoning and answer:\n\nHere is reasoning and answer:\n\nFirst, I know that the scientist who formulated the theory of general relativity is Albert Einstein.\n\nSecond, I know that there is a space agency named after Albert Einstein, which is the European Space Agency's (ESA) Albert Einstein Telescope.\n\nThird, the European Space Agency's Albert Einstein Telescope is not a real telescope, so the correct answer cannot be the European Space Agency.\n\nFourth, I know that there is a space agency that is active in the field of space exploration, which is the National Aeronautics and Space Administration (NASA).\n\nFifth, I know that NASA has launched several space-based telescopes, such as Hubble Space Telescope, Kepler Space Telescope, and James Webb Space Telescope.\n\nSixth, I know that none of the previously mentioned NASA telescopes are named after Albert Einstein.\n\nSeventh, I know that NASA has launched a space-based telescope named after Albert Einstein, which is the J

In [67]:
def answer_questions(model, tokenizer, questions: List[dict], num_iterations: int=10):
    results = load_from_json("multihop_results.json")
    for _ in range(num_iterations):
        for item in tqdm(questions):
            response, prob, num_tokens = answer_question(model, tokenizer, item['question'], "Here is reasoning and answer:\n\n" + item['prefix'] if 'prefix' in item else '')
            answer = extract_answer(response)
            
            if item['question'] not in results:
                results[item['question']] = []
            
            result = {
                'response': response,
                'answer': answer,
                'correct': item['answer'] in answer if answer else False,
                'prob': prob,
                'num_tokens': num_tokens
            }
            if 'prefix' in item:
                result['prefix'] = item['prefix']
            results[item['question']].append(result)
            save_to_json(results, "multihop_results.json")

In [10]:
# answer_questions(model, tokenizer, question_data)

In [105]:
results = load_from_json("multihop_results.json")
keys = list(results.keys())

In [82]:
results[keys[0]][0]

{'response': 'Here is reasoning and answer:\n\nThe movie in question is likely "Titanic". \n\nJack Dawson, the main character, is played by Leonardo DiCaprio.\n\nIn the movie, Jack dies after his ship, the Titanic, sinks, and he sacrifices his life by allowing Rose to escape on a lifeboat, thus staying behind and drowning.\n\nTherefore, the director of the movie is James Cameron, who directed the 1997 film "Titanic" <<James Cameron>>.',
 'answer': 'James Cameron',
 'correct': True,
 'prob': 6.6429801122075506e-31,
 'num_tokens': 90}

In [83]:
for i, key in enumerate(keys):
    if i not in [2, 5, 6, 7, 8]: continue
    for item in results[key]:
        match i:
            case 2:
                item['successful_prefix'] = item['response'].split('Einstein')[0] + 'Einstein'
            case 5:
                item['successful_prefix'] = item['response'].split('California')[0] + 'California'
            case 6:
                item['successful_prefix'] = item['response'].split('Bell')[0] + 'Bell'
            case 7:
                item['successful_prefix'] = item['response'].split('Oxford')[0] + 'Oxford'
            case 8:
                item['successful_prefix'] = item['response'].split('Hemingway')[0] + 'Hemingway'

save_to_json(results, "_multihop_results.json")

In [84]:
new_questions = []
for i, key in enumerate(keys):
    if i not in [2, 5, 6, 7, 8]: continue
    for item in results[key]:
        question = [q for q in question_data if q['question'] == key][0]
        question['prefix'] = item['successful_prefix']
        new_questions.append(question)
len(new_questions)

2807

In [110]:
results[keys[2]]

[{'response': 'Here is reasoning and answer:\n\nFirst, I know that the scientist who formulated the theory of general relativity is Albert Einstein. \n\nNext, I know that there is a telescope named after Albert Einstein, and it is called the Hubble Space Telescope. However, the Hubble Space Telescope is not launched by a space agency that is directly named after Albert Einstein.\n\nThen, I recall that the space agency that launched the Hubble Space Telescope is NASA, and NASA is not directly named after Albert Einstein.\n\nHowever, I know that there is another space agency that is named after a scientist who made significant contributions to the field of relativity. The agency is called the European Space Agency, and it is named after Hermann Oberth, a German engineer and physicist who is considered one of the founders of modern astronautics. \n\nBut, Hermann Oberth is not the scientist who formulated the theory of general relativity, which is Albert Einstein.\n\nTherefore, I conclude 

In [112]:
len([r for r in results[keys[2]] if 'prefix' in r and r['prefix'].startswith('Here is reasoning and answer:\n\n')])

533

In [113]:
len([r for r in results[keys[2]] if 'prefix' in r and not r['prefix'].startswith('Here is reasoning and answer:\n\n')])

0

In [94]:
answer_questions(model, tokenizer, [q for q in question_data if q['question'] in [keys[2], keys[5], keys[6], keys[7], keys[8]]], num_iterations=10)

  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 20%|██        | 1/5 [00:13<00:55, 13.91s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 40%|████      | 2/5 [00:18<00:25,  8.41s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 60%|██████    | 3/5 [00:20<00:11,  5.66s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 80%|████████  | 4/5 [00:21<00:03,  3.56s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 5/5 [00:24<00:00,  4.83s/it]
  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 1/5 [00:09<00:36,  9.18s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 40%|████      | 2/5 [00:12<00:16,  5.54s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 60%|██████    | 3/5 [00:14<00:07,  3.86s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 80%|████████  | 4

In [98]:
print(tabulate([[i, 
                 question, 
                 sum([r['correct'] for r in results[question] if 'prefix' in r]) / 
                 len([r['correct'] for r in results[question] if 'prefix' in r]), 
                 sum([r['correct'] for r in results[question] if 'prefix' not in r]) /
                 len([r['correct'] for r in results[question] if 'prefix' not in r]),
                 [r['prob'] for r in results[question]][0]
                 # len(results[question])
                 ] 
      for i, question in enumerate(results) if i in [2, 5, 6, 7, 8]],
      headers = ['Index', 'Question', 'milestone', 'end_to_end', 'golden']))

  Index  Question                                                                                                                      milestone    end_to_end       golden
-------  --------------------------------------------------------------------------------------------------------------------------  -----------  ------------  -----------
      2  Which space agency launched the telescope named after the scientist who formulated the theory of general relativity?           0.403377          0.48  7.61307e-49
      5  What is the most populous city in the state where Google is headquartered?                                                     0.580038          0.58  8.27118e-09
      6  What is the country where the inventor of the telephone died?                                                                  0.757062          0.7   3.8323e-26
      7  In which century was the university founded where Tolkien was a professor?                                                     0.220

  Index  Question                                                                                                                      milestone    end_to_end       golden
-------  --------------------------------------------------------------------------------------------------------------------------  -----------  ------------  -----------
      2  Which space agency launched the telescope named after the scientist who formulated the theory of general relativity?           0.403377          0.48  7.61307e-49
      5  What is the most populous city in the state where Google is headquartered?                                                     0.560000          0.58  8.27118e-09
      6  What is the country where the inventor of the telephone died?                                                                  0.757062          0.76  3.8323e-26
      7  In which century was the university founded where Tolkien was a professor?                                                     0.220339          0.32  1.31297e-41
      8  In what year did the Nobel Prize-winning author of "The Old Man and the Sea" purchase the house where he wrote that novel?     0.361582          0.38  2.60653e-40

In [None]:
  Index  Question                                                                                                                      milestone    end_to_end       golden
-------  --------------------------------------------------------------------------------------------------------------------------  -----------  ------------  -----------
      2  Which space agency launched the telescope named after the scientist who formulated the theory of general relativity?           0.405354          0.48  7.61307e-49
      5  What is the most populous city in the state where Google is headquartered?                                                     0.577735          0.58  8.27118e-09
      6  What is the country where the inventor of the telephone died?                                                                  0.758157          0.7   3.8323e-26
      7  In which century was the university founded where Tolkien was a professor?                                                     0.220729          0.32  1.31297e-41
      8  In what year did the Nobel Prize-winning author of "The Old Man and the Sea" purchase the house where he wrote that novel?     0.364683          0.3   2.60653e-40


In [13]:
for i in [2, 5, 6, 7, 8]:
    print(keys[i])
    print([r['answer'] for r in results[keys[i]]])

Which space agency launched the telescope named after the scientist who formulated the theory of general relativity?
['NASA', 'NASA', 'NASA launched the Albert Einstein Telescope', 'ESA', 'ESA', 'ESA', 'NASA', 'NASA launched the Chandra X-ray Observatory, named after Albert Einstein', 'The European Space Agency (ESA) launched the telescope named after the scientist who formulated the theory of general relativity, Albert Einstein.', 'NASA', 'NASA', 'there is no space agency that launched a telescope named after Albert Einstein', 'NASA', 'European Space Agency (ESA)', 'NASA', 'NASA', 'ESA', 'Hence, the space agency that launched the telescope named after the scientist who formulated the theory of general relativity is the European Space Agency (ESA), but not yet, as the Albert Einstein Telescope is planned but not yet launched.', 'The European Space Agency (ESA)', 'Chandra X-ray Observatory', 'NASA', 'NASA launched the telescope named after the scientist who formulated the theory of gene