In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local helper
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
#%%
from dataclasses import dataclass, field
from typing import Optional
import torch
import more_itertools
import logging
import sys
import textwrap
import collections


import tqdm
from transformers import pipeline, HfArgumentParser
from datasets import load_from_disk
import gradio as gr
from contextlib import nullcontext, redirect_stdout

# Make the sibling helper modules (question_answer_utils, utils) importable.
# `__vsc_ipynb_file__` is a VS Code-specific notebook global and `__file__` only exists
# when this code runs as a script, so look both up defensively and fall back to the
# current working directory instead of raising NameError in other front ends.
# TODO (carried over from the original): revisit this path hack.
import os, sys
_source_path = globals().get("__vsc_ipynb_file__") or globals().get("__file__")
NOTEBOOK_DIR = os.path.dirname(os.path.abspath(_source_path)) if _source_path else os.getcwd()
if NOTEBOOK_DIR not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(NOTEBOOK_DIR)
from question_answer_utils import extract_answer, extract_question_prompt, get_question_answer_to_chat_formatter
from utils import load_model_for_inference, setup_logging, subset_dataset

#%%
# Logger for this notebook, with INFO messages enabled on it.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Project helper imported from utils; presumably installs the handlers/format seen in
# the cell outputs — TODO confirm against utils.setup_logging.
setup_logging()


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Flag in the log that the settings used by this notebook are hard-coded in this cell.
logger.warning("Using dummy args")
# Root directory that holds the training runs, datasets and prediction files.
# Defaults to the original location; set VERBALIZATION_RUNS_DIR to run the notebook
# on another machine without editing every path below.
runs_dir = os.environ.get(
    "VERBALIZATION_RUNS_DIR",
    "/home/mmordig/reinforcement/HumbleAttemptAtGeneralAI/runs/verbalization",
).rstrip("/")

# Checkpoint directory of the model to evaluate (alternatives kept for reference).
# model_name_or_path = f"{runs_dir}/training/overfit_single_nocompl/gpt2"
# model_name_or_path = f"{runs_dir}/training/overfit_single_nocompl/gpt2_2ex"
model_name_or_path = f"{runs_dir}/training/overfit_single_nocompl/gpt2_withpeft"
# Dataset directory passed to load_from_disk.
dataset_name = f"{runs_dir}/datasets/alpha_geo_small_processed"
# Split to predict on.
# dataset_test_name = "test"
dataset_test_name = "train" # for overfitting exp
filename_predictions_out = f"{runs_dir}/predictions/exp_small/gpt2_predictions.txt"
# Number of examples kept by subset_dataset.
# max_predict_samples = 2
max_predict_samples = 1
# Name of the dataset column that holds the full question/answer text.
dataset_text_field = "text"
# Generation budget per candidate; also used by the truncation check when reporting.
max_new_tokens = 70



In [34]:
# NOTE(review): the message mentions writing to a file, but nothing in this cell opens
# or writes `filename_predictions_out`.
logger.info(f"Generating predictions, writing to file '{filename_predictions_out}'")

# Load the dataset saved on disk, select the configured split, and pass it through the
# project helper subset_dataset with n_samples=max_predict_samples
# (presumably keeps at most that many examples — TODO confirm).
raw_datasets = load_from_disk(dataset_name)
dataset = raw_datasets[dataset_test_name]
dataset = subset_dataset(dataset, n_samples=max_predict_samples)

# Project helper; returns the model together with its tokenizer.
model, tokenizer = load_model_for_inference(model_name_or_path)

# A tokenizer that carries a chat template is treated as belonging to a chat model.
is_chat_model = tokenizer.chat_template is not None
if is_chat_model:
    logger.info("Detected chat model, formatting according to chat template")
    # assumes user-assistant roles
    prompt_extraction_function = get_question_answer_to_chat_formatter(tokenizer, text_column=None, add_generation_prompt=True)
else:
    # No chat template: use the plain prompt extractor from question_answer_utils.
    prompt_extraction_function = extract_question_prompt
    
def extract_extra_cols(batch):
    """Build the prompt and reference-answer columns for one batch.

    Reads the column named by `dataset_text_field` from `batch` and returns a dict with
    "question_prompt" (each text passed through `prompt_extraction_function`) and
    "answer_only" (each text passed through `extract_answer`).
    """
    texts = batch[dataset_text_field]
    prompts = [prompt_extraction_function(text) for text in texts]
    answers = [extract_answer(text) for text in texts]
    return {"question_prompt": prompts, "answer_only": answers}
# Add the "question_prompt" and "answer_only" columns, computed batch-wise.
dataset = dataset.map(extract_extra_cols, batched=True)
logger.info(f"Example datapoint: {dataset[0]}")

# Text-generation pipeline around the loaded model: 4 beams with sampling enabled,
# 2 returned candidates per prompt, at most `max_new_tokens` new tokens each.
# use_cache to avoid recomputing hidden states, see https://discuss.huggingface.co/t/what-is-the-purpose-of-use-cache-in-decoder/958
# max_new_tokens = 70
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, 
    num_return_sequences=2, num_beams=4, do_sample=True, max_new_tokens=max_new_tokens, use_cache=True,
    return_full_text=False, # answer only
    num_workers=2, 
    # batch_size=2 # triggers a cuda device-side error, maybe related to https://github.com/huggingface/transformers/issues/22546
)


05/02/2024 15:51:48 - INFO - __main__ - Generating predictions, writing to file '/home/mmordig/reinforcement/HumbleAttemptAtGeneralAI/runs/verbalization/predictions/exp_small/gpt2_predictions.txt'
05/02/2024 15:51:48 - INFO - utils - Loading model from '/home/mmordig/reinforcement/HumbleAttemptAtGeneralAI/runs/verbalization/training/overfit_single_nocompl/gpt2_withpeft/checkpoint-4000'
05/02/2024 15:51:49 - INFO - accelerate.utils.modeling - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
05/02/2024 15:51:50 - INFO - utils - Loaded model
05/02/2024 15:51:50 - INFO - __main__ - Example datapoint: {'text': '### Question: Points B, A, C are defined such that triangle ABC is an equilateral triangle. Define points D, F, & E such that E, D, and F is a right angle isosceles triangle with the right angle at D. Point G is defined such that G, A, E are three 

In [35]:
# from transformers import AutoTokenizer

# model_name_or_path = "gpt2"
# model_name_or_path = "meta-llama/Llama-2-7b-hf"
# # tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False, add_eos_token=True, pad_token="[PAD]")
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, add_eos_token=True)
# tokenizer.pad_token = "[PAD]"
# print(tokenizer.eos_token)
# print(tokenizer.pad_token)

# # ?tokenizer
# tokenizer.special_tokens_map

In [36]:
# question_prompt = dataset[0]["question_prompt"]
# text = dataset[0]["text"]
# tokenizer.decode(tokenizer(text)["input_ids"], include_special_tokens=True)[-20:]

In [37]:
# ?pipe

In [39]:
def break_nicely(x):
    """Wrap `x` with textwrap's default width and mark each inserted line break with "⏎"."""
    wrapped_lines = textwrap.wrap(x)
    return "\u23CE\n".join(wrapped_lines)
# Print a report per example: the prompt, the expected answer, and every distinct
# generated candidate together with how often it was produced.
# using a pipe/dataset is faster because GPU works in the background while writing to file
# with open(filename_predictions_out, 'w') as f, redirect_stdout(f):
# with nullcontext():
for (out, question_prompt, gt_answer) in tqdm.tqdm(
    zip(pipe(dataset["question_prompt"]), dataset["question_prompt"], dataset["answer_only"]),
    total=len(dataset),  # zip() has no length, so tqdm needs the total to draw a real progress bar
):
    print("#"*80)
    print("Query: ")
    print(break_nicely(question_prompt))
    print("Expected answer: ")
    print(break_nicely(gt_answer))
    # strips whitespace because generated text has leading and trailing whitespace
    out_counted = collections.Counter([candidate["generated_text"].strip() for candidate in out])
    gt_answer = gt_answer.strip()
    print(f"Number of candidates that are equal to expected: {out_counted.get(gt_answer, 0)}")
    n_prefix_matches = sum(count for (candidate_text, count) in out_counted.items() if candidate_text.startswith(gt_answer))
    print("Number of candidates that begin with expected:", n_prefix_matches)
    # for (i, candidate) in enumerate(out):
    #     candidate_text = candidate["generated_text"]
    for (i, (candidate_text, count)) in enumerate(out_counted.items()):
        # logger.info(f"Generated text: {candidate_text}")
        # answer = extract_answer(candidate_text)
        answer = candidate_text
        print("#"*20 + f" Candidate {i+1} (appears {count} times) " + "#"*20)
        extra = ""
        # not perfect because tokenizing with question_prompt may lead to different tokenization;
        # for the same reason the re-tokenized length may exceed the budget, so compare
        # with >= instead of == to still flag such candidates as truncated
        if len(tokenizer(answer)["input_ids"]) >= max_new_tokens:
            extra = " <MAX token length exceeded>"
        print(break_nicely(answer) + extra)
    # sys.stdout.flush()
        

1it [00:00, 440.86it/s]

################################################################################
Query: 
### Question: Points B, A, C are defined such that triangle ABC is an⏎
equilateral triangle. Define points D, F, & E such that E, D, and F is⏎
a right angle isosceles triangle with the right angle at D. Point G is⏎
defined such that G, A, E are three consecutive vertices of a square.⏎
Circle centered at C with radius CE intersects circle centered at G⏎
with radius GE at H and E. Points I and J are defined such that line⏎
IC and line JC are the two tangents to circle centered at B with⏎
radius BF at point I and J respectively.. Define point K such that⏎
line DI and line AG are parallel. line JK perpendicular to line DI.⏎
line EK perpendicular to line AG. line JK meets line EK at the point⏎
K. ### Answer:
Expected answer: 
A B C = ieq_triangle A B C; D E F = risos D E F; G = psquare G A E; H⏎
= intersection_cc H C G E; I J = tangent I J C B F; K =⏎
intersection_tt K J D I E A G
Number of candidates t




In [1]:
# Scratch check: load the stock "gpt2" tokenizer and display its end-of-sequence token.
# NOTE(review): this rebinds the global `tokenizer` that the generation cell obtains from
# load_model_for_inference, so cells executed afterwards use this tokenizer instead.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm


'<|endoftext|>'

In [None]:
# Show the first distinct candidate directly above the space-prefixed expected answer.
print(list(out_counted)[0], ' ' + gt_answer, sep="\n")

In [None]:
# NOTE(review): this comparison is neither assigned nor the last expression of the cell,
# so its result is not displayed and not kept.
list(out_counted.keys())[0] == (' ' + gt_answer)
def show_diff(s1, s2):
    """Print where the strings `s1` and `s2` disagree.

    Each differing position is printed as "<char of s1> <char of s2>", one pair per line.
    zip() stops at the end of the shorter string, so when the lengths differ the
    unmatched tail of the longer string is reported on one extra final line instead of
    being silently ignored.
    """
    for (c1, c2) in zip(s1, s2):
        if c1 != c2:
            print(c1, c2)
    if len(s1) != len(s2):
        overlap = min(len(s1), len(s2))
        if len(s1) > len(s2):
            longer_name, tail = "s1", s1[overlap:]
        else:
            longer_name, tail = "s2", s2[overlap:]
        print(f"length mismatch ({len(s1)} vs {len(s2)}): {longer_name} has extra {tail!r}")
# Compare the first distinct candidate against the space-prefixed expected answer.
s1 = list(out_counted)[0]
s2 = ' ' + gt_answer
show_diff(s1, s2)

# Cell result: the part of the candidate that extends beyond the expected answer's length.
s1[len(s2):]

In [None]:
# dataset[0]["text"]
# Prompt text hard-coded here; it matches the example query printed by the report cell.
question_prompt = "### Question: Points B, A, C are defined such that triangle ABC is an equilateral triangle. Define points D, F, & E such that E, D, and F is a right angle isosceles triangle with the right angle at D. Point G is defined such that G, A, E are three consecutive vertices of a square. Circle centered at C with radius CE intersects circle centered at G with radius GE at H and E. Points I and J are defined such that line IC and line JC are the two tangents to circle centered at B with radius BF at point I and J respectively.. Define point K such that line DI and line AG are parallel. line JK perpendicular to line DI. line EK perpendicular to line AG. line JK meets line EK at the point K. ### Answer:"
# Tokenization probe: the last 10 ids of "prompt + space" followed by the ids of "answer"
# tokenized on its own ...
print(tokenizer(question_prompt + " ")["input_ids"][-10:] + tokenizer("answer")["input_ids"])
# ... versus the last 10 ids when "prompt + ' answer'" is tokenized as a single string.
print(tokenizer(question_prompt + " answer")["input_ids"][-10:])

In [None]:
# Display the whole text column of the dataset prepared for prediction.
dataset[dataset_text_field]

In [None]:
# Token ids of the same text without and with a trailing space.
for probe_text in ("Hello answer", "Hello answer "):
    print(tokenizer(probe_text)["input_ids"])

In [None]:
# print(model.generate(**tokenizer("Hello answer", return_tensors="pt").to("cuda")))
# Greedy continuation (20 new tokens) of the same prompt without and with a trailing space.
for probe_prompt in ("How are", "How are "):
    encoded = tokenizer(probe_prompt, return_tensors="pt").to("cuda")
    generated = model.generate(**encoded, do_sample=False, max_new_tokens=20)
    print(tokenizer.decode(generated[0]))