In [2]:
import os
os.chdir("/work/pi_dhruveshpate_umass_edu/aamballa_umass_edu/SuperposedDecoding/")

In [3]:
%load_ext autoreload
%autoreload 2

import json
import os
import re
from datetime import datetime

import torch
from datasets import load_dataset
from tqdm import tqdm

from eval import *
from superposed.llama.metrics import *
from superposed.llama.generation import Llama
from superposed.llama.superposed_generation import SuperposedLlama
from superposed.llama.tokenizer import Tokenizer
from superposed.ngrams.ngram_models import make_models

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Setup

In [4]:
nq = load_dataset("nq_open")["validation"]

Downloading readme:   0%|          | 0.00/8.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87925 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3610 [00:00<?, ? examples/s]

In [5]:
# Params
param_file = "params/p15_d3_mixed.json"
with open(param_file, "r") as f:
    params = json.load(f)
    print(f"Parameters: {params}")
alpha = params["alpha"]
temp = params["temp"]
n_drafts = params["n_drafts"]
prompt_len = params["prompt_len"]
n_token_sample = params["n_token_sample"]
i_weights = params["i_weights"]
i_length = params["i_length"]

Parameters: {'alpha': 0.54, 'temp': 0.06, 'n_drafts': 3, 'prompt_len': 15, 'n_token_sample': 9, 'n_token_consider': 32000, 'mixing_method': 'sample_new_weights_with_score', 'smoothing': 'geom', 'sample_tokens': 0, 'sample_beams': 0, 'i_weights': [0.01, 0.04, 0.15, 0.18, 0.12], 'i_length': [1, 2, 3, 4, 5]}


# Create Models

In [7]:
ngrams = make_models("ngrams/ckpts-400k", bigram=True, trigram=True, fourgram=False, fivegram=False, sixgram=False, sevengram=False)

Making bigram...
1310808
Making trigram...
335544408


In [8]:
sup_device = torch.device("cuda:0")
reg_device = torch.device("cuda:1")

In [11]:
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "10302"

In [12]:
# load superposed
weight_path = "7B/llama-2-7b"
token_path = "7B/"
sup_model = SuperposedLlama.build(ckpt_dir=weight_path, 
                                 tokenizer_path=f'{token_path}/tokenizer.model', 
                                 max_seq_len=1000, 
                                 max_batch_size=16,
                                 device=sup_device,
                                 model_parallel_size=1)

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


  _C._set_default_tensor_type(t)


Loaded in 93.23 seconds
cuda:0


In [12]:
# load regular
reg_model = Llama.build(ckpt_dir=weight_path, 
                    tokenizer_path=f'{token_path}/tokenizer.model', 
                    max_seq_len=1000, 
                    max_batch_size=16,
                    device=reg_device, # reg_device,
                    model_parallel_size=1)

0
Loaded in 22.47 seconds


In [14]:
tokenizer = Tokenizer(f"{token_path}/tokenizer.model")

# Evaluation

In [15]:
model_types = ["greedy", "superposed", "regular"]
model_type = model_types[1]

In [16]:
def evaluate_nq(model_type, question, max_gen_len):
    question = "Answer these questions:\n\nQ: " + question + "?\nA:"
    text_len = len(question) # for truncating
    prompt_len = len(tokenizer.encode([question], True, False)[0]) # for model
    if model_type == "regular" or model_type == "greedy":
        if model_type == "regular":
            input = [question for _ in range(n_drafts)]
            print(input)
            sequences, _ = evaluate_nucleus_losses(data=input,
                                                   model=reg_model,
                                                   tokenizer=tokenizer,
                                                   prompt_len=prompt_len,
                                                   max_gen_len=max_gen_len,
                                                   temp=0.6,
                                                   bsz=8,
                                                   marker=False)
        else:
            sequences, _ = evaluate_nucleus_losses(data=[question],
                                       model=reg_model,
                                       tokenizer=tokenizer,
                                       prompt_len=prompt_len,
                                       max_gen_len=max_gen_len,
                                       temp=0,
                                       bsz=8,
                                       marker=False)
        n_pd, seq_len = sequences.shape
    elif model_type == "superposed":
        sequences, _, model_probs, ngram_probs = evaluate_mixed_losses(data=[question],
                                                   model=sup_model,
                                                   tokenizer=tokenizer,
                                                   prompt_len=prompt_len,
                                                   max_gen_len=max_gen_len,
                                                   alpha=alpha,
                                                   temp=temp,
                                                   n_drafts=n_drafts,
                                                   n_token_sample=n_token_sample,
                                                   smoothing="geom", # Use greedy
                                                   bsz=8,
                                                   i_weights=i_weights,
                                                   i_length=i_length,
                                                   ngrams=ngrams,
                                                   marker=False,
                                                   get_model_probs=True)
        n_p, n_d, seq_len = sequences.shape
    # Process results
    sequences = sequences.reshape(-1, seq_len).tolist()
    for d_idx in range(len(sequences)):
        draft = sequences[d_idx]
        if -1 in draft:
            draft = draft[:draft.index(-1)]
        sequences[d_idx] = draft
    decoded_seq = tokenizer.decode(sequences)
    answers = []
    for s in decoded_seq:
        answers.append(re.split("[,.\n]", s[text_len:].strip())[0])
    return answers, model_probs, ngram_probs
            

In [17]:
# Run evaluation
predictions = []
print(f"Precision from 1 to {n_drafts}")
for sample in tqdm(nq):
    # Adaptively determine max generation length
    longest = 0
    shortest = 1000
    for answer in sample["answer"]:
        tmp = tokenizer.encode([answer], False, False)[0]
        if len(tmp) > longest:
            longest = len(tmp)
        if len(tmp) < shortest:
            shortest = len(tmp)
    question = sample["question"]
    answer = evaluate_nq(model_type, question, max_gen_len=shortest+3)
    predictions.append({"question": question, "answer": answer})

Precision from 1 to 3


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3610/3610 [15:22<00:00,  3.91it/s]


In [18]:
# Separate results into precisions
precisions = {}
for i in range(1, n_drafts+1):
    prec = str(i)
    responses = []
    for result in predictions:
        responses.append({"question": result["question"], "answer": result["answer"][:i]})
    precisions[prec] = responses

In [19]:
# Print some results
counter = 0
for k in predictions:
    if counter >= 10:
        break
    print(k)
    counter += 1
    print("================")

{'question': 'when was the last time anyone was on the moon', 'answer': (['2019', '2019-', '2019'], tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]]), tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         ...,

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..

# Saving

In [54]:
# Save results
os.makedirs("../../nq/", exist_ok=True)
print(precisions.keys())
for prec in range(1, n_drafts+1):
    out_path = f"../nq/eval_{model_type}_{prec}_test.jsonl"
    with open(out_path, "w") as f:
        for obj in precisions[str(prec)]:    
            f.write(json.dumps(obj) + "\n")

dict_keys(['1', '2', '3', '4', '5'])
