In [210]:
import os
import sys
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer
from math import ceil

In [2]:
base_model = "decapoda-research/llama-7b-hf"
llama_tokenizer = LlamaTokenizer.from_pretrained(base_model, padding_side="left")
llama = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
# alpaca = PeftModel.from_pretrained(
#     llama,
#     "tloen/alpaca-lora-7b",
#     torch_dtype=torch.float16,
# )
# unwind broken decapoda-research config
llama.config.pad_token_id = llama_tokenizer.pad_token_id = 0  # unk
llama.config.bos_token_id = 1
llama.config.eos_token_id = 2

llama.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    llama = torch.compile(llama)

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [3]:
opt = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-6.7b",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
opt_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b", padding_side="left")

opt.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    opt = torch.compile(opt)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
prompts = ["Tell me about alpacas", "I really like cheese. How do I make a sandwich?"]
control = "Tell me about alpacas"

In [156]:
def masked_bos(a: torch.Tensor)->torch.Tensor:
    a[:, -1] = 0
    return torch.roll(a, shifts=1, dims=-1)

In [174]:
llama_tokens = llama_tokenizer(prompts, padding=True, return_tensors="pt").to('cuda')
llama_tokens

{'input_ids': tensor([[    0,     0,     0,     0,     0,     0,     0,     0, 24948,   592,
          1048,   394, 29886,   562,   294],
        [    0,   306,  2289,   763,   923,   968, 29889,  1128,   437,   306,
          1207,   263, 11982, 16416, 29973]], device='cuda:1'), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')}

In [6]:
llama_tokenizer.batch_decode(llama_tokens.input_ids)

['<unk><unk><unk><unk><unk><unk><unk><unk>Tell me about alpacas',
 '<unk>I really like cheese. How do I make a sandwich?']

In [8]:
llama_tokenizer.batch_decode(llama_control.input_ids)

['<unk>Tell me about alpacas']

In [175]:
with torch.no_grad():
    outputs = llama.generate(
        **llama_control,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(llama_tokenizer.batch_decode(outputs.sequences))

['<unk>Tell me about alpacas:222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222']


In [176]:
new_mask = torch.clone(llama_control.attention_mask)
new_mask[0,0] = 0
with torch.no_grad():
    outputs = llama.generate(
        input_ids=llama_control.input_ids,
        attention_mask=masked_bos(llama_control.attention_mask),
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(llama_tokenizer.batch_decode(outputs.sequences))

['<unk>Tell me about alpacas.\nA: Alpacas are members of the camelid family, which includes llamas, camels, and vicuñas. They are native to the Andes Mountains of South America, where they have lived for thousands of years. Alpacas are smaller than llamas, and they have a much finer, lighter fleece. Alpacas are raised for their fiber, which is used to make clothing, blankets, and other text']


In [177]:
with torch.no_grad():
    outputs = llama.generate(
        input_ids=llama_control.input_ids,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(llama_tokenizer.batch_decode(outputs.sequences))

['<unk>Tell me about alpacas.\nA: Alpacas are members of the camelid family, which includes llamas, camels, and vicuñas. They are native to the Andes Mountains of South America, where they have lived for thousands of years. Alpacas are smaller than llamas, and they have a much finer, lighter fleece. Alpacas are raised for their fiber, which is used to make clothing, blankets, and other text']


In [12]:
with torch.no_grad():
    outputs = llama.generate(
        **llama_tokens,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(llama_tokenizer.batch_decode(outputs.sequences))

['<unk><unk><unk><unk><unk><unk><unk><unk>Tell me about alpacas.\nTell me about alpacas.\nAlpacas are members of the camelid family, which includes llamas, vicunas, guanacos, and camels. Alpacas are native to the Andes Mountains of Peru, Chile, and Bolivia. They are smaller than llamas, and have a more delicate appearance. Alpacas are raised for their fiber, which is used to make clothing, blankets, and', "<unk>I really like cheese. How do I make a sandwich?\nI'm going to make a sandwich.\nI'm going to make a sandwich. I'm going to make a sandwich. I'm going to make a sandwich. I'm going to make a sandwich. I'm going to make a sandwich. I'm going to make a sandwich. I'm going to make a sandwich. I'm going to make a sandwich. I'm going to make a sand"]


In [13]:
with torch.no_grad():
    outputs = llama.generate(
        input_ids=llama_tokens.input_ids,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(llama_tokenizer.batch_decode(outputs.sequences))

['<unk><unk><unk><unk><unk><unk><unk><unk>Tell me about alpacas.\nI’m a big fan of alpacas. They’re very gentle, they’re very sweet, they’re very intelligent, and they’re very social. They’re very similar to llamas, but they’re a little bit smaller. They’re very similar to sheep, but they’re a little bit smaller. They’re very similar to goats, but they’re a little bit smaller. They’re very similar to c', '<unk>I really like cheese. How do I make a sandwich?\nI\'m not sure what you mean by "make a sandwich". You can make a sandwich with any kind of bread, cheese, and meat.\nWhat is the best cheese to use for a sandwich?\nI like cheddar cheese.\nWhat is the best cheese to use for a sandwich? I like cheddar cheese.\nWhat is the best cheese to use for a sandwich?\nI like cheddar che']


In [14]:
opt_tokens = opt_tokenizer(prompts, padding=True, return_tensors="pt").to('cuda')
opt_tokens

{'input_ids': tensor([[    1,     1,     1,     1,     1,     1,     2, 35438,   162,    59,
          1076, 21091,   281],
        [    2,   100,   269,   101,  7134,     4,  1336,   109,    38,   146,
            10, 15649,   116]], device='cuda:1'), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')}

In [15]:
opt_tokenizer.batch_decode(opt_tokens.input_ids)

['<pad><pad><pad><pad><pad><pad></s>Tell me about alpacas',
 '</s>I really like cheese. How do I make a sandwich?']

In [16]:
opt_control = opt_tokenizer(control, padding=True, return_tensors="pt").to('cuda')
opt_control

{'input_ids': tensor([[    2, 35438,   162,    59,  1076, 21091,   281]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:1')}

In [17]:
opt_tokenizer.batch_decode(opt_control.input_ids)

['</s>Tell me about alpacas']

In [18]:
with torch.no_grad():
    outputs = opt.generate(
        **opt_control,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(opt_tokenizer.batch_decode(outputs.sequences))

["</s>Tell me about alpacas.\nThey're like llamas, but with more hair.\nAnd they're more cuddly.</s>"]


In [19]:
with torch.no_grad():
    outputs = opt.generate(
        input_ids=opt_control.input_ids,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(opt_tokenizer.batch_decode(outputs.sequences))

["</s>Tell me about alpacas.\nThey're like llamas, but with more hair.\nAnd they're more cuddly.</s>"]


In [20]:
new_mask = torch.clone(opt_control.attention_mask)
new_mask[0,0] = 0
with torch.no_grad():
    outputs = opt.generate(
        input_ids=opt_control.input_ids,
        attention_mask=new_mask,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(opt_tokenizer.batch_decode(outputs.sequences))

["</s>Tell me about alpacas.\nThey're like llamas, but they're smaller.\nThey're very gentle.\nThey're very smart.\nThey're very funny.\nThey're very, very funny.\nThey're very, very smart.\nThey're very, very gentle.\nThey're very, very funny.\nThey're very, very smart.\nThey're very, very gentle.\nThey're very, very funny.\nThey're very, very smart.\nThey're very,"]


In [21]:
with torch.no_grad():
    outputs = opt.generate(
        **opt_tokens,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(opt_tokenizer.batch_decode(outputs.sequences))

["<pad><pad><pad><pad><pad><pad></s>Tell me about alpacas.\nThey're like llamas but with more hair.\nAnd they're more cuddly.</s>", '</s>I really like cheese. How do I make a sandwich?\nI like cheese too. I like cheese on my sandwich.</s><pad><pad><pad><pad><pad><pad><pad><pad>']


In [22]:
with torch.no_grad():
    outputs = opt.generate(
        input_ids=opt_tokens.input_ids,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(opt_tokenizer.batch_decode(outputs.sequences))

["<pad><pad><pad><pad><pad><pad></s>Tell me about alpacas.\nThey're like llamas but with more hair.\nAnd they're more cuddly.</s>", '</s>I really like cheese. How do I make a sandwich?\nI like cheese too. I like cheese on my sandwich.</s><pad><pad><pad><pad><pad><pad><pad><pad>']


In [23]:
llama.get_memory_footprint() / ((2**10)**3)

6.582530975341797

In [24]:
opt.get_memory_footprint() / ((2**10)**3)

6.402374267578125

In [25]:
alpaca = PeftModel.from_pretrained(
    llama,
    "tloen/alpaca-lora-7b",
    torch_dtype=torch.float16,
)

In [161]:
llama_tokens.attention_mask

tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]], device='cuda:1')

In [162]:
masked_bos(llama_tokens.attention_mask)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')

In [163]:
with torch.no_grad():
    outputs = alpaca.generate(
        input_ids=llama_tokens.input_ids,
        attention_mask=masked_bos(llama_tokens.attention_mask),
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(llama_tokenizer.batch_decode(outputs.sequences))

['<unk><unk><unk><unk><unk><unk><unk><unk>Tell me about alpacas.\nA: Alpacas are members of the camelid family, which includes llamas, camels, and vicuñas. They are native to the Andes Mountains of South America, where they have lived for thousands of years. Alpacas are smaller than llamas, and they have a much finer, softer fleece. Alpacas are raised for their fiber, which is used to make clothing, blankets, and other text', '<unk>I really like cheese. How do I make a sandwich?\nI like cheese. I like bread. I like ham. I like lettuce. I like tomatoes. I like mustard. I like mayonnaise. I like pickles. I like ketchup. I like peppers. I like onions. I like olives. I like jalapenos. I like cucumbers. I like bell peppers. I like mushrooms. I like avocados. I like bananas. I']


In [164]:
with torch.no_grad():
    outputs = alpaca.generate(
        input_ids=llama_tokens.input_ids,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(llama_tokenizer.batch_decode(outputs.sequences))

['<unk><unk><unk><unk><unk><unk><unk><unk>Tell me about alpacas.\nA: Alpacas are members of the camelid family, which includes llamas, camels, and vicuñas. They are native to the Andes Mountains of South America, where they have lived for thousands of years. Alpacas are smaller than llamas, and they have a much finer, softer fleece. Alpacas are raised for their fiber, which is used to make clothing, blankets, and other text', '<unk>I really like cheese. How do I make a sandwich?\nI like cheese. I like bread. I like ham. I like lettuce. I like tomatoes. I like mustard. I like mayonnaise. I like pickles. I like ketchup. I like peppers. I like onions. I like olives. I like jalapenos. I like cucumbers. I like bell peppers. I like mushrooms. I like avocados. I like bananas. I']


In [181]:
boolq =  "Ecuador at the FIFA World Cup -- The Ecuadorian national football team has appeared at three FIFA World Cups, the world's premier football tournament for national football teams. Ecuador's first participation in the World Cup was in 2002. Their best performance was in 2006, where they were eliminated in the Round of 16.\nquestion: has ecuador ever been in the world cup\nanswer: yes\n\nEcuador at the FIFA World Cup -- The Ecuadorian national football team has appeared at three FIFA World Cups, the world's premier football tournament for national football teams. Ecuador's first participation in the World Cup was in 2002. Their best performance was in 2006, where they were eliminated in the Round of 16.\nquestion: has ecuador ever been in the world cup\nanswer: yes\n\nEnterprise value -- Enterprise value (EV), total enterprise value (TEV), or firm value (FV) is an economic measure reflecting the market value of a business. It is a sum of claims by all claimants: creditors (secured and unsecured) and shareholders (preferred and common). Enterprise value is one of the fundamental metrics used in business valuation, financial modeling, accounting, portfolio analysis, and risk analysis.\nquestion: is enterprise value the same as firm value\nanswer"
boolq_tokens = llama_tokenizer(boolq, padding=True, return_tensors='pt').to('cuda')

In [183]:
with torch.no_grad():
    outputs = llama.generate(
        **boolq_tokens,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
print(llama_tokenizer.batch_decode(outputs.sequences))

["<unk>Ecuador at the FIFA World Cup -- The Ecuadorian national football team has appeared at three FIFA World Cups, the world's premier football tournament for national football teams. Ecuador's first participation in the World Cup was in 2002. Their best performance was in 2006, where they were eliminated in the Round of 16.\nquestion: has ecuador ever been in the world cup\nanswer: yes\n\nEcuador at the FIFA World Cup -- The Ecuadorian national football team has appeared at three FIFA World Cups, the world's premier football tournament for national football teams. Ecuador's first participation in the World Cup was in 2002. Their best performance was in 2006, where they were eliminated in the Round of 16.\nquestion: has ecuador ever been in the world cup\nanswer: yes\n\nEnterprise value -- Enterprise value (EV), total enterprise value (TEV), or firm value (FV) is an economic measure reflecting the market value of a business. It is a sum of claims by all claimants: creditors (secured 

In [200]:
gen_len()

NameError: name 'gen_len' is not defined

In [204]:
outputs.scores[0].squeeze().log_softmax(-1)[outputs.sequences[:, -100+1].item()].item()

-13.078125

In [189]:
print(llama_tokenizer.batch_decode(outputs.sequences[:, -100+1]))

['no']


In [191]:
outputs.sequences[:, -100+1]

tensor([694], device='cuda:1')

In [206]:
def batch_inference(model, tokenizer, prompts, batch_size, mask_bos):
    output_tokens = torch.empty(0, dtype=torch.int64).to('cuda:0')

    num_batches = ceil(len(prompts) / batch_size)

    for batch in range(num_batches):
        start = batch * batch_size
        end = min((batch + 1) * batch_size, len(prompts))

    gen_len=5
    # tokenize by batch to mitigate effect of long outliers
    tokens = tokenizer(prompts[start:end], padding=True, return_tensors="pt").to('cuda:0')
    attention_mask = masked_bos(tokens.attention_mask) if mask_bos else tokens.attention_mask
    with torch.no_grad():
        outputs = model.generate(
            input_ids=tokens.input_ids,
            attention_mask=attention_mask,
            max_new_tokens=gen_len,
            temperature=0,
            return_dict_in_generate=True,
            output_scores=True,
        )
    output_tokens = torch.cat((output_tokens, outputs.sequences[:, -gen_len:]))

    output_text = tokenizer.batch_decode(output_tokens)
    return output_text

In [211]:
batch_inference(llama, llama_tokenizer, [boolq]*2, 2, False)

[': no\n\nEnter', ': no\n\nEnter']