In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Run on the MPS backend when PyTorch reports it as available; otherwise use the CPU.
if torch.backends.mps.is_available():
    torch_device = "mps"
else:
    torch_device = "cpu"

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Hand the tokenizer's EOS id to the model as its PAD id to avoid warnings.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,
)
model = model.to(torch_device)


In [2]:
# Use the tokenizer's EOS token as its padding token as well.
tokenizer.pad_token = tokenizer.eos_token

In [10]:
# Display the vocabulary size reported by the tokenizer.
tokenizer.vocab_size

50257

In [3]:
# Tokenize the prompt into PyTorch tensors and move them to the selected device.
model_inputs = tokenizer(
    'I enjoy walking with my cute dog',
    return_tensors='pt',
).to(torch_device)

# Reference generation from the library's own `generate` (no sampling arguments
# are passed); per-step scores are requested alongside the sequences.
greedy_output = model.generate(
    **model_inputs,
    max_new_tokens=40,
    output_scores=True,
    return_dict_in_generate=True,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [4]:
from tqdm.notebook import tqdm

In [8]:
import numpy as np

In [9]:
# Exploratory lookup: print the documentation of np.random.choice.
help(np.random.choice)

Help on built-in function choice:

choice(...) method of numpy.random.mtrand.RandomState instance
    choice(a, size=None, replace=True, p=None)

    Generates a random sample from a given 1-D array

    .. versionadded:: 1.7.0

    .. note::
        New code should use the `~numpy.random.Generator.choice`
        method of a `~numpy.random.Generator` instance instead;
        please see the :ref:`random-quick-start`.

    Parameters
    ----------
    a : 1-D array-like or int
        If an ndarray, a random sample is generated from its elements.
        If an int, the random sample is generated as if it were ``np.arange(a)``
    size : int or tuple of ints, optional
        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
        ``m * n * k`` samples are drawn.  Default is None, in which case a
        single value is returned.
    replace : boolean, optional
        Whether the sample is with or without replacement. Default is True,
        meaning that a value of ``

In [170]:
def greedy_decode(generate_func, input_ids, max_new_tokens, pad_token_id=50256) -> list:
    """Extend ``input_ids`` one token at a time, always taking the arg-max token.

    Args:
        generate_func: Callable that receives a single dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'pad_token_id'`` and
            returns the scores for the next token. The result is arg-maxed
            over dim 1, so it is presumably shaped ``(batch, vocab)`` --
            confirm against the caller.
        input_ids: 2-D tensor of token ids; new tokens are appended along dim 1.
        max_new_tokens: Number of decoding steps to run.
        pad_token_id: Value forwarded to ``generate_func`` under
            ``'pad_token_id'``. Defaults to 50256, the value this function
            previously hard-coded.

    Returns:
        The list produced by the notebook-level ``tokenizer.batch_decode`` for
        the extended ``input_ids`` (one decoded entry per row).
    """
    for _ in tqdm(range(max_new_tokens)):
        logits = generate_func(
            {
                'input_ids': input_ids,
                # ones_like already allocates on input_ids' device, so no
                # extra device transfer is needed.
                'attention_mask': torch.ones_like(input_ids),
                'pad_token_id': pad_token_id,
            }
        )
        next_token = logits.argmax(dim=1)
        # Turn the (batch,) winners into a (batch, 1) column and append it.
        input_ids = torch.cat([input_ids, next_token.unsqueeze(dim=1)], dim=1)
    return tokenizer.batch_decode(input_ids)


def sample(probs, top_p):
    """Draw one index from ``probs`` after nucleus (top-p) filtering.

    Entries are ranked by probability and the shortest high-probability prefix
    whose cumulative mass reaches ``top_p`` is kept; the entry that crosses the
    threshold is part of the kept set. All other entries are zeroed, the kept
    mass is renormalised, and one index is drawn with ``np.random.choice``.

    Args:
        probs: 1-D torch tensor of probabilities. It is left unmodified.
        top_p: Cumulative-probability threshold. The most probable entry is
            always kept, whatever the threshold.

    Returns:
        The sampled index into ``probs`` (as returned by ``np.random.choice``).
    """
    sorted_probs, sorted_indices = probs.sort(descending=True)
    cumsum = torch.cumsum(sorted_probs, dim=0)

    # A ranked entry is dropped only if the entries *before* it already reach
    # top_p. Shifting the threshold test one position to the right keeps the
    # entry that crosses the threshold and never drops the top-ranked entry.
    reached = cumsum >= top_p
    remove = torch.zeros_like(reached)
    remove[1:] = reached[:-1]

    # Zero the dropped entries on a copy so the caller's tensor is untouched.
    filtered = probs.detach().clone()
    filtered[sorted_indices[remove]] = 0

    # Renormalise in float64 on the CPU before handing the weights to numpy.
    weights = filtered.cpu().numpy().astype(np.float64)
    weights = weights / weights.sum()
    return np.random.choice(len(weights), p=weights)
    
    
def top_p_sampling(generate_func, input_ids, max_new_tokens, top_p, pad_token_id=50256) -> list:
    """Extend ``input_ids`` one token at a time using top-p sampling.

    At every step the scores returned by ``generate_func`` are soft-maxed over
    dim 1 and the resulting distribution is handed to ``sample`` together with
    ``top_p``; the drawn token id is appended to ``input_ids``.

    Args:
        generate_func: Callable that receives a single dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'pad_token_id'`` and
            returns the scores for the next token, presumably shaped
            ``(1, vocab)`` -- confirm against the caller.
        input_ids: 2-D tensor of token ids with a single row; exactly one
            sampled token is appended per step.
        max_new_tokens: Number of sampling steps to run.
        top_p: Threshold forwarded unchanged to ``sample``.
        pad_token_id: Value forwarded to ``generate_func`` under
            ``'pad_token_id'``. Defaults to 50256, the value this function
            previously hard-coded.

    Returns:
        The list produced by the notebook-level ``tokenizer.batch_decode`` for
        the extended ``input_ids``.
    """
    for _ in tqdm(range(max_new_tokens)):
        scores = generate_func(
            {
                'input_ids': input_ids,
                # ones_like already allocates on input_ids' device, so no
                # extra device transfer is needed.
                'attention_mask': torch.ones_like(input_ids),
                'pad_token_id': pad_token_id,
            }
        )
        # Drop only the leading batch dimension; `sample` works on a 1-D tensor.
        probs = torch.softmax(scores, dim=1).squeeze(0)
        next_token = sample(probs, top_p)
        # Build the new (1, 1) column directly on input_ids' device and dtype
        # so the concatenation does not depend on a global device setting.
        new_column = torch.tensor(
            [[next_token]], dtype=input_ids.dtype, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, new_column], dim=1)
    return tokenizer.batch_decode(input_ids)

In [6]:
# Run the hand-written greedy loop for 40 steps. The lambda asks
# `model.generate` for exactly one new token per call and returns the scores
# of that single step (`.scores[0]`).
greedy_decode(
    lambda model_inputs: model.generate(
        **model_inputs, 
        output_scores=True, 
        return_dict_in_generate=True, 
        max_new_tokens=1
    ).scores[0],
    input_ids=model_inputs.input_ids,
    max_new_tokens=40
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/40 [00:00<?, ?it/s]

["I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my dog.\n\nI'm not sure"]

In [7]:
# Decode the sequences from the earlier `model.generate` call, for comparison
# with the output of the hand-written greedy loop.
tokenizer.batch_decode(greedy_output.sequences)

["I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my dog.\n\nI'm not sure"]

In [172]:
# Run the hand-written top-p loop for 40 steps with top_p=0.92. The lambda
# asks `model.generate` for exactly one new token per call and returns the
# scores of that single step (`.scores[0]`).
top_p_sampling(
    lambda model_inputs: model.generate(
        **model_inputs, 
        output_scores=True, 
        return_dict_in_generate=True, 
        max_new_tokens=1
    ).scores[0],
    input_ids=model_inputs.input_ids,
    max_new_tokens=40,
    top_p=0.92
)

  0%|          | 0/40 [00:00<?, ?it/s]

['I enjoy walking with my cute dog," said Hicks. "The crowds are huge.\n\n"When I see him, the sky is black with people everywhere. I go back and forth between the security guards and the man there."']

In [169]:
# Smoke-test `sample` on a tiny three-entry distribution with top_p=0.95.
sample(torch.tensor([0.1, 0.7, 0.2]), 0.95)

tensor([0.7000, 0.2000, 0.1000])
tensor([1, 2, 0])
tensor([0.7000, 0.9000, 1.0000])
tensor([False, False,  True])
tensor([0.2000, 0.0000, 0.7000])


2

In [None]:
# Scratch walk-through of the top-p masking steps on a three-entry example.
# NOTE(review): the `probs = sorted_probs[sorted_indices]` step listed here is
# a gather, not a scatter -- for this example it evaluates to [0.2, 0.0, 0.7],
# not the [0.0, 0.7, 0.2] written on the following line. The assignment
# `probs[sorted_indices] = sorted_probs` is the form that restores the
# original order.
'''
    top_p = 0.95
    probs = [[0.1, 0.7, 0.2]]
    sorted_probs = [[0.7, 0.2, 0.1]]
    sorted_indices = [[1, 2, 0]]
    cumsum = [[0.7, 0.9, 1.0]]
    mask = cumsum > top_p
    mask = [[False, False, True]]
    sorted_probs[mask] = 0
    sorted_probs = [[0.7, 0.2, 0]]
    probs = sorted_probs[sorted_indices]
    probs = [[0.0, 0.7, 0.2]]
    '''