<a href="https://colab.research.google.com/github/D-Sokol/denotarikon/blob/main/Sandbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers==4.1.1



In [2]:
import torch
import numpy as np
import string
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [3]:
# Run on the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# `add_prefix_space=True` is passed through to the tokenizer unchanged.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_prefix_space=True)

# Load the pretrained weights, switch to inference mode, then move to `device`.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()
model = model.to(device)

In [4]:
# Decoded text of every vocabulary entry; position in the list == token id.
all_tokens = list(map(tokenizer.decode, range(tokenizer.vocab_size)))

In [5]:
# mask[token_id, letter_ix] is True when the token may be emitted while words
# are required to start with the letter string.ascii_lowercase[letter_ix].
alphabet = string.ascii_lowercase
mask = torch.zeros(tokenizer.vocab_size, len(alphabet), dtype=bool)
for token_text, allowed in zip(all_tokens, mask):
    if not token_text.startswith(' '):
        # No leading space: the token does not open a new word. It is allowed
        # for every letter unless it is empty or starts with an ASCII capital.
        if token_text and token_text[0] not in string.ascii_uppercase:
            allowed.fill_(True)
        continue

    # Leading space: the token opens a new word, so only the column of the
    # word's first letter is enabled. A bare space stays forbidden everywhere.
    if len(token_text) == 1:
        continue
    first_letter = token_text[1].lower()
    if first_letter in alphabet:
        allowed[alphabet.index(first_letter)] = True

In [6]:
# Prompt that seeds the generation; sampled tokens are appended after it.
start_text = "The best possible example to demonstrate power of the project is"
# Token ids of the prompt, as produced by the tokenizer.
start_tokens = tokenizer.encode(start_text)
# Bare expression so the notebook displays the ids as the cell output.
start_tokens

[383, 1266, 1744, 1672, 284, 10176, 1176, 286, 262, 1628, 318]

In [7]:
# Parameter for nucleus (top-p) sampling: at each step the next token is drawn
# from the smallest set of most probable tokens whose cumulative probability
# exceeds this threshold.
p_threshold = 0.95
# Desired number of tokens in the result. The prompt tokens count towards this
# total: generation continues from len(start_tokens) up to n_tokens.
n_tokens = 111

In [8]:
# Letter that every newly started word must begin with (case-insensitive).
# 't' selects column 19 of `mask`, i.e. the previously hard-coded behaviour.
target_letter = 't'

# Work on a copy so the prompt token list is left untouched.
tokens = start_tokens[:]

# Tokens that are NOT allowed for the chosen letter. This is the same at every
# step, so it is computed once, and it is moved to `device` so that it lives
# next to the logits it indexes.
letter_ix = string.ascii_lowercase.index(target_letter.lower())
forbidden = (~mask[:, letter_ix]).to(device)

with torch.no_grad():
    # Feed the whole prompt once; afterwards only the newest token is passed
    # together with the cached attention keys/values (`past`).
    result = model(torch.tensor(tokens, device=device)[None], past_key_values=None)
    next_logits, past = result['logits'][0, -1, :], result['past_key_values']
    for _ in range(len(tokens), n_tokens):
        # Forbidden tokens get zero probability after the softmax.
        next_logits[forbidden] = -np.inf
        next_probas = torch.softmax(next_logits, dim=-1).cpu()

        sorted_p, sorted_ix = torch.sort(next_probas, descending=True)
        cumulative_p = torch.cumsum(sorted_p, dim=-1)

        # Number of possible choices for next token, calculated as minimal n
        #  such that sum of probabilities of the first n tokens exceeds p_threshold.
        # If the threshold is never exceeded (e.g. p_threshold >= 1), argmax of an
        #  all-False array would be 0 and silently turn sampling into greedy
        #  decoding, so fall back to the full distribution instead.
        exceeds = cumulative_p.numpy() > p_threshold
        if exceeds.any():
            n_tokens_next = int(np.argmax(exceeds)) + 1
        else:
            n_tokens_next = len(exceeds)

        # Renormalise the kept head of the distribution so it sums to one.
        sorted_p = sorted_p[:n_tokens_next]
        sorted_p /= cumulative_p[n_tokens_next-1]
        ix_ix = np.random.choice(n_tokens_next, p=sorted_p.numpy())
        next_ix = sorted_ix[ix_ix]
        tokens.append(next_ix.item())

        # `next_ix` was sampled from CPU tensors; the model may live on the GPU,
        # so the token has to be moved to `device` before it is fed back.
        result = model(next_ix[None].to(device), past_key_values=past)
        next_logits, past = result['logits'][0, :], result['past_key_values']

In [9]:
# Turn the token ids back into text and show it.
generated_text = tokenizer.decode(tokens)
print(generated_text)

 The best possible example to demonstrate power of the project is to talk to the top ten teachers. They tell their teachers:

"talk to, tell the top ten teachers." That's the top ten teachers. They've talked to the top ten teachers, they've talked to them. They're talking to them. They're telling them they're the top ten teachers.

you're the top ten teacher, tell them, the top ten teachers.

top ten, tell them, the top ten teachers.

top ten, tell
