Step 1. Load a tokenizer and a model

In [27]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# To load a pretrained model and a tokenizer using HuggingFace, we only need two lines of code!
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# We create a partial sentence and tokenize it.
text = "Diya is Beautiful and"
inputs = tokenizer(text, return_tensors="pt")

# Show the tokens as numbers, i.e. "input_ids"
inputs["input_ids"]

tensor([[18683,  3972,   318, 23762,   290]])

Step 2. Examine the tokenization

In [28]:
# Show how the sentence is tokenized
import pandas as pd


def show_tokenization(inputs):
    return pd.DataFrame(
        [(id, tokenizer.decode(id)) for id in inputs["input_ids"][0]],
        columns=["id", "token"],
    )


show_tokenization(inputs)

Unnamed: 0,id,token
0,tensor(18683),Di
1,tensor(3972),ya
2,tensor(318),is
3,tensor(23762),Beautiful
4,tensor(290),and


Step 2. Calculate the probability of the next token

In [29]:
import torch

with torch.no_grad():
    logits = model(**inputs).logits[:, -1, :]
    probabilities = torch.nn.functional.softmax(logits[0], dim=-1)


def show_next_token_choices(probabilities, top_n=5):
    return pd.DataFrame(
        [
            (id, tokenizer.decode(id), p.item())
            for id, p in enumerate(probabilities)
            if p.item()
        ],
        columns=["id", "token", "p"],
    ).sort_values("p", ascending=False)[:top_n]


show_next_token_choices(probabilities)

Unnamed: 0,id,token,p
23762,23762,Beautiful,0.146257
44383,44383,Lovely,0.071541
15335,15335,Sweet,0.039768
314,314,I,0.034655
921,921,You,0.027486


In [30]:
next_token_id = torch.argmax(probabilities).item()

print(f"Next token id: {next_token_id}")
print(f"Next token: {tokenizer.decode(next_token_id)}")

Next token id: 23762
Next token:  Beautiful


Step 3. Generate some more tokens

In [31]:
from IPython.display import Markdown, display

# Show the text
print(text)

# Convert to tokens
inputs = tokenizer(text, return_tensors="pt")

# Calculate the probabilities for the next token and show the top 5 choices
with torch.no_grad():
    logits = model(**inputs).logits[:, -1, :]
    probabilities = torch.nn.functional.softmax(logits[0], dim=-1)

display(Markdown("**Next token probabilities:**"))
display(show_next_token_choices(probabilities))

# Choose the most likely token id and add it to the text
next_token_id = torch.argmax(probabilities).item()
text = text + tokenizer.decode(next_token_id)

Diya is Beautiful and


**Next token probabilities:**

Unnamed: 0,id,token,p
23762,23762,Beautiful,0.146257
44383,44383,Lovely,0.071541
15335,15335,Sweet,0.039768
314,314,I,0.034655
921,921,You,0.027486


Step 4. Use the generate method

In [39]:
from IPython.display import Markdown, display

# Start with some text and tokenize it
text = "Genrate a text about the future of AI"
inputs = tokenizer(text, return_tensors="pt")

# Use the `generate` method to generate lots of text
output = model.generate(**inputs, max_length=100, pad_token_id=tokenizer.eos_token_id)

# Show the generated text
display(Markdown(tokenizer.decode(output[0])))

Genrate a text about the future of AI and the future of science.

The Future of Science

The future of science is a complex topic. The most important question is: what is the future of science?

The answer is: it depends on what you mean by "future".

The future of science is a complex topic. The most important question is: what is the future of science?

The answer is: it depends on what you mean by "future