In [1]:
# Generating one token at a time
# Step 1: Load a tokenizer and a model

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load pretrained model and tokenizer using Huggingface
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model = AutoModelForCausalLM.from_pretrained("gpt2")

# Feed a a partial sentence to the tokenizer and tokenize it
text = "I am the captain of my"
# Tokenize the text input and map tokens to token IDs from the model vocabulary and return the tensors in PyTourch (PT) format
inputs = tokenizer(text, return_tensors="pt")

# Retrieve the token IDs from the tokenized data dictionary
inputs["input_ids"]

# convert the input text to tokens (not token IDs but the tokens)
tokens = tokenizer.tokenize(text)
print(tokens) # Output: ['I', 'Ġam', 'Ġthe', 'Ġcaptain', 'Ġof', 'Ġmy'], the Ġ indicate the presense of white space
print(inputs)

embeddings = model.transformer.wte # embedding layer
dense_vector = embeddings(inputs["input_ids"]) # passing the token IDs to the embedding layer to convert each token to it respective dense vector that is 768 values that capture the
                                               # learned values about the token including context, syntax, semantic meaning, etc
ste = dense_vector[0,3] # fetching only the dense vector for the 4th token, 'Ġcaptain' in this case, the tensor shape would be [768] meaning 1 raw = 1 token with 768 values
# print(f"dense vector: {dense_vector[0]}")
# print(dense_vector.shape)
# print(dense_vector)

print(f"Single token embedding: {ste}")
print(f"ste shape: {ste.shape}")

  from .autonotebook import tqdm as notebook_tqdm


['I', 'Ġam', 'Ġthe', 'Ġcaptain', 'Ġof', 'Ġmy']
{'input_ids': tensor([[   40,   716,   262, 10654,   286,   616]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
Single token embedding: tensor([ 4.3597e-02, -1.3362e-01,  1.7140e-01,  1.1383e-02, -7.4586e-02,
        -5.1723e-02, -2.9620e-01, -5.3332e-02,  4.0742e-02, -1.2568e-01,
         1.3112e-01,  1.3011e-01,  2.3719e-02,  2.2207e-02,  1.0508e-01,
         3.2130e-02,  7.2995e-02, -1.0549e-02,  5.5878e-02,  1.2675e-01,
        -9.5042e-02, -1.6710e-01,  9.8741e-03,  5.8797e-02,  3.3012e-02,
         1.1869e-02,  8.5770e-02,  7.8617e-02,  5.4794e-02, -1.5835e-01,
        -2.0698e-02,  1.0220e-01, -1.8902e-02,  1.8424e-01, -7.7152e-02,
        -2.9987e-02, -3.1549e-01,  1.7471e-02,  5.3715e-02, -2.1831e-02,
        -4.0109e-02,  1.0515e-01,  2.6298e-02,  2.5054e-03,  1.6049e-01,
        -1.1536e-01,  1.0039e-01, -2.1206e-01,  1.7320e-01, -1.6942e-01,
         9.9444e-02, -1.7858e-01,  6.2710e-02, -2.1736e-02, -6.2229e-02,
        -1

In [2]:
# Examine the tokenizer by using pandas for visualization:

import pandas as pd

def view_tokenization(inputs):
    return pd.DataFrame(
        [(id, tokenizer.decode(id)) for id in inputs["input_ids"][0]], columns=["id", "token"],
    )
view_tokenization(inputs)

Unnamed: 0,id,token
0,tensor(40),I
1,tensor(716),am
2,tensor(262),the
3,tensor(10654),captain
4,tensor(286),of
5,tensor(616),my


In [3]:
# Step 3 Calculate the probability of the next token
# Calculate the probabilities for the next token for all possible choices. We show the
# top 5 choices and the corresponding words or subwords for these tokens.

with torch.no_grad():
    logits = model(**inputs).logits[:, -1, :]
    probabilities = torch.nn.functional.softmax(logits[0], dim=-1)
    print(logits)
def show_next_token_choices(probabilities, top_n=10):
    return pd.DataFrame(
        [
            (id, tokenizer.decode(id), p.item())
            for id, p in enumerate(probabilities)
            if p.item()
        ],
        columns=["id", "token", "p"],
    ).sort_values("p", ascending=False)[:top_n]


show_next_token_choices(probabilities)

tensor([[-102.6145, -101.7570, -105.0443,  ..., -104.0649, -107.5337,
         -103.4872]])


Unnamed: 0,id,token,p
898,898,own,0.205202
1074,1074,team,0.135876
4074,4074,ship,0.066465
5462,5462,crew,0.038823
1641,1641,family,0.025252
1499,1499,country,0.019634
8244,8244,squad,0.017403
2151,2151,party,0.015524
1664,1664,company,0.01232
40733,40733,squadron,0.009413


In [4]:
# Obtain the token id for the most probable next token
next_token_id = torch.argmax(probabilities).item()

print(f"Next token id: {next_token_id}")
print(f"Next token: {tokenizer.decode(next_token_id)}")

Next token id: 898
Next token:  own


In [7]:
# We append the most likely token to the text.
text = text + tokenizer.decode(898)
text

'I am the captain of my own own own'

In [8]:
# Press ctrl + enter to run this cell again and again to see how the text is generated.

from IPython.display import Markdown, display

# Show the text
print(text)

# Convert to tokens
inputs = tokenizer(text, return_tensors="pt")

# Calculate the probabilities for the next token and show the top 5 choices
with torch.no_grad():
    logits = model(**inputs).logits[:, -1, :]
    probabilities = torch.nn.functional.softmax(logits[0], dim=-1)

display(Markdown("**Next token probabilities:**"))
display(show_next_token_choices(probabilities))

# Choose the most likely token id and add it to the text
next_token_id = torch.argmax(probabilities).item()
text = text + tokenizer.decode(next_token_id)
text

I am the captain of my own own own


**Next token probabilities:**

Unnamed: 0,id,token,p
4074,4074,ship,0.122465
1074,1074,team,0.061103
1664,1664,company,0.030685
5462,5462,crew,0.026207
2151,2151,party,0.02076
1641,1641,family,0.018129
3430,3430,club,0.015509
1499,1499,country,0.01232
11,11,",",0.012021
14893,14893,tribe,0.011046


'I am the captain of my own own own ship'

In [9]:
from IPython.display import Markdown, display

# Start with some text and tokenize it
text = "I am a software enginner who uses AI to"
inputs = tokenizer(text, return_tensors="pt")

# Use the `generate` method to generate lots of text
output = model.generate(**inputs, max_length=100, pad_token_id=tokenizer.eos_token_id)

# Show the generated text
display(Markdown(tokenizer.decode(output[0])))

I am a software enginner who uses AI to help me make better decisions. I am also a software developer who uses AI to help me make better decisions.

I am a software enginner who uses AI to help me make better decisions. I am also a software developer who uses AI to help me make better decisions.

I am a software enginner who uses AI to help me make better decisions. I am also a software developer who uses AI to help me make