In [None]:
# Generating one token at a time
# Step 1: Load a tokenizer and a model

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the pretrained GPT-2 tokenizer and causal language model using Hugging Face
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model = AutoModelForCausalLM.from_pretrained("gpt2")

# Feed a partial sentence to the tokenizer and tokenize it
text = "I am the captain of my"
# Tokenize the text, map each token to its ID in the model vocabulary,
# and return the result as PyTorch ("pt") tensors
inputs = tokenizer(text, return_tensors="pt")

# Retrieve the token IDs from the tokenized data dictionary.
# Bound to a name because a bare expression in the middle of a cell is
# evaluated but neither displayed nor kept.
input_ids = inputs["input_ids"]

# Convert the input text to tokens (the token strings, not the token IDs)
tokens = tokenizer.tokenize(text)
print(tokens)  # Output: ['I', 'Ġam', 'Ġthe', 'Ġcaptain', 'Ġof', 'Ġmy']; the Ġ indicates the presence of a leading space
print(inputs)

# Token embedding layer, fetched through the public accessor instead of the
# GPT-2-specific attribute path `model.transformer.wte` (same layer object).
embeddings = model.get_input_embeddings()

# Pass the token IDs through the embedding layer to turn each token into its
# dense vector of 768 learned values. This is a plain table lookup, so at this
# stage each vector depends only on the token itself, not on its neighbours.
# The lookup is for inspection only, so autograd tracking is switched off;
# the printed tensors then carry no `grad_fn`.
with torch.no_grad():
    dense_vector = embeddings(input_ids)

# Keep only the dense vector of the 4th token ('Ġcaptain' here, index 3 of the
# first and only sequence). Its shape is [768]: 1 row = 1 token with 768 values.
ste = dense_vector[0, 3]

print(f"Single token embedding: {ste}")
print(f"ste shape: {ste.shape}")

In [None]:
# Examine the tokenizer by using pandas for visualization:

import pandas as pd

def view_tokenization(inputs):
    """Return a table pairing each token ID of the first sequence with its decoded text.

    Args:
        inputs: Mapping with an "input_ids" entry; ``inputs["input_ids"][0]``
            is the sequence of token IDs to show (a tensor/array row or a
            plain list of ints).

    Returns:
        pandas.DataFrame with one row per token and two columns: "id" (the
        token ID as a plain integer) and "token" (``tokenizer.decode`` of
        that ID).

    Note:
        Decoding uses the module-level ``tokenizer``, which must already be
        defined when this function is called.
    """
    first_sequence = inputs["input_ids"][0]
    # Tensor/array rows expose .tolist(); converting up front stores plain ints
    # in the "id" column instead of 0-dim tensor objects. Plain lists are
    # accepted as they are.
    if hasattr(first_sequence, "tolist"):
        token_ids = first_sequence.tolist()
    else:
        token_ids = list(first_sequence)

    # `token_id` rather than `id`, so the builtin id() is not shadowed.
    rows = [(token_id, tokenizer.decode(token_id)) for token_id in token_ids]
    return pd.DataFrame(rows, columns=["id", "token"])
# Build the id/token table for the prompt tokenized in the previous cell.
# The result is a bare expression, so Jupyter renders it only when this is the
# last statement of the cell.
view_tokenization(inputs)