In [None]:
# Generating one token at a time
# Step 1: Load a tokenizer and a model

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the pretrained GPT-2 tokenizer and causal language model via Hugging Face.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Partial sentence that will be fed to the tokenizer.
text = "I am the captain of my"

# Tokenize the text, map each token to its ID in the model vocabulary, and
# return the result as PyTorch ("pt") tensors.
inputs = tokenizer(text, return_tensors="pt")

# Keep the token IDs under their own name so they can be reused below.
# (A bare `inputs["input_ids"]` expression in the middle of a cell is not
# displayed by Jupyter; the IDs are shown by `print(inputs)` further down.)
input_ids = inputs["input_ids"]

# Convert the input text to token strings (the tokens themselves, not their IDs).
tokens = tokenizer.tokenize(text)
print(tokens)  # Output: ['I', 'Ġam', 'Ġthe', 'Ġcaptain', 'Ġof', 'Ġmy']; the Ġ indicates the presence of white space
print(inputs)

# Token embedding layer of GPT-2: a lookup table from token ID to dense vector.
embeddings = model.transformer.wte

# Pass the token IDs through the embedding layer to get one dense vector of 768
# learned values per token. This lookup is context-independent: the vector for a
# token is the same regardless of its neighbours; contextual information is only
# added by the transformer layers that run after this step.
# We are only inspecting values here, so disable gradient tracking to avoid
# building an autograd graph.
with torch.no_grad():
    dense_vector = embeddings(input_ids)

# Position (0-based) of the token whose embedding we want to look at:
# index 3 is the 4th token, 'Ġcaptain' in this sentence.
token_index = 3

# Select batch item 0 and the chosen token; the resulting tensor has shape [768],
# i.e. 1 row = 1 token with 768 values.
ste = dense_vector[0, token_index]

print(f"Single token embedding: {ste}")
print(f"ste shape: {ste.shape}")

['I', 'Ġam', 'Ġthe', 'Ġcaptain', 'Ġof', 'Ġmy']
{'input_ids': tensor([[   40,   716,   262, 10654,   286,   616]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
dense vector: tensor([[ 0.1474, -0.0959,  0.1430,  ...,  0.1030, -0.0625, -0.1131],
        [ 0.1596, -0.1249,  0.1148,  ...,  0.2558,  0.0196,  0.0145],
        [-0.0393,  0.0050,  0.0421,  ..., -0.0477,  0.0670, -0.0471],
        [ 0.0436, -0.1336,  0.1714,  ..., -0.0104,  0.0123, -0.0380],
        [-0.0572,  0.0183,  0.0333,  ..., -0.0689, -0.0931, -0.0714],
        [ 0.1578,  0.1091,  0.0737,  ...,  0.0506,  0.1057,  0.0281]],
       grad_fn=<SelectBackward0>)
Single token embedding: tensor([ 4.3597e-02, -1.3362e-01,  1.7140e-01,  1.1383e-02, -7.4586e-02,
        -5.1723e-02, -2.9620e-01, -5.3332e-02,  4.0742e-02, -1.2568e-01,
         1.3112e-01,  1.3011e-01,  2.3719e-02,  2.2207e-02,  1.0508e-01,
         3.2130e-02,  7.2995e-02, -1.0549e-02,  5.5878e-02,  1.2675e-01,
        -9.5042e-02, -1.6710e-01,  9.8741e-03,  5.87

In [None]:
# Examine the tokenizer by using pandas for visualization:

import pandas as pd

def view_tokenization()