#### Get 'The Verdict' Text.

In [1]:
# Get our sample document's contents as raw text
from typing import List

# The encoding is spelled out so the read does not depend on the platform's default encoding.
with open("data/the-verdict.txt", "r", encoding="utf-8") as the_verdict_file:
  the_verdict_raw_text = the_verdict_file.read() # Read in text from 'The Verdict'

print(the_verdict_raw_text)

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?

Well!--even through th

In [2]:
# Length of the raw text in characters (not tokens)
len(the_verdict_raw_text)

20479

In [3]:
# Import regular expressions
import re
print("Imported regular expressions!")

Imported regular expressions!


#### Tokenize 'The Verdict': drop white space, and treat individual words and punctuation marks as tokens.

In [4]:
# Split on punctuation, '--' and white space. The delimiters sit in a capturing
# group, so re.split keeps them in the result: punctuation marks become tokens too.
# Regular expression supplied by the video
the_verdict_tokens_with_whitespace = re.split(r'([,.:;?_!"()\']|--|\s)', the_verdict_raw_text)
print(the_verdict_tokens_with_whitespace)

# Punctuation stays; only the empty strings and white-space pieces left by the split are dropped.
the_verdict_tokens = [token for token in the_verdict_tokens_with_whitespace if token.strip()]
print(the_verdict_tokens)

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '--', 'so', ' ', 'it', ' ', 'was', ' ', 'no', ' ', 'great', ' ', 'surprise', ' ', 'to', ' ', 'me', ' ', 'to', ' ', 'hear', ' ', 'that', ',', '', ' ', 'in', ' ', 'the', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', ',', '', ' ', 'he', ' ', 'had', ' ', 'dropped', ' ', 'his', ' ', 'painting', ',', '', ' ', 'married', ' ', 'a', ' ', 'rich', ' ', 'widow', ',', '', ' ', 'and', ' ', 'established', ' ', 'himself', ' ', 'in', ' ', 'a', ' ', 'villa', ' ', 'on', ' ', 'the', ' ', 'Riviera', '.', '', ' ', '', '(', 'Though', ' ', 'I', ' ', 'rather', ' ', 'thought', ' ', 'it', ' ', 'would', ' ', 'have', ' ', 'been', ' ', 'Rome', ' ', 'or', ' ', 'Florence', '.', '', ')', '', '\n', '', '\n', '', '"', 'The', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', '"', '', '--', 'that', ' ', 'was', ' ', '

#### Create a Vocabulary out of the words in 'The Verdict'
#### Assign them each a Unique Token Id

In [5]:
# The vocabulary is every distinct token (words and punctuation marks), in sorted order
all_words = list(set(the_verdict_tokens))
all_words.sort()
vocabulary_size = len(all_words)

# A token's id is its position in the sorted vocabulary
token_to_id_mapping = dict(zip(all_words, range(vocabulary_size)))

#### Create your own version of the Version One Tokenizer
To Note:
* It can take in a sample text in \_\_init\_\_(...)
* It can then create a mapping dictionary from tokens to ids using the \_\_init\_\_(...) function
* For the function encode_text(...), it can use the token to ids mapping dictionary to take in a string containing a sentence, and then output a list of integer ids
* For the function decode_ids(...), it can use the token to ids mapping dictionary to take in a list of ids encoding a sentence, and then output the corresponding list of tokens. You can also specify a parameter to output the sentence as a string.

In [6]:
class TokenizerV1:
  """
  Version 1 of a class used to tokenize texts.

  Tokenization is the act of breaking down a text into significant individual units called 'tokens'.
  Here a token is a word or a punctuation mark; white space is thrown away. The vocabulary is
  built once, from the source text given to the constructor.
  """

  # The delimiters sit in a capturing group so that re.split keeps them: punctuation
  # marks and '--' come back as tokens of their own instead of being dropped.
  TOKEN_SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'

  # What the encoders return for a token that is not in the vocabulary.
  UNKNOWN_TOKEN_ID = -1

  # What the decoders return for a token id that is not in the vocabulary.
  UNKNOWN_TOKEN = "<unknown>"

  def __init__(self, source_text: str):
    """
    Args:
      source_text: str - Contains the source text we are going to use to create our vocabulary

    Creates:
      self.token_to_token_id_mapping: Dict[str, int] - Maps from token to token id
      self.token_id_to_token_mapping: Dict[int, str] - Maps from token id to token
    """
    # Ids are the positions of the unique tokens in sorted order, so the same
    # source text always produces the same ids.
    sorted_unique_tokens = sorted(set(self._split_into_tokens(source_text)))

    self.token_to_token_id_mapping = {token: token_id for token_id, token in enumerate(sorted_unique_tokens)}
    self.token_id_to_token_mapping = {token_id: token for token, token_id in self.token_to_token_id_mapping.items()}

  @classmethod
  def _split_into_tokens(cls, text: str) -> List[str]:
    """
    Splits a text into word and punctuation tokens, dropping white space.

    Args:
      text: str - The text to split

    Returns:
      List[str] - The tokens in the order they appear in the text
    """
    pieces = re.split(cls.TOKEN_SPLIT_PATTERN, text)

    # The split leaves empty strings and white-space pieces between delimiters; drop them.
    return [piece for piece in pieces if piece.strip()]

  def encode_tokens(self, list_of_tokens: List[str]) -> List[int]:
    """
    Returns a new list of token ids that correspond to the input list of tokens.

    Args:
      list_of_tokens: List[str] - List of tokens

    Returns:
      list_of_token_ids: List[int] - The corresponding list of token ids to the list of tokens.
        A token that is not in the vocabulary is encoded as UNKNOWN_TOKEN_ID (-1).
    """
    return [
      self.token_to_token_id_mapping.get(token, self.UNKNOWN_TOKEN_ID)
      for token in list_of_tokens
    ]

  def encode_text(self, text: str) -> List[int]:
    """
    Returns the list of token ids for a piece of raw text.

    The text is split with the same rule that was used to build the vocabulary,
    then encoded exactly like encode_tokens(...).

    Args:
      text: str - A string such as a sentence

    Returns:
      list_of_token_ids: List[int] - One id per token found in the text; UNKNOWN_TOKEN_ID (-1)
        for tokens that are not in the vocabulary. An empty text gives an empty list.
    """
    return self.encode_tokens(self._split_into_tokens(text))

  def decode_token_ids(self, list_of_token_ids: List[int]) -> List[str]:
    """
    Returns a new list of tokens that correspond to the input list of token ids.

    Args:
      list_of_token_ids: List[int] - List of token ids

    Returns:
      list_of_tokens: List[str] - The corresponding list of tokens to the list of token ids.
        An id that is not in the vocabulary is decoded as UNKNOWN_TOKEN ("<unknown>").
    """
    return [
      self.token_id_to_token_mapping.get(token_id, self.UNKNOWN_TOKEN)
      for token_id in list_of_token_ids
    ]

  def decode_ids(self, list_of_token_ids: List[int], as_string: bool = False):
    """
    Decodes token ids into tokens, optionally joined into one string.

    Args:
      list_of_token_ids: List[int] - List of token ids
      as_string: bool - When True, return a single string instead of a list

    Returns:
      List[str] - The decoded tokens, when as_string is False (the default)
      str - The decoded tokens joined by single spaces, when as_string is True. The original
        spacing is not reconstructed, so punctuation is also preceded by a space.
    """
    list_of_tokens = self.decode_token_ids(list_of_token_ids)
    if as_string:
      return " ".join(list_of_tokens)
    return list_of_tokens
    

In [7]:
# Client Use Case for the TokenizerV1 Class

tokenizer = TokenizerV1(the_verdict_raw_text)

# Round trip: first 17 tokens -> token ids -> tokens again
origin_text_tokenized = the_verdict_tokens[:17]
token_ids = tokenizer.encode_tokens(origin_text_tokenized)
token_ids_converted_to_tokens = tokenizer.decode_token_ids(token_ids)

print(origin_text_tokenized)
print(token_ids)
print(token_ids_converted_to_tokens)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--']
[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6]
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--']


In [8]:
# Exercise: Create byte pair algorithm. This will occur later in the video
# Maybe let's not do this? It's not hard to grasp, but a decent amount of work.
# Let's stick to more LLM-centered stuff.

#### Use Byte Pair Encoding Through A Third Party Library
##### With Byte Pair Encoding, an unknown word is never rejected: the algorithm finds and tokenizes the known pieces that occur inside the larger unknown word, and whatever is left over is broken down and tokenized as individual characters.

For example, 'asfartestdfsdjumpsiwe' will get tokenized as ['as', 'far', 'test', 'd', 'f', 's', 'd', 'jump', 's', 'i', 'w', 'e']

In [9]:
import tiktoken

gpt2_tokenizer = tiktoken.get_encoding("gpt2") # Load tiktoken's "gpt2" encoding
# Optional encode/decode round trip; uncomment to try it:
#encoded = gpt2_tokenizer.encode("parrot")
#print(encoded)
#print(gpt2_tokenizer.decode(encoded))

#### Sliding window to provide LLMs data

When training LLMs, the labels are given to you by the input text itself. Simply cover some section of a sentence, and the next word following that sentence fragment is your label.

However, in training LLMs, we cannot give the model all the tokens at once. How do we make this more efficient? With a sliding window that gives the LLM some chunk of tokens at a time.

In [10]:
# Use the gpt2 tokenizer to encode the raw text from 'The Verdict'

# The encoding is spelled out so the read does not depend on the platform's default encoding.
with open("data/the-verdict.txt", "r", encoding="utf-8") as the_verdict_file_for_gpt2_tokenizer:
  the_verdict_raw_text_for_gpt2_tokenizer = the_verdict_file_for_gpt2_tokenizer.read() # Read in text from 'The Verdict'

# Encode the text that was just read above, rather than the text variable from the first cell.
encoded_verdict_raw_text_from_gpt2_tokenizer = gpt2_tokenizer.encode(the_verdict_raw_text_for_gpt2_tokenizer)
print(gpt2_tokenizer.n_vocab) # Size of the tokenizer's vocabulary

# No slicing happens here: despite its name, this is the complete list of token ids.
truncated_verdict_token_ids_gpt2_tokenizer = encoded_verdict_raw_text_from_gpt2_tokenizer
print(len(truncated_verdict_token_ids_gpt2_tokenizer))

50257
5145


#### Use PyTorch to use their Dataloaders - very useful

In [11]:
import torch
print(torch.__version__)

# Number of tokens we want to consider when training to predict next tokens
context_size = 4 

2.7.1


#### Dataset Class to enable usage of Pytorch Dataloaders
##### Just copied it directly from the video because making this from scratch seems tedious

In [12]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1 (Dataset):
  """
  Sliding-window dataset of (input, target) token-id tensors built from one text.

  Every input is a window of max_length consecutive token ids; its target is the
  same window moved one position forward, so target[k] is the token that follows input[k].
  """
  def __init__(self, txt, tokenizer, max_length : int, stride: int):
    """
    Args:
      txt - The text to tokenize
      tokenizer - Object whose encode(txt, allowed_special=...) returns the text's token ids
      max_length: int - Number of token ids in every input and every target
      stride: int - Distance, in tokens, between the starts of consecutive windows

    Creates:
      self.input_ids - List of input tensors, one per window
      self.target_ids - List of target tensors, aligned with self.input_ids
    """
    # Tokenize the entire text
    all_token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # A window may only start where max_length + 1 ids are still available,
    # because the target needs one id beyond the end of the input.
    window_starts = range(0, len(all_token_ids) - max_length, stride)

    self.input_ids = [
      torch.tensor(all_token_ids[start:start + max_length])
      for start in window_starts
    ]
    self.target_ids = [
      torch.tensor(all_token_ids[start + 1:start + max_length + 1])
      for start in window_starts
    ]

  def __len__(self):
    """Returns the number of (input, target) pairs."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Returns the (input, target) tensor pair stored at position idx."""
    return self.input_ids[idx], self.target_ids[idx]


"""
Type out the Dataloader class definition!
"""

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
  """
  Builds a DataLoader that serves sliding-window (input, target) batches over a text.

  The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a GPTDatasetV1.

  Args:
    txt - The text handed to GPTDatasetV1
    max_length, stride - Passed unchanged to GPTDatasetV1
    batch_size, shuffle, drop_last, num_workers - Passed unchanged to DataLoader

  Returns:
    The DataLoader wrapping the dataset
  """
  gpt2_encoding = tiktoken.get_encoding("gpt2")
  windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

  return DataLoader(
    windowed_dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
    num_workers=num_workers,
  )

In [13]:
# Getting the data again.
# The encoding is spelled out so the read does not depend on the platform's default encoding.
with open("data/the-verdict.txt", "r", encoding="utf-8") as the_verdict_file_for_gpt2_tokenizer:
  the_verdict_raw_text_for_gpt2_tokenizer = the_verdict_file_for_gpt2_tokenizer.read() # Read in text from 'The Verdict'


In [14]:
# Example of using the dataloader.
# It gets real use later on; this cell only shows what one batch looks like.

dataloader = create_dataloader_v1(
  the_verdict_raw_text_for_gpt2_tokenizer,
  batch_size=8,
  max_length=4,
  stride=4,
  shuffle=False,
)

# Pull the first batch: 8 examples of 4 token ids each, plus their labels
data_iterator = iter(dataloader)
inputs, labels = next(data_iterator)
print("Inputs:",inputs)
print("Labels:",labels)

Inputs: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Labels: tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


#### Handling Embeddings
##### Embeddings are a concept that converts tokens into vectors of numbers that can be more easily processed by Neural Networks. Embedding Layers take in any word from your vocabulary and output a vector of numbers that represent that word. Words that are close in meaning should have similar embedding vectors. Words that are farther apart in meaning should have more different embedding vectors.

In [15]:
import torch
# Seed the random number generator so the randomly initialised embedding weights
# (and therefore the printed outputs) are the same on every run.
torch.manual_seed(123)

vocab_size = gpt2_tokenizer.n_vocab
output_dim = 3


# Create an embedding layer: a lookup table holding one row of output_dim weights per token id
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [16]:
# torch.tensor builds the integer ids directly; the legacy torch.Tensor([...])
# constructor creates floats first, which then have to be cast.
input_ids = torch.tensor([1, 2, 3], dtype=torch.int32)

# When you call the embeddings layer with the current input ids for the text you
# have, you pull out the appropriate weight matrix rows (or embeddings) from the embeddings layer's
# weight matrix. We'll train the embeddings layer to optimize for the next word we predict.
print(embedding_layer.weight)
print(embedding_layer(input_ids))

Parameter containing:
tensor([[-0.5390, -1.5060, -0.1680],
        [ 0.1446,  0.5073,  0.9506],
        [ 1.4607,  0.5258, -0.4856],
        ...,
        [ 1.4241,  2.3658,  0.4780],
        [-1.0567,  2.0059, -0.2979],
        [-0.7015, -0.0934,  0.2091]], requires_grad=True)
tensor([[ 0.1446,  0.5073,  0.9506],
        [ 1.4607,  0.5258, -0.4856],
        [ 2.5969, -0.2028,  1.3335]], grad_fn=<EmbeddingBackward0>)


#### Position Encodings for Words

In [17]:
# Create the token embedding layer: it maps each token id to an embedding vector.
# Positional information is not handled here; it comes from a separate embedding layer.
vocab_size = gpt2_tokenizer.n_vocab # 50257 tokens in the gpt2 vocabulary = number of rows in the embedding table
output_dim = 256 # More realistic embedding vector dimension than the 3 we had 
                 # for illustrative purposes.


# First dimension: How many total different tokens we can have an embedding vector for.
# Second dimension: How many elements exist in each embedding vector.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(token_embedding_layer.weight.shape)

torch.Size([50257, 256])


In [18]:
# Create a dataloader to load our data in batches.
max_length_of_each_example = 4
dataloader = create_dataloader_v1(
  the_verdict_raw_text_for_gpt2_tokenizer,
  batch_size=8,
  max_length=max_length_of_each_example,
  stride=max_length_of_each_example,
  shuffle=False,
)

# Only the first batch is needed here.
data_iterator = iter(dataloader)
inputs, outputs = next(data_iterator)

#print(inputs)
#print(outputs)

# The embedding layer looks up one vector for every token id in the batch, so the
# result has shape (8, 4, 256): 8 examples, 4 tokens per example, and a
# 256 element embedding vector for each token.
token_embeddings = token_embedding_layer(inputs)

# Embedding vector of the 4th token of the 1st example
print(token_embeddings[0,3])


tensor([ 0.4079,  1.3334, -0.7764,  0.6020, -0.2597, -0.9132, -2.1155,  0.1565,
        -1.2615,  1.4866,  0.5561,  0.4372, -0.0382, -1.5180, -0.8812,  0.0138,
        -0.1537, -0.9882,  1.4844,  0.4775, -0.4442,  0.4052,  0.0583, -0.8427,
         0.1220,  1.0656,  1.0311, -2.1912,  0.6313, -0.6810, -1.2608, -0.4965,
        -0.8982, -0.0246,  1.2030, -0.8742, -0.6771,  1.8891, -0.2641,  0.7401,
         1.0928,  1.1541, -0.0426, -1.6451,  0.4244, -1.4211, -0.1050,  0.3751,
         1.2831,  0.1727,  1.0059,  0.7913,  1.1024,  0.7694, -1.3544,  0.1974,
        -2.7802, -0.7463, -0.2951,  0.7043,  0.4681, -0.8142, -0.3993,  1.0891,
        -0.2619,  1.0819, -0.1875, -0.0961,  0.1767,  2.0458, -0.6308,  1.1335,
        -0.3522, -1.5920,  2.0637, -0.0775, -1.1981,  0.5535, -0.9129, -0.6835,
        -0.0444, -0.1215,  0.4145,  0.7970, -1.1134,  0.3288, -0.2297,  0.3765,
         0.0559,  0.6177,  2.0378, -0.5183, -1.0098,  0.8980, -1.5795,  0.2159,
        -1.4685,  0.4742,  1.1806, -0.02

In [19]:
# Create a second embedding layer to encode positional information of tokens.
# We add the positional embedding vectors to the token-meaning embedding vectors 
# to create our input embedding vectors.

# In practice, the context length, how many tokens GPT-2 can pay attention to at 
# once, is much bigger, on the scale of 1024 tokens or more. Modern GPT systems
# can pay attention to much bigger context lengths.
# Here it is simply the example length used for the dataloader (4).
context_length = max_length_of_each_example

# First dimension: The length of each example, or how many tokens are in each training example
#                  (one row of weights per position).
# Second dimension: The length of the embedding vector (same as the one for the token_embedding_layer)
positional_embedding_layer = torch.nn.Embedding(context_length, output_dim)
print(positional_embedding_layer.weight.shape)

torch.Size([4, 256])


In [20]:
# We obtain every row of the weights matrix of our positional embedding layer.
# torch.arange(context_length) gives the positions 0 .. context_length - 1, so the
# lookup returns one embedding vector per position, in order.
positional_embeddings = positional_embedding_layer(torch.arange(context_length))


#### Combine Token Embedding and Positional Embedding Information

In [21]:
# By adding together token and positional embedding info, we ensure that - if 
# we repeat the same token over and over - the final input embeddings matrix
# will be different for each token in a different position. Thus, the token embedding
# will be the same, encoding we have the same word, but the positional embedding will
# shift the final embedding result, indicating the word occurs at different positions
# in the context window.
# Shapes: token_embeddings is (8, 4, 256) and positional_embeddings is (4, 256), so
# broadcasting adds the same 4 positional vectors to every example in the batch.
input_embeddings = token_embeddings + positional_embeddings
print(input_embeddings)

tensor([[[ 1.4970,  0.3141,  1.8200,  ...,  1.0933,  0.4070, -1.6585],
         [ 0.6883, -2.0955,  0.7987,  ...,  0.2545,  0.3920, -0.6902],
         [ 1.7570, -0.1174,  1.5699,  ...,  1.0220, -3.0590,  0.0119],
         [ 0.0303,  2.5509, -0.4646,  ...,  0.4406,  0.6458,  1.2494]],

        [[ 2.3469, -1.2378,  0.6323,  ..., -0.1104, -0.2579, -0.6035],
         [ 0.2133,  0.0959, -1.7812,  ...,  1.1353, -0.0064, -1.0774],
         [ 1.4297, -0.1182, -1.5221,  ...,  0.7327,  1.0956, -0.1765],
         [-0.7327,  1.6162,  0.0551,  ..., -1.7444,  0.6004,  0.5153]],

        [[ 0.1460, -0.5993,  3.2053,  ...,  0.2799, -0.1827,  0.7737],
         [ 1.2244, -1.4739,  0.6926,  ..., -1.0876, -0.6567, -0.5738],
         [ 0.9751,  1.0016,  1.8611,  ...,  1.4372, -0.7812,  0.8420],
         [ 0.0985,  1.5736, -0.5514,  ..., -1.4153,  1.1628,  0.2084]],

        ...,

        [[ 1.3578, -0.6983,  1.4997,  ..., -1.2781,  0.9010, -0.9538],
         [ 0.7567, -2.1431,  0.2507,  ..., -1.5667,  2.48

## Input Pipeline Is Finished!
### We went through this process:
#### Take in full text.
#### Tokenize the text.
#### Get token ids for each token.
#### Get token embeddings for each token, and positional embeddings for the context length.
#### Add up the positional embedding vector to each of the token embedding vectors we got from our batched inputs.
## We have our input embedding vector after the addition!
