## Exercise one

Byte Pair Encoding of Unknown Words

In [2]:
import tiktoken

# A made-up phrase that is unlikely to exist as a whole word in the vocabulary
text = "Akwirw ier"

# Load OpenAI's "cl100k_base" encoding
tokenizer = tiktoken.get_encoding("cl100k_base")

# Round trip: text -> token IDs -> text
token_ids = tokenizer.encode(text)
decoded_text = tokenizer.decode(token_ids)

# Show the IDs alongside the reconstructed string
for label, value in (("Token IDs:", token_ids), ("Decoded Text:", decoded_text)):
    print(label, value)


Token IDs: [32, 29700, 404, 86, 602, 261]
Decoded Text: Akwirw ier


## Exercise two

Data loaders with different strides and context sizes 


In [10]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once with ``tokenizer.encode`` (passing
    ``<|endoftext|>`` as an allowed special token). Windows of ``max_length``
    token IDs are then taken every ``stride`` tokens; each target window is the
    matching input window shifted one position to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of token IDs.
        max_length: Number of token IDs in every input/target window.
        stride: Distance, in tokens, between the starts of consecutive windows.

    Attributes:
        input_ids: List of input tensors, one per window.
        target_ids: List of target tensors aligned with ``input_ids``.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A start offset is only valid while the target window, which reaches
        # one token further than the input window, still fits inside ``ids``.
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [
            torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts
        ]

    def __len__(self):
        """Return the number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader(txt, batch_size=4, max_length=256, stride=128):
    """Build a DataLoader of sliding-window (input, target) batches for ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and windowed by
    ``GPTDatasetV1``. Apart from ``batch_size``, every ``DataLoader`` option is
    left at its default.

    Args:
        txt: Raw text to tokenize and window.
        batch_size: Number of windows per batch.
        max_length: Number of token IDs in each window.
        stride: Distance, in tokens, between consecutive window starts.

    Returns:
        A ``DataLoader`` over the resulting ``GPTDatasetV1``.
    """
    return DataLoader(
        GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride),
        batch_size=batch_size,
    )

# Sample input text (replace with actual text file content)
sample_text = "In the heart of the city stood the old library, a relic from a bygone era. " \
              "Its stone walls bore the marks of time, and ivy clung tightly to its facade."

# Create tokenizer
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(sample_text)

# Define embedding layers
vocab_size = 50257  # GPT-2 vocabulary size
output_dim = 256
max_len = 4  # Context length

# The token table is indexed by token ID, so it needs one row per vocabulary
# entry; the position table is indexed by position within the context, so it
# needs one row per position. (These sizes were previously swapped, which would
# make any lookup of a real token ID such as 818 fail with an index error.)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# NOTE: this table only covers positions 0..max_len-1; it would have to be
# enlarged before being applied to the max_length=8 windows built below.
pos_embedding_layer = torch.nn.Embedding(max_len, output_dim)

# Generate data loaders with different max_length and stride values
dataloader_2_2 = create_dataloader(sample_text, batch_size=1, max_length=2, stride=2)
dataloader_8_2 = create_dataloader(sample_text, batch_size=1, max_length=8, stride=2)

# Collect batch data for display (each batch holds a single window because
# batch_size=1, hence the nested lists in the printed frames)
batches_2_2 = [(inp.tolist(), tgt.tolist()) for inp, tgt in dataloader_2_2]
batches_8_2 = [(inp.tolist(), tgt.tolist()) for inp, tgt in dataloader_8_2]

# Convert to DataFrame for visualization
df_2_2 = pd.DataFrame(batches_2_2, columns=["Input Tokens", "Target Tokens"])
df_8_2 = pd.DataFrame(batches_8_2, columns=["Input Tokens", "Target Tokens"])

# Print results
print("Batches (max_length=2, stride=2):")
print(df_2_2)
print("\nBatches (max_length=8, stride=2):")
print(df_8_2)



Batches (max_length=2, stride=2):
       Input Tokens    Target Tokens
0      [[818, 262]]    [[262, 2612]]
1     [[2612, 286]]     [[286, 262]]
2     [[262, 1748]]   [[1748, 6204]]
3     [[6204, 262]]    [[262, 1468]]
4    [[1468, 5888]]     [[5888, 11]]
5       [[11, 257]]   [[257, 26341]]
6    [[26341, 422]]     [[422, 257]]
7      [[257, 416]]   [[416, 21260]]
8   [[21260, 6980]]     [[6980, 13]]
9      [[13, 6363]]   [[6363, 7815]]
10   [[7815, 7714]]  [[7714, 18631]]
11   [[18631, 262]]    [[262, 8849]]
12    [[8849, 286]]     [[286, 640]]
13      [[640, 11]]      [[11, 290]]
14   [[290, 21628]]    [[21628, 88]]
15      [[88, 537]]    [[537, 2150]]
16  [[2150, 17707]]   [[17707, 284]]
17     [[284, 663]]   [[663, 43562]]

Batches (max_length=8, stride=2):
                                         Input Tokens  \
0       [[818, 262, 2612, 286, 262, 1748, 6204, 262]]   
1     [[2612, 286, 262, 1748, 6204, 262, 1468, 5888]]   
2       [[262, 1748, 6204, 262, 1468, 5888, 11, 257]]   
