## Coding Assignment Two

In [31]:
import torch
import tiktoken

In [32]:
# Load the whole story into memory as one string.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (which is not UTF-8 everywhere).
# NOTE(review): assumes Amontillado.txt is stored as UTF-8 — confirm.
with open("Amontillado.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Quick sanity check: show the first 50 characters.
raw_text[:50]

'The thousand injuries of Fortunato I had borne as '

In [33]:
# Encode the full text into token ids with the "gpt2" tiktoken encoding.
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)

# Show every token id produced for the text.
print(enc_text)

[464, 7319, 6821, 286, 376, 1922, 5549, 314, 550, 28068, 355, 314, 1266, 714, 26, 475, 618, 339, 44716, 2402, 13277, 11, 314, 19982, 15827, 13, 921, 11, 508, 523, 880, 760, 262, 3450, 286, 616, 5848, 11, 481, 407, 11691, 11, 2158, 11, 326, 314, 2921, 10517, 590, 284, 257, 2372, 13, 1629, 4129, 314, 561, 307, 27968, 2004, 26, 428, 373, 257, 966, 41385, 10282, 960, 4360, 262, 845, 2730, 31366, 351, 543, 340, 373, 12939, 11, 662, 10341, 262, 2126, 286, 2526, 13, 314, 1276, 407, 691, 6878, 11, 475, 6878, 351, 37610, 13, 317, 2642, 318, 555, 445, 2790, 618, 40788, 9929, 1124, 663, 48586, 263, 13, 632, 318, 8603, 555, 445, 2790, 618, 262, 27968, 1362, 10143, 284, 787, 2241, 2936, 355, 884, 284, 683, 508, 468, 1760, 262, 2642, 13, 198, 198, 1026, 1276, 307, 7247, 11, 326, 6159, 416, 1573, 4249, 28637, 550, 314, 1813, 376, 1922, 5549, 2728, 284, 4719, 616, 922, 481, 13, 314, 3767, 11, 355, 373, 616, 28329, 11, 284, 8212, 287, 465, 1986, 11, 290, 339, 750, 407, 19973, 326, 616, 8212, 783, 373, 

In [34]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token windows from one text.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved over the token ids in steps of ``stride``. For every window
    position the dataset stores two 1-D tensors of length ``max_length``:
    the window itself and the same window shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.
        max_length: Number of tokens per window; must be >= 1.
        stride: Step between consecutive window starts; must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
        AssertionError: If the text yields ``max_length`` tokens or fewer,
            i.e. not even one full (input, target) pair can be built.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail early with a clear message: a zero stride would otherwise
        # surface as an opaque range() error, a negative stride would
        # silently produce an empty dataset, and max_length < 1 would
        # produce meaningless windows.
        if max_length < 1:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the text into sequences of max_length.
        # The last start index leaves room for the one-token-shifted target,
        # so every stored chunk has exactly max_length tokens.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of stored (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]



In [35]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Tokenize ``txt`` and wrap the resulting windows in a ``DataLoader``.

    Args:
        txt: Text to tokenize with the "gpt2" tiktoken encoding.
        batch_size: Passed through to ``DataLoader``.
        max_length: Window length handed to ``GPTDatasetV1``.
        stride: Window step handed to ``GPTDatasetV1``.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    # Batching options are forwarded unchanged to the loader.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [36]:
# 7. Try different values
# One 4-token window per batch, moving forward a single token per step,
# in file order (no shuffling).
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)

# Pull the first two batches and print one per line.
first_batch = next(data_iter)
second_batch = next(data_iter)
print(first_batch, second_batch, sep="\n")

[tensor([[ 464, 7319, 6821,  286]]), tensor([[7319, 6821,  286,  376]])]
[tensor([[7319, 6821,  286,  376]]), tensor([[6821,  286,  376, 1922]])]


In [37]:
# Windows of 2 tokens with stride 2, eight windows per batch, in file order.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=2,
    stride=2,
    shuffle=False,
)
data_iter = iter(dataloader)

# First batch: the input windows and their one-token-shifted targets.
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  464,  7319],
        [ 6821,   286],
        [  376,  1922],
        [ 5549,   314],
        [  550, 28068],
        [  355,   314],
        [ 1266,   714],
        [   26,   475]])

Targets:
 tensor([[ 7319,  6821],
        [  286,   376],
        [ 1922,  5549],
        [  314,   550],
        [28068,   355],
        [  314,  1266],
        [  714,    26],
        [  475,   618]])


In [38]:
# Windows of 4 tokens with stride 4, eight windows per batch, in file order.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)
data_iter = iter(dataloader)

# First batch: the input windows and their one-token-shifted targets.
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  464,  7319,  6821,   286],
        [  376,  1922,  5549,   314],
        [  550, 28068,   355,   314],
        [ 1266,   714,    26,   475],
        [  618,   339, 44716,  2402],
        [13277,    11,   314, 19982],
        [15827,    13,   921,    11],
        [  508,   523,   880,   760]])

Targets:
 tensor([[ 7319,  6821,   286,   376],
        [ 1922,  5549,   314,   550],
        [28068,   355,   314,  1266],
        [  714,    26,   475,   618],
        [  339, 44716,  2402, 13277],
        [   11,   314, 19982, 15827],
        [   13,   921,    11,   508],
        [  523,   880,   760,   262]])


In [39]:
# 8. Use tokenizer to decode batch
# Convert the batch to nested Python lists once, then decode and print
# each row of token ids as text.
for row in inputs.tolist():
    print(tokenizer.decode(row))

The thousand injuries of
 Fortunato I
 had borne as I
 best could; but
 when he ventured upon
 insult, I vowed
 revenge. You,
 who so well know


In [40]:
# Toy embedding table: 8 rows ("tokens"), each a 4-dimensional vector.
vocab_size = 8
output_dim = 4

# Embedding weights are randomly initialised. Seeding first makes these
# weights, and every value derived from them in later cells, identical on
# each Restart & Run All.
torch.manual_seed(123)

# NOTE: this rebinds `inputs`, which earlier held a batch of token ids.
inputs = torch.nn.Embedding(vocab_size, output_dim)
print(inputs.weight)

Parameter containing:
tensor([[-1.9446, -1.6989, -0.5941,  0.1812],
        [ 1.4182, -1.1518, -0.2035, -1.0670],
        [-1.4343,  0.1791,  0.8436,  0.1703],
        [ 1.4848, -0.8768, -0.6003,  0.7026],
        [ 0.1889, -1.1759,  1.3024,  0.3229],
        [-0.8702, -2.3382,  0.3984,  0.5698],
        [ 0.4951, -0.3488,  0.6581, -1.6923],
        [ 1.3876,  0.2032, -0.5720, -0.8050]], requires_grad=True)


In [41]:
# Replace the embedding layer with its weight matrix as a plain tensor, so
# the printed value no longer carries `requires_grad=True`.
# `.detach()` is used instead of the legacy `.data` accessor.
# The isinstance guard keeps this cell safe to re-run: after the first run
# `inputs` is already a tensor and has no `.weight` attribute.
if isinstance(inputs, torch.nn.Embedding):
    inputs = inputs.weight.detach()
inputs

tensor([[-1.9446, -1.6989, -0.5941,  0.1812],
        [ 1.4182, -1.1518, -0.2035, -1.0670],
        [-1.4343,  0.1791,  0.8436,  0.1703],
        [ 1.4848, -0.8768, -0.6003,  0.7026],
        [ 0.1889, -1.1759,  1.3024,  0.3229],
        [-0.8702, -2.3382,  0.3984,  0.5698],
        [ 0.4951, -0.3488,  0.6581, -1.6923],
        [ 1.3876,  0.2032, -0.5720, -0.8050]])

In [42]:
# Attention scores for every pair of input vectors in one matrix product:
# entry [i, j] is the dot product of row i and row j of `inputs`.
# A single matmul replaces two nested for loops.
attention_scores = torch.matmul(inputs, inputs.T)
attention_scores

tensor([[ 7.0535, -0.8733,  2.0146, -0.9138,  0.9154,  5.5312, -1.0678, -2.8496],
        [-0.8733,  4.5178, -2.5937,  2.4881,  1.0127,  0.7701,  2.7756,  2.7090],
        [ 2.0146, -2.5937,  2.8300, -2.6734,  0.6722,  1.2626, -0.5055, -2.5734],
        [-0.9138,  2.4881, -2.6734,  3.8273,  0.7566,  0.9192, -0.5431,  1.6598],
        [ 0.9154,  1.0127,  0.6722,  0.7566,  3.2189,  3.2881,  0.8141, -0.9818],
        [ 5.5312,  0.7701,  1.2626,  0.9192,  3.2881,  6.7079, -0.3174, -2.3692],
        [-1.0678,  2.7756, -0.5055, -0.5431,  0.8141, -0.3174,  3.6637,  1.6020],
        [-2.8496,  2.7090, -2.5734,  1.6598, -0.9818, -2.3692,  1.6020,  2.9418]])

In [43]:
# Normalize the scores with a softmax over the last dimension, so each row
# of weights sums to 1.
attention_weights = attention_scores.softmax(dim=-1)
attention_weights

tensor([[8.1440e-01, 2.9393e-04, 5.2780e-03, 2.8227e-04, 1.7582e-03, 1.7770e-01,
         2.4199e-04, 4.0733e-05],
        [2.9795e-03, 6.5387e-01, 5.3334e-04, 8.5904e-02, 1.9644e-02, 1.5413e-02,
         1.1451e-01, 1.0714e-01],
        [2.4376e-01, 2.4299e-03, 5.5090e-01, 2.2436e-03, 6.3674e-02, 1.1491e-01,
         1.9610e-02, 2.4797e-03],
        [5.8178e-03, 1.7467e-01, 1.0013e-03, 6.6650e-01, 3.0918e-02, 3.6378e-02,
         8.4290e-03, 7.6292e-02],
        [3.9170e-02, 4.3175e-02, 3.0716e-02, 3.3421e-02, 3.9207e-01, 4.2017e-01,
         3.5400e-02, 5.8754e-03],
        [2.2802e-01, 1.9510e-03, 3.1926e-03, 2.2648e-03, 2.4200e-02, 7.3963e-01,
         6.5762e-04, 8.4501e-05],
        [5.3274e-03, 2.4870e-01, 9.3476e-03, 9.0032e-03, 3.4980e-02, 1.1283e-02,
         6.0445e-01, 7.6909e-02],
        [1.2920e-03, 3.3524e-01, 1.7030e-03, 1.1741e-01, 8.3646e-03, 2.0887e-03,
         1.1081e-01, 4.2310e-01]])

In [44]:
# Sanity check: the softmax-normalized weights of the first row should sum to 1.
torch.sum(attention_weights[0])

tensor(1.0000)

In [45]:
# Context vectors: each row is the attention-weighted sum of all rows of
# `inputs`, using the matching row of `attention_weights` as coefficients.
context_vectors = torch.matmul(attention_weights, inputs)
context_vectors

tensor([[-1.7446, -1.8009, -0.4063,  0.2497],
        [ 1.2439, -0.9107, -0.1401, -0.9016],
        [-1.3322, -0.6701,  0.4583,  0.1878],
        [ 1.3088, -0.9041, -0.4216,  0.2381],
        [-0.2753, -1.5947,  0.6718,  0.2911],
        [-1.0805, -2.1491,  0.1921,  0.4694],
        [ 0.7451, -0.5644,  0.3526, -1.3236],
        [ 1.2865, -0.4584, -0.2954, -0.7989]])