In [2]:
import torch
import tiktoken

In [3]:
# Read the whole story into memory. The encoding is passed explicitly so the
# decoded text does not depend on the platform's default text encoding.
with open("Amontillado.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Peek at the first 50 characters to confirm the file loaded.
raw_text[:50]

'The thousand injuries of Fortunato I had borne as '

In [4]:
tokenizer = tiktoken.get_encoding("gpt2")


In [5]:
enc_text = tokenizer.encode(raw_text)

In [6]:
print(enc_text[:20])

[464, 7319, 6821, 286, 376, 1922, 5549, 314, 550, 28068, 355, 314, 1266, 714, 26, 475, 618, 339, 44716, 2402]


In [7]:
print(tokenizer.decode(enc_text[:20]))

The thousand injuries of Fortunato I had borne as I best could; but when he ventured upon


In [8]:
# Show how next-token prediction is framed: the first `end` tokens are the
# input and the token that follows them is the target.
for end in range(1, 10):
    context_text = tokenizer.decode(enc_text[:end])
    next_token_text = tokenizer.decode([enc_text[end]])
    print("INPUT:", context_text, "TARGET:", next_token_text)

INPUT: The TARGET:  thousand
INPUT: The thousand TARGET:  injuries
INPUT: The thousand injuries TARGET:  of
INPUT: The thousand injuries of TARGET:  F
INPUT: The thousand injuries of F TARGET: ortun
INPUT: The thousand injuries of Fortun TARGET: ato
INPUT: The thousand injuries of Fortunato TARGET:  I
INPUT: The thousand injuries of Fortunato I TARGET:  had
INPUT: The thousand injuries of Fortunato I had TARGET:  borne


## Data Set 

In [9]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. Windows of ``max_length`` token IDs are then
    taken every ``stride`` tokens; each target window is its input window
    shifted one position to the right (next-token prediction).

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method
            returning a sequence of integer token IDs.
        max_length: Number of tokens in each input/target window (>= 1).
        stride: Offset between the starts of consecutive windows (>= 1).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
        AssertionError: If the text yields ``max_length`` tokens or fewer.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail early with a clear message: stride == 0 would otherwise surface
        # as an opaque range() error and a negative stride would silently
        # produce an empty dataset.
        if max_length < 1:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the text into sequences of max_length.
        # The range stops at len - max_length so the shifted target window
        # never runs past the end of token_ids.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]



## Dataloader

In [10]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1; the remaining arguments are passed straight to DataLoader.

    Args:
        txt: Text to tokenize and split into windows.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window (forwarded to GPTDatasetV1).
        stride: Offset between consecutive windows (forwarded to GPTDatasetV1).
        shuffle: Forwarded to DataLoader.
        drop_last: Forwarded to DataLoader.
        num_workers: Forwarded to DataLoader.

    Returns:
        The DataLoader wrapping the windowed dataset.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [11]:
# Small demo: batches of 8 windows, each 4 tokens long. stride == max_length,
# so consecutive windows do not overlap; shuffle=False keeps them in text order.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

# Pull the first batch from the loader and show inputs next to their targets.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)


Inputs:
 tensor([[  464,  7319,  6821,   286],
        [  376,  1922,  5549,   314],
        [  550, 28068,   355,   314],
        [ 1266,   714,    26,   475],
        [  618,   339, 44716,  2402],
        [13277,    11,   314, 19982],
        [15827,    13,   921,    11],
        [  508,   523,   880,   760]])

Targets:
 tensor([[ 7319,  6821,   286,   376],
        [ 1922,  5549,   314,   550],
        [28068,   355,   314,  1266],
        [  714,    26,   475,   618],
        [  339, 44716,  2402, 13277],
        [   11,   314, 19982, 15827],
        [   13,   921,    11,   508],
        [  523,   880,   760,   262]])


In [12]:
type(inputs)

torch.Tensor

In [13]:
#Gives dimensions of the tensor
inputs.shape

torch.Size([8, 4])

In [14]:
#Look at the 2nd row
inputs[1]

tensor([ 376, 1922, 5549,  314])

In [15]:
# Look at the first column, first element of each row
inputs[:,0]

tensor([  464,   376,   550,  1266,   618, 13277, 15827,   508])

In [16]:
# Look at the ID in the 2nd row and 3rd column
inputs[1,2]

tensor(5549)

In [17]:
#Print tensor row by row, converting to list so tokenizer can decode
for row in inputs:
    print(tokenizer.decode(row.tolist()))

The thousand injuries of
 Fortunato I
 had borne as I
 best could; but
 when he ventured upon
 insult, I vowed
 revenge. You,
 who so well know


## Example of Generating Embedding Vectors

In [18]:
# Illustrative example: embedding vectors of length 3 for a 6-word vocabulary
vocab_size = 6
output_dim = 3

In [19]:
# torch.nn is torch's neural-network submodule; it provides the Embedding layer.
# Embedding creates one vector of length output_dim for each of the vocab_size entries.
# The weights are randomly initialised, so the printed values change between runs unless a seed is set.
embedding = torch.nn.Embedding(vocab_size, output_dim)

In [20]:
print(embedding.weight)

Parameter containing:
tensor([[ 0.3842, -0.8694, -1.5476],
        [-0.1058, -0.0901, -0.2716],
        [-0.6403, -0.3490,  0.6016],
        [-0.9512,  1.3040, -0.7873],
        [ 0.6833,  0.7670, -0.6327],
        [-0.6543, -0.4101,  0.9935]], requires_grad=True)


In [21]:
embedding.weight[0]

tensor([ 0.3842, -0.8694, -1.5476], grad_fn=<SelectBackward0>)

In [22]:
# Use .data to show the weights as a plain tensor (drops the requires_grad annotation from the output)
embedding.weight.data

tensor([[ 0.3842, -0.8694, -1.5476],
        [-0.1058, -0.0901, -0.2716],
        [-0.6403, -0.3490,  0.6016],
        [-0.9512,  1.3040, -0.7873],
        [ 0.6833,  0.7670, -0.6327],
        [-0.6543, -0.4101,  0.9935]])

In [23]:
# Or get rid of it using detach()
print(embedding.weight.detach())

tensor([[ 0.3842, -0.8694, -1.5476],
        [-0.1058, -0.0901, -0.2716],
        [-0.6403, -0.3490,  0.6016],
        [-0.9512,  1.3040, -0.7873],
        [ 0.6833,  0.7670, -0.6327],
        [-0.6543, -0.4101,  0.9935]])


In [24]:
# What the real embedding layer would look like: the tokenizer's full vocabulary and 256-dimensional vectors
#embedding = torch.nn.Embedding(tokenizer.n_vocab, 256)

In [25]:
A = embedding.weight.data
A

tensor([[ 0.3842, -0.8694, -1.5476],
        [-0.1058, -0.0901, -0.2716],
        [-0.6403, -0.3490,  0.6016],
        [-0.9512,  1.3040, -0.7873],
        [ 0.6833,  0.7670, -0.6327],
        [-0.6543, -0.4101,  0.9935]])

In [26]:
#making tensors from scratch-- just put in the elements
x = torch.tensor([2.1,1.3])
y = torch.tensor([1.5,2.7])
print(x,y)

tensor([2.1000, 1.3000]) tensor([1.5000, 2.7000])


In [27]:
# Computing the dot product of two tensors
torch.dot(x,y)

tensor(6.6600)

## Some More Practice Stuff on Attention Mechanisms

In [28]:
raw_text= "This is my dog Hazel"

In [29]:
tokenizer= tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)
print (enc_text)

[1212, 318, 616, 3290, 42805]


In [30]:
vocab_size = 4
output_dim = 8
inputs = torch.nn.Embedding(vocab_size, output_dim)
print(inputs.weight)

Parameter containing:
tensor([[-0.0855, -1.0564, -0.4305,  0.7069, -0.8617,  0.0992, -1.6274, -2.1438],
        [ 2.8522,  1.4012, -0.7497, -1.5911,  0.2721, -0.5609,  0.6640, -0.7666],
        [-0.7643,  0.3051, -0.6697,  0.3432,  0.5046,  0.9810, -0.2262,  1.1889],
        [ 0.2676, -0.0646, -0.2281, -0.4881, -1.2655,  1.9061,  0.0154,  0.0075]],
       requires_grad=True)


In [31]:
# Keep only the raw weight tensor (drops the requires_grad annotation).
# NOTE: this rebinds `inputs` from the Embedding module to a tensor, so
# re-running this cell without first re-running the cell that creates the
# Embedding fails (a tensor has no `.weight` attribute).
inputs = inputs.weight.data
inputs

tensor([[-0.0855, -1.0564, -0.4305,  0.7069, -0.8617,  0.0992, -1.6274, -2.1438],
        [ 2.8522,  1.4012, -0.7497, -1.5911,  0.2721, -0.5609,  0.6640, -0.7666],
        [-0.7643,  0.3051, -0.6697,  0.3432,  0.5046,  0.9810, -0.2262,  1.1889],
        [ 0.2676, -0.0646, -0.2281, -0.4881, -1.2655,  1.9061,  0.0154,  0.0075]])

In [32]:
inputs.shape

torch.Size([4, 8])

In [33]:
for row in inputs:
    print(row.tolist())

[-0.08552072197198868, -1.0564051866531372, -0.4305046796798706, 0.7069157958030701, -0.8616750836372375, 0.09924129396677017, -1.6274216175079346, -2.1437814235687256]
[2.852158546447754, 1.4011563062667847, -0.749657928943634, -1.5911266803741455, 0.2721112370491028, -0.5608687996864319, 0.6639871001243591, -0.7666245102882385]
[-0.764258861541748, 0.30506432056427, -0.6697277426719666, 0.3432409465312958, 0.5045517683029175, 0.9810091257095337, -0.2261744886636734, 1.1888693571090698]
[0.2675808072090149, -0.06460964679718018, -0.22806188464164734, -0.48808732628822327, -1.265477180480957, 1.906104326248169, 0.01543920487165451, 0.007537121884524822]


In [34]:
# Example vectors for a dot product.
# Built with torch.tensor (lower-case), the documented factory for creating a
# tensor from data, rather than the legacy torch.Tensor constructor.
x = torch.tensor([1.1, 2.3])
y = torch.tensor([3.4, -2.1])

In [35]:
#Calculate the dot product... annoying by hand
1.1*3.4 + 2.3*(-2.1)

-1.0899999999999999

In [36]:
#Built in dot product method!
torch.dot(x,y)

tensor(-1.0900)

In [37]:
#The important one is called the "query"
query = inputs[2]
print(query)

tensor([-0.7643,  0.3051, -0.6697,  0.3432,  0.5046,  0.9810, -0.2262,  1.1889])


In [38]:
# Dot product of the query (the 3rd embedding vector) with every input vector,
# itself included: query . vector 1, query . vector 2, ...
for vec in inputs:
    print(torch.dot(query, vec))

tensor(-2.2439)
tensor(-3.2709)
tensor(3.9250)
tensor(0.9979)


In [39]:
# Collect the dot products (attention scores) in a tensor of their own.
# torch.zeros allocates a zero-filled tensor that the loop then fills in.

attention_scores_2 = torch.zeros(len(inputs))
for idx, vec in enumerate(inputs):
    attention_scores_2[idx] = torch.dot(query, vec)
print(attention_scores_2)

tensor([-2.2439, -3.2709,  3.9250,  0.9979])


In [40]:
# Normalizing the attention scores with the softmax function: exponentiate each score, then divide by the sum of the exponentials, so every weight is positive and they all sum to 1
#def softmax(x):
    #return torch.exp(x) / torch.exp(x).sum(dim=0)

In [41]:
#use the real torch function!
# Make sure you add the dimension
attention_weights_2 = torch.softmax(attention_scores_2, dim = 0)
attention_weights_2

tensor([1.9816e-03, 7.0960e-04, 9.4662e-01, 5.0691e-02])

In [42]:
attention_weights_2.sum()

tensor(1.0000)

In [43]:
# Build the context vector for the query: the attention-weighted sum of all input vectors
context_vector_2 = torch.zeros(query.shape)
for weight, vec in zip(attention_weights_2, inputs):
    context_vector_2 += weight * vec
context_vector_2

tensor([-0.7080,  0.2844, -0.6469,  0.3004,  0.4120,  1.0251, -0.2161,  1.1210])

In [48]:
# Get the attention scores (dot products) of every input vector with every input vector via matrix multiplication (@)
attention_scores = inputs @ inputs.T
# inputs times the transpose of inputs; entry [i, j] is the dot product of vector i with vector j
attention_scores

tensor([[ 9.8050, -2.2534, -2.2439,  1.0368],
        [-2.2534, 14.6089, -3.2709,  0.2113],
        [-2.2439, -3.2709,  3.9250,  0.9979],
        [ 1.0368,  0.2113,  0.9979,  5.6010]])

In [None]:
# Normalize the attention scores into attention weights.
# dim=-1 applies softmax along the last dimension, so each row sums to 1.
attention_weights = torch.softmax(attention_scores, dim = -1)
attention_weights

tensor([[9.9983e-01, 5.7946e-06, 5.8497e-06, 1.5558e-04],
        [4.7509e-08, 1.0000e+00, 1.7174e-08, 5.5870e-07],
        [1.9816e-03, 7.0960e-04, 9.4662e-01, 5.0691e-02],
        [1.0165e-02, 4.4520e-03, 9.7762e-03, 9.7561e-01]])

In [46]:
attention_weights[0].sum()

tensor(1.0000)

In [47]:
# Context vectors for all inputs at once: row i is the sum of the input
# vectors weighted by row i of attention_weights.
context_vectors = attention_weights @ inputs
context_vectors

tensor([[-8.5453e-02, -1.0562e+00, -4.3048e-01,  7.0671e-01, -8.6172e-01,
          9.9524e-02, -1.6271e+00, -2.1434e+00],
        [ 2.8522e+00,  1.4012e+00, -7.4966e-01, -1.5911e+00,  2.7211e-01,
         -5.6087e-01,  6.6399e-01, -7.6662e-01],
        [-7.0804e-01,  2.8441e-01, -6.4692e-01,  3.0045e-01,  4.1196e-01,
          1.0251e+00, -2.1607e-01,  1.1210e+00],
        [ 2.6541e-01, -6.4551e-02, -2.3676e-01, -4.7272e-01, -1.2372e+00,
          1.8677e+00, -7.3449e-04, -6.2277e-03]])