Loading and reading in text data

### Creating Tokens

In [1]:
with open ("the-verdict.txt",'r',encoding="utf-8") as f :
    raw_text=f.read()
print("Total number of characters:",len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import re
text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)

print(result)


['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [3]:


result = re.split(r'([,.]|\s)', text)

print(result)



['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [4]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [5]:
text = "Hello, world. Is this-- a test?"

result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)


['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [6]:


preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])



['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [7]:
print(len(preprocessed))

4690


In [8]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)


1130


In [9]:
vocab = {token:integer for integer,token in enumerate(all_words)}


In [10]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break


('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [11]:
class SimpleTokenizerV1:
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str={i:s for s,i in vocab.items()}
    def encode(self,text):
        preprocessed= re.split(r'(,.:;?!"()\']|--|\s)',text)
        preprocessed=[item.strip() for item in preprocessed if item.strip()]
        ids=[self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self,ids):
        text=" ".join([self.int_to_str[i] for i in ids])
        text=re.sub(r'\s+([,.?!"()\'])',r'\1',text)
        return text

In [12]:
class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str={i:s for s,i in vocab.items()}
    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        preprocessed=[item.strip() for item in preprocessed if item.strip()]
        preprocessed=[
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]
        ids=[self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self,ids):
        text=" ".join([self.int_to_str[i] for i in ids])
        text=re.sub(r'\s+([,.?!"()\'])',r'\1',text)
        return text 

In [45]:
tokenizer= SimpleTokenizerV2(vocab)
text="Hello It's the last time I am going to see you"
ids=tokenizer.encode(text)
print(ids)

KeyError: '<|unk|>'

In [20]:
tokenizer.decode(ids)

"<|unk|> It' s the last time I am going to see you"

In [21]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}


## Bytepair Encoding simillar to Gpt-2

In [14]:
import tiktoken

In [15]:
tokenizer=tiktoken.get_encoding("gpt2")
text="Hi hello how are you? <|endoftext|> The world is a beautiful place"
integers=tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(integers)

[17250, 23748, 703, 389, 345, 30, 220, 50256, 383, 995, 318, 257, 4950, 1295]


In [16]:
string=tokenizer.decode(integers)
print(string)

Hi hello how are you? <|endoftext|> The world is a beautiful place


### Create input output pairs

In [17]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))



5145


In [18]:
enc_sample = enc_text[50:]

In [19]:
context_size = 4
#length of the input
# model looks at the 4 words and then predict the next word

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [20]:
for  i in range(1,context_size+1):
    context=enc_sample[:i]
    desired=enc_sample[i]
    print(context,"---->",desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [21]:
for  i in range(1,context_size+1):
    context=enc_sample[:i]
    desired=enc_sample[i]
    print(tokenizer.decode(context),"---->",tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


### Implementing a data loader

In [22]:
import torch 

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [23]:
from torch.utils.data import Dataset,DataLoader
class GPTDatasetV1(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride):
        self.input_ids=[] #input chunk 
        self.target_ids=[] #output chunk 

        token_ids=tokenizer.encode(txt,allowed_special={"<|endoftext|>"})
        assert len(token_ids)>max_length,"Number of token_ids must be atleast equal to max_length + 1"
        #sliding window 
        for i in range(0,len(token_ids)-max_length,stride): 
            input_chunk=token_ids[i:i+max_length]
            output_chunk=token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(output_chunk))
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self,idx):
        return self.input_ids[idx], self.target_ids[idx] #mapping dataset map each input to corresponding output

In [24]:
def create_dataloader_v1(txt,batch_size=4,  #batch_size=4 no of elements in a training batch , stride=128 means the amount of movement of the window
            max_length=256,stride=128,shuffle=True,drop_last=True,num_workers=0): # drop last is done so that the last batch of the training set may be ignored as if the batch size< 4 causes problem num_workers is the number of cpu threads that can be run simultaneously
            tokenizer=tiktoken.get_encoding("gpt2") #tokenizer initiated
            dataset=GPTDatasetV1(txt,tokenizer,max_length,stride) #dataset made
            dataloader= DataLoader( #create dataloader
                dataset,
                batch_size=batch_size,
                shuffle=shuffle,
                drop_last=drop_last,   #this dataloaderV1 and then from getitems it gives out a input output
                num_workers=num_workers
            )
            return dataloader

             

testing the dataloader

In [25]:
with open("the-verdict.txt","r",encoding="utf_8") as f:
    raw_text=f.read()

In [26]:
dataloader=create_dataloader_v1(
    raw_text,batch_size=1,max_length=4,stride=1,shuffle=False
)
data_iter=iter(dataloader)
first_batch=next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [27]:
tokenizer.decode([367,2885,1464,1807])

' HAD always thought'

In [28]:
second_batch=next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [29]:
dataloader=create_dataloader_v1(raw_text,batch_size=8,max_length=4,stride=4,shuffle=False)
data_iter=iter(dataloader)
inputs,targets=next(data_iter)
print("Inputs:\n",inputs)
print("Targets:\n",targets)
# each of the tensor contains 4 tokens since the max_length is also 4

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


### Token embeddings

In [30]:
input_ids = torch.tensor([2, 3, 5, 1]) #consider they are token ids 

In [31]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123) #generate random in any device
embedding_layer = torch.nn.Embedding(vocab_size, output_dim) #embedding layer of 6*3

In [32]:
print(embedding_layer.weight) #embedding layer is an efficient way to implement one hot encoding

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [33]:
print(embedding_layer(torch.tensor([3]))) #convert token id 3 into a vector of 3 dimension and print it 

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [34]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


### Positional Encoding

In [35]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


In [36]:


max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)



In [37]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)


Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [38]:
token_embedding=token_embedding_layer(inputs)
# print(token_embedding) # 50257 *256 50257 tokens and 256 dimensions
print(token_embedding.shape)

torch.Size([8, 4, 256])


In [39]:
context_length = max_length #another embedding layer because gpt 2 uses position embeddings
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
print(pos_embedding_layer.weight)

Parameter containing:
tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        [-0.6754, -0.4628,  1.4323,  ...,  0.8139, -0.7088,  0.4827]],
       requires_grad=True)


In [40]:
pos_embedding=pos_embedding_layer(torch.arange(max_length))
print(pos_embedding.shape)
print(pos_embedding)

torch.Size([4, 256])
tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        [-0.6754, -0.4628,  1.4323,  ...,  0.8139, -0.7088,  0.4827]],
       grad_fn=<EmbeddingBackward0>)


In [41]:
#input embedding will be posn embedding + token embedding 
input_embedding=pos_embedding+token_embedding
print(input_embedding.shape)
# print(input_embedding)

torch.Size([8, 4, 256])


### Attention

In [72]:
## we have completed preprocessing, tokenization , and token embeddings with positional embeddings
#simple self attention mechanism without trainable weights
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)


In [73]:
query = inputs[1]  # 2nd input token is the query

attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(x_i, query) # dot product (transpose not necessary here since they are 1-dim vectors)

print(attn_scores_2)


tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [74]:
attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()
#normalising
print("Attention weights:", attn_weights_2_tmp)
print("Sum:", attn_weights_2_tmp.sum())

Attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum: tensor(1.0000)


In [75]:
#softmax normalization
# e^x /(e^x1+....e^xt)
def softmax_naive(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)# summation of eac row   

attn_weights_2_naive = softmax_naive(attn_scores_2)

print("Attention weights:", attn_weights_2_naive)
print("Sum:", attn_weights_2_naive.sum())

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


In [76]:
#pytorch softmax because naive softmax can have underflow or overflow
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)
print("Attention weights:", attn_weights_2)
print("Sum:", attn_weights_2.sum())

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


In [77]:
query = inputs[1] # 2nd input token is the query

context_vec_2 = torch.zeros(query.shape)
for i,x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2[i]*x_i

print(context_vec_2)# final context vector

tensor([0.4419, 0.6515, 0.5683])


In [78]:
#this is for just 'journey' now lets do this for each text
attn_scores = torch.empty(len(inputs), len(inputs))

for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, x_j)

print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [82]:
#the above operation works but is very complex and not scalable
#but according to linear algebra
attn_score = inputs @ inputs.T
print(attn_score)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [83]:
attn_weights=torch.softmax(attn_scores,dim=-1)
print(attn_weights)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [87]:
all_context_vecs= attn_weights @ inputs
print(all_context_vecs) #literally the same scaling operation as before(write down and check)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


#### if we can do this then why do we need trainable weights? 
because there is meaning in the current sentence, like here there is almost no relation between one and journey but consider there might be in this specific context, so to represent that we need to have trainable weights

##### eg: the cat sat on the mat because it was warm -> if warm is the query then mat needs to have a higher attention score 

In [94]:
# implementing self-attention with trainable weights


x_2 = inputs[1] # second input element
d_in = inputs.shape[1] # the input embedding size, d=3
d_out = 2 # the output embedding size, d=2

#in gpt like models the input and output dimensions are same

In [95]:
#initialize weight matrices 
torch.manual_seed(123)

W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key   = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

#when using weight matrices for model training 

In [90]:


query_2 = x_2 @ W_query # _2 because it's with respect to the 2nd input element
key_2 = x_2 @ W_key 
value_2 = x_2 @ W_value

print(query_2)



tensor([0.4306, 1.4551])


In [91]:
keys = inputs @ W_key 
values = inputs @ W_value

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

keys.shape: torch.Size([6, 2])
values.shape: torch.Size([6, 2])


In [92]:


keys_2 = keys[1] # Python starts index at 0
attn_score_22 = query_2.dot(keys_2)
print(attn_score_22)



tensor(1.8524)
