In [8]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import random as rnd
# ---- Hyperparameters ----

# Batching and context window
batch_size = 64     # number of independent sequences processed in parallel
block_size = 256    # maximum context length used for predictions

# Iteration counts and step size (presumably consumed by a training loop
# defined later in the notebook -- not visible from this cell)
max_iters = 5000
eval_interval = 500
eval_iters = 200
learning_rate = 3e-4

# Model dimensions and dropout rate
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

#### Important notes in transformers
1. In the previous 'bigram' code, every token used only its own information (stored in its 'channel' dimension) to predict the next token, which wasn't right or accurate enough.

2. Here we want to create a kind of connection (communication) between tokens, so that each token uses two things to predict the next token: its own info (stored in its channels) and the info of the previous tokens (the channels of the previous tokens).

3. Two important notes. First, each token should only have access to the previous tokens, not the future ones; we must enforce that in our code. Second, to gather the information (the channels) of the previous tokens, we calculate the average (the mean) of those channels. (The result is like a feature vector that combines the info of the current token with the mean of the previous tokens' channels.)

Coming later:
4. Although averaging is a good way to gather info, it loses a very important feature: the spatial arrangement of the tokens, i.e. their order.

In [3]:
# Example of what we want to do: for every position t, average the rows
# 0..t (the current row and all previous ones) of each batch element.
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)  # random tensor with dimensions (B, T, C)

xbow = torch.zeros((B, T, C))  # xbow refers to "bag of words"

# Deliberately written as explicit loops: this is the slow reference
# version that the matrix-multiplication trick is compared against later.
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]                # rows 0..t, including the current one -> (t+1, C)
        xbow[b, t] = torch.mean(xprev, 0)   # average over those rows -> (C,)

# The difference between the output of x and xbow:
#     - x: holds random values
#     - xbow: holds the average of each element and all its previous elements
# That's why the first row is equal in both tensors, but starting from the
# second row xbow holds the average of the first 2 rows of x, and so on.
# (These notes are real comments rather than a trailing string literal, so
# the cell no longer echoes the text as its output value.)
print(x[0])
print(xbow[0])

tensor([[ 1.1711, -0.3062],
        [-1.0709, -0.0416],
        [ 0.8544, -0.9220],
        [ 0.7695,  0.5506],
        [-0.5017, -1.4585],
        [-1.4429, -0.0798],
        [-0.9312, -1.9201],
        [-0.5831, -0.2524]])
tensor([[ 1.1711, -0.3062],
        [ 0.0501, -0.1739],
        [ 0.3182, -0.4233],
        [ 0.4310, -0.1798],
        [ 0.2445, -0.4355],
        [-0.0368, -0.3763],
        [-0.1645, -0.5968],
        [-0.2169, -0.5538]])


'\nThe difference between the output of x and xbow:\n    - x container: holds random values \n    - xbow: holds the average of each element and all its previous elements\n'

#### The mathematical trick of average
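- If you left-multiply any matrix by a lower-triangular matrix of ones whose rows are normalized to sum to 1 (a "triangular averaging matrix"), each row of the output is the average of that row and all previous rows of the matrix, which is exactly what we want to do with the tokens' channels.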

In [20]:
# Build the averaging matrix: start from a lower-triangular matrix of ones,
# then divide each row by its own sum so that every row sums to 1.
a = torch.ones((3, 3)).tril()
a = a / a.sum(dim=1, keepdim=True)

# The matrix whose running row-averages we want (random integers in [0, 10))
b = torch.randint(0, 10, (3, 2)).float()

print(a)
print(b)

# Row i of the product is the mean of rows 0..i of b
c = torch.matmul(a, b)
c


tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[5., 6.],
        [5., 1.],
        [5., 4.]])


tensor([[5.0000, 6.0000],
        [5.0000, 3.5000],
        [5.0000, 3.6667]])

In [21]:
# Apply the same principle to x to build xbow without Python loops

# Lower-triangular (T, T) matrix of ones, then row-normalized so that
# row t holds equal averaging weights over columns 0..t
ones = torch.ones(T, T).tril()
ones = ones / ones.sum(dim=1, keepdim=True)

# (T, T) @ (B, T, C): the matmul broadcasts over the leading dimension of x
xbow2 = torch.matmul(ones, x)

# True means both results agree (within floating-point tolerance),
# i.e. the matrix trick reproduces the loop-based xbow
torch.allclose(xbow, xbow2)

True