In [1]:
import torch
import math
import json

In [2]:
torch.set_default_dtype(torch.float32)
if torch.cuda.is_available():
     torch.set_default_device(0)
     print("Running on the GPU")
else:
     print("Running on the CPU")

Running on the GPU


In [3]:
with open("bee20script.txt", "r") as file:
    data = file.read()
    data = data.replace('\n', ' ')
    
words = set(data.split(" "))
word_dict = {}
unique_words = len(words)
for i, word in enumerate(words):
    one_hot = torch.zeros(unique_words)
    one_hot[i] = 1
    word_dict[word] = one_hot


In [4]:
one_hot_encoded = []
for word in data.split(" "):
    one_hot_encoded.append(word_dict[word])

In [5]:
token_count = len(word_dict)
token_count

2924

In [6]:
d_model = 512
h = 4
n = 6
d_k = int(d_model / h)
d_v = d_k
attention_scaling = 1/(math.sqrt(d_k))
d_f_f = 4*d_model

In [7]:
input_ = torch.stack(one_hot_encoded[0:5])
input_.size()

torch.Size([5, 2924])

In [8]:
input_ = torch.stack(one_hot_encoded[0:5])
W_E = torch.randn(token_count, d_model, requires_grad=True)
embedding = input_ @ W_E 
embedding

tensor([[-1.5349, -0.1775, -2.5538,  ...,  0.9493,  0.5778,  2.0673],
        [ 1.3157, -1.3176,  1.0956,  ...,  0.4688,  0.1107, -0.2173],
        [ 0.3226,  0.0986,  0.8342,  ...,  1.0390, -0.9386,  1.0450],
        [ 1.5664, -0.7654, -0.0683,  ...,  0.2312, -0.7372, -0.3197],
        [-0.0464,  0.7819,  0.5740,  ..., -0.5508, -1.5316,  1.0995]],
       device='cuda:0', grad_fn=<MmBackward0>)

In [9]:
embedding.size()

torch.Size([5, 512])

In [10]:
def positional_encoding(E):
    num_tokens = E.size(0)
    encoding = torch.zeros(num_tokens, d_model)
    for pos in range(num_tokens):
        for i in range(0,d_model,2):
            encoding[pos, i] = math.sin(pos/(10000 ** ((2 * i) / d_model)))
            encoding[pos, i + 1] = math.cos(pos/(10000 ** ((2 * i) / d_model)))
    return encoding
positional_encoding(embedding)
    

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.0196e-01,  ...,  1.0000e+00,
          1.0746e-08,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.5814e-01,  ...,  1.0000e+00,
          2.1492e-08,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  3.4278e-01,  ...,  1.0000e+00,
          3.2238e-08,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01, -5.4861e-01,  ...,  1.0000e+00,
          4.2984e-08,  1.0000e+00]], device='cuda:0')

In [11]:
embedding += positional_encoding(embedding)

In [12]:
W_O = torch.randn(h*d_v, d_model, requires_grad=True)
#tuples with weights in order Q, K, V for each head
head_weights = []
for i in range(h):
    W_Q = torch.randn(d_model, d_k, requires_grad=True)
    W_K = torch.randn(d_model, d_k, requires_grad=True)
    W_V = torch.randn(d_model, d_v, requires_grad=True)
    head_weights.append((W_Q, W_K, W_V))

In [13]:
def attention_mask(input_):
    mask = torch.tril(input_, diagonal=0)
    return mask.masked_fill(mask == 0, float('-inf'))

In [14]:
def attention(Q, K, V):
    y_1 = Q @ K.t()
    
    y_2 = attention_scaling * y_1
    
    y_3 = attention_mask(y_2)
    
    max_y_3 = torch.max(y_3, 1, keepdim=True)[0]
    exp_softmax = torch.exp(y_3-max_y_3)
    sum_softmax = torch.sum(exp_softmax, 1, keepdim=True)
    y_4 = exp_softmax/sum_softmax

    y_5 = y_4 @ V
    
    return y_5


In [15]:
def multi_head_attention(E):
    heads = []
    for weights in head_weights:
        Q_W = weights[0]
        K_W = weights[1]
        V_W = weights[2]

        Q = E @ Q_W
        K = E @ K_W
        V = E @ V_W
        heads.append(attention(Q, K, V))
    return torch.cat(heads, dim=1) @ W_O

In [16]:
rms_norm = torch.nn.RMSNorm(d_model)
def add_and_norm(E, transformed_E):
    add = E + transformed_E
    return rms_norm(add)

In [17]:
add_and_norm(embedding, multi_head_attention(embedding))

tensor([[-0.8081,  0.2747,  1.3606,  ..., -1.4847, -1.5409, -0.8398],
        [-1.4332,  0.9373,  0.4449,  ..., -0.4562, -1.4409, -3.1927],
        [-1.5927,  0.7038,  1.7941,  ..., -1.4491, -0.9935, -1.5444],
        [-0.3765, -0.7241,  0.9261,  ..., -1.7439, -1.3674, -1.8242],
        [-0.1677,  0.4602,  0.5731,  ..., -0.1087, -2.5208, -2.2324]],
       device='cuda:0', grad_fn=<MulBackward0>)

In [18]:
add_and_norm(embedding, multi_head_attention(embedding)).size()

torch.Size([5, 512])

In [19]:
def feed_foward(input_, W_1, W_2, b_1, b_2):
    linear_1 = input_ @ W_1 + b_1
    relu = torch.max(torch.zeros(linear_1.size()), linear_1)
    linear_2 = relu @ W_2 + b_2
    return linear_2



In [20]:
class TransformerBlock():    
    def __init__(self, d_model, d_k, d_v, d_f_f, h):
        self.attention_scaling = 1/(math.sqrt(d_k))
        self.W_O = torch.randn(h*d_v, d_model, requires_grad=True)
        #tuples with weights in order Q, K, V for each head
        self.head_weights = []
        for i in range(h):
            W_Q = torch.randn(d_model, d_k, requires_grad=True)
            W_K = torch.randn(d_model, d_k, requires_grad=True)
            W_V = torch.randn(d_model, d_v, requires_grad=True)
            self.head_weights.append((W_Q, W_K, W_V))
        self.rms_norm = torch.nn.RMSNorm(d_model)
        self.W_1 = torch.randn(d_model, d_f_f, requires_grad=True)
        self.W_2 = torch.randn(d_f_f, d_model, requires_grad=True)
        self.b_1 = torch.randn(1, d_f_f, requires_grad=True)
        self.b_2 = torch.randn(1, d_model, requires_grad=True)

        self.rms_norm = torch.nn.RMSNorm(d_model)


    def multi_head_attention(self, E):
        heads = []
        for weights in self.head_weights:
            Q_W = weights[0]
            K_W = weights[1]
            V_W = weights[2]

            Q = E @ Q_W
            K = E @ K_W
            V = E @ V_W
            heads.append(self.attention(Q, K, V))
        return torch.cat(heads, dim=1) @ self.W_O
    def attention(self, Q, K, V):
        y_1 = Q @ K.t()
        
        y_2 = self.attention_scaling * y_1
        
        y_3 = self.attention_mask(y_2)
        
        max_y_3 = torch.max(y_3, 1, keepdim=True)[0]
        exp_softmax = torch.exp(y_3-max_y_3)
        sum_softmax = torch.sum(exp_softmax, 1, keepdim=True)
        y_4 = exp_softmax/sum_softmax

        y_5 = y_4 @ V
        
        return y_5
    def feed_foward(self, input_):
        linear_1 = input_ @ self.W_1 + self.b_1
        relu = torch.max(torch.zeros(linear_1.size()), linear_1)
        linear_2 = relu @ self.W_2 + self.b_2
        return linear_2
    def attention_mask(self, input_):
        mask = torch.tril(input_, diagonal=0)
        return mask.masked_fill(mask == 0, float('-inf'))
    def add_and_norm(self, E, transformed_E):
        add = E + transformed_E
        return self.rms_norm(add)
    def foward(self, E):
        transformed_E = self.multi_head_attention(E)
        normed_tran_E = self.add_and_norm(E, transformed_E)
        feed_foward_E = self.feed_foward(normed_tran_E)
        output = self.add_and_norm(normed_tran_E, feed_foward_E)
        return output
    def step(learning_rate):
        #redo with adam
        for weights in self.head_weights:
            Q_W = weights[0]
            K_W = weights[1]
            V_W = weights[2]

            Q_W.data -= learning_rate * Q_W.grad
            K_W.data -= learning_rate * K_W.grad
            V_W.data -= learning_rate * V_W.grad
            Q_W.grad.zero_()
            K_W.grad.zero_()
            V_W.grad.zero_()
        self.W_1.data -= learning_rate * self.W_1.grad
        self.W_2.data -= learning_rate * self.W_2.grad
        self.b_1.data -= learning_rate * self.b_1.grad
        self.b_2.data -= learning_rate * self.b_2.grad
        self.W_1.grad.zero_()
        self.W_2.grad.zero_()
        self.b_1.grad.zero_()
        self.b_2.grad.zero_()
        

In [21]:
tran_block = TransformerBlock(d_model, d_k, d_v, d_f_f, h)
tran_block.foward(embedding).size()

torch.Size([5, 512])

In [22]:
tran_blocks = []
for i in range(n):
    tran_blocks.append(TransformerBlock(d_model, d_k, d_v, d_f_f, h))

E_run = embedding
for tran_block in tran_blocks:
    E_run = tran_block.foward(E_run)

tran_out = E_run

tran_out



tensor([[-0.9059,  0.5338, -2.3796,  ...,  0.5336,  0.2879, -0.4246],
        [-0.8938,  0.7856, -2.0914,  ...,  0.6564,  0.1267, -0.3184],
        [-0.8625,  0.6496, -1.9539,  ...,  0.4155,  0.0739, -0.3781],
        [-0.7270,  0.8884, -1.9329,  ...,  0.5687, -0.1540, -0.2023],
        [-0.7270,  0.8884, -1.9329,  ...,  0.5687, -0.1540, -0.2023]],
       device='cuda:0', grad_fn=<MulBackward0>)

In [23]:
tran_out.size()

torch.Size([5, 512])

In [24]:
final_linear = tran_out @ W_E.T

In [25]:
final_linear.size()

torch.Size([5, 2924])

In [26]:
max_final = torch.max(final_linear, 1, keepdim=True)[0]
exp_softmax = torch.exp(final_linear-max_final)
sum_softmax = torch.sum(exp_softmax, 1, keepdim=True)
final_output = exp_softmax/sum_softmax

In [27]:
final_output

tensor([[7.5428e-35, 1.5197e-39, 2.0894e-25,  ..., 1.2045e-31, 2.0908e-28,
         1.6687e-41],
        [1.2802e-33, 5.1568e-43, 2.9224e-25,  ..., 7.5607e-30, 5.5299e-27,
         1.3816e-40],
        [2.6075e-35, 7.4269e-44, 1.1472e-25,  ..., 2.0649e-32, 1.1225e-27,
         1.1467e-39],
        [5.4939e-31, 4.2036e-41, 1.4182e-25,  ..., 2.2451e-31, 5.5102e-26,
         1.8749e-36],
        [5.4885e-31, 4.2082e-41, 1.4168e-25,  ..., 2.2438e-31, 5.4994e-26,
         1.8757e-36]], device='cuda:0', grad_fn=<DivBackward0>)

In [28]:
output_vals = torch.argmax(final_output, axis = 1)
for val in output_vals:
    print(list(words)[val]) 

church.
let's
let's
let's
let's


In [51]:
d_model = 512
h = 4
n = 6
d_k = int(d_model / h)
d_v = d_k
attention_scaling = 1/(math.sqrt(d_k))
d_f_f = 4*d_model
num_epochs = 10
learning_rate = 0.01
input_ = torch.stack(one_hot_encoded[0:500])

tran_blocks = []
for i in range(n):
        tran_blocks.append(TransformerBlock(d_model, d_k, d_v, d_f_f, h))
W_E = torch.randn(token_count, d_model, requires_grad=True)

for i in range(num_epochs):
    embedding = input_ @ W_E 

    embedding += positional_encoding(embedding)

    E_run = embedding
    for tran_block in tran_blocks:
        E_run = tran_block.foward(E_run)

    tran_out = E_run

    final_linear = tran_out @ W_E.T

    max_final = torch.max(final_linear, 1, keepdim=True)[0]
    exp_softmax = torch.exp(final_linear-max_final)
    sum_softmax = torch.sum(exp_softmax, 1, keepdim=True)
    final_output = exp_softmax/sum_softmax
    cross_entropy = -1*((input_*torch.log(final_output + 1e-9)).sum())
    cross_entropy.backward()
    W_E.data -= learning_rate * W_E.grad
    W_E.grad.zero_()





KeyboardInterrupt: 

In [50]:
def foward_tran(input_):
    embedding = input_ @ W_E 

    embedding += positional_encoding(embedding)

    E_run = embedding
    for tran_block in tran_blocks:
        E_run = tran_block.foward(E_run)

    tran_out = E_run

    final_linear = tran_out @ W_E.T

    max_final = torch.max(final_linear, 1, keepdim=True)[0]
    exp_softmax = torch.exp(final_linear-max_final)
    sum_softmax = torch.sum(exp_softmax, 1, keepdim=True)
    final_output = exp_softmax/sum_softmax
    print(list(words)[torch.argmax(final_output[-1])])
    return word_dict[list(words)[torch.argmax(final_output[-1])]]
prompt = torch.stack([word_dict["black"], word_dict["and"], word_dict["yellow"]])
for i in range(10):
    new_word = foward_tran(prompt)
    prompt = torch.stack([*prompt,new_word])



aviation,
aviation,
aviation,
aviation,
aviation,
aviation,
aviation,
aviation,
aviation,
aviation,
