# CSC 5611
# Transformer Week 10 Submission
Konrad Rozpadek

4/3/2025

In [96]:
import torch
import math
import json

In [97]:
torch.set_default_dtype(torch.float32)
if torch.cuda.is_available():
     torch.set_default_device(0)
     print("Running on the GPU")
else:
     print("Running on the CPU")

Running on the GPU


In [98]:
with open("bee20script.txt", "r") as file:
    data = file.read()
    data = data.replace('\n', ' ')
    
words = set(data.split(" "))
word_dict = {}
unique_words = len(words)
for i, word in enumerate(words):
    one_hot = torch.zeros(unique_words)
    one_hot[i] = 1
    word_dict[word] = one_hot


In [99]:
one_hot_encoded = []
for word in data.split(" "):
    one_hot_encoded.append(word_dict[word])

In [100]:
token_count = len(word_dict)
token_count

2924

In [101]:
def positional_encoding(E):
    num_tokens = E.size(0)
    encoding = torch.zeros(num_tokens, d_model)
    for pos in range(num_tokens):
        for i in range(0,d_model,2):
            encoding[pos, i] = math.sin(pos/(10000 ** ((2 * i) / d_model)))
            encoding[pos, i + 1] = math.cos(pos/(10000 ** ((2 * i) / d_model)))
    return encoding
    

In [103]:
class TransformerBlock():    
    def __init__(self, d_model, d_k, d_v, d_f_f, h):
        self.attention_scaling = 1/(math.sqrt(d_k))
        self.W_O = torch.randn(h*d_v, d_model) * 0.1
        self.W_O.requires_grad=True
        #tuples with weights in order Q, K, V for each head
        self.head_weights = []
        for i in range(h):
            W_Q = torch.randn(d_model, d_k) * 0.1
            W_Q.requires_grad = True
            W_K = torch.randn(d_model, d_k) * 0.1
            W_K.requires_grad = True
            W_V = torch.randn(d_model, d_v) * 0.1
            W_V.requires_grad = True
            self.head_weights.append((W_Q, W_K, W_V))
        self.rms_norm = torch.nn.RMSNorm(d_model)
        self.W_1 = torch.randn(d_model, d_f_f) * 0.1
        self.W_1.requires_grad=True
        self.W_2 = torch.randn(d_f_f, d_model) * 0.1
        self.W_2.requires_grad = True
        self.b_1 = torch.randn(1, d_f_f) * 0.1
        self.b_1.requires_grad=True
        self.b_2 = torch.randn(1, d_model) * 0.1
        self.b_2.requires_grad = True
        self.rms_norm = torch.nn.RMSNorm(d_model)


    def multi_head_attention(self, E):
        heads = []
        for weights in self.head_weights:
            Q_W = weights[0]
            K_W = weights[1]
            V_W = weights[2]

            Q = E @ Q_W
            K = E @ K_W
            V = E @ V_W
            heads.append(self.attention(Q, K, V))
        return torch.cat(heads, dim=1) @ self.W_O
    def attention(self, Q, K, V):
        y_1 = Q @ K.t()
        
        y_2 = self.attention_scaling * y_1
        
        y_3 = self.attention_mask(y_2)
        
        max_y_3 = torch.max(y_3, 1, keepdim=True)[0]
        exp_softmax = torch.exp(y_3-max_y_3)
        sum_softmax = torch.sum(exp_softmax, 1, keepdim=True)
        y_4 = exp_softmax/sum_softmax

        y_5 = y_4 @ V
        
        return y_5
    def feed_foward(self, input_):
        linear_1 = input_ @ self.W_1 + self.b_1
        relu = torch.max(torch.zeros(linear_1.size()), linear_1)
        linear_2 = relu @ self.W_2 + self.b_2
        return linear_2
    def attention_mask(self, input_):
        mask = torch.tril(input_, diagonal=0)
        return mask.masked_fill(mask == 0, float('-inf'))
    def add_and_norm(self, E, transformed_E):
        add = E + transformed_E
        return self.rms_norm(add)
    def foward(self, E):
        transformed_E = self.multi_head_attention(E)
        normed_tran_E = self.add_and_norm(E, transformed_E)
        feed_foward_E = self.feed_foward(normed_tran_E)
        output = self.add_and_norm(normed_tran_E, feed_foward_E)
        return output
    def step(self, learning_rate):
        for weights in self.head_weights:
            Q_W = weights[0]
            K_W = weights[1]
            V_W = weights[2]

            Q_W.data -= learning_rate * Q_W.grad
            K_W.data -= learning_rate * K_W.grad
            V_W.data -= learning_rate * V_W.grad
            Q_W.grad.zero_()
            K_W.grad.zero_()
            V_W.grad.zero_()
        self.W_1.data -= learning_rate * self.W_1.grad
        self.W_2.data -= learning_rate * self.W_2.grad
        self.b_1.data -= learning_rate * self.b_1.grad
        self.b_2.data -= learning_rate * self.b_2.grad
        self.W_1.grad.zero_()
        self.W_2.grad.zero_()
        self.b_1.grad.zero_()
        self.b_2.grad.zero_()
        

In [117]:
d_model = 512
h = 4
n = 6
d_k = int(d_model / h)
d_v = d_k
d_f_f = 4*d_model
num_epochs = 100
learning_rate = 0.01

#Setup of weight inputs
tran_blocks = []
for i in range(n):
        tran_blocks.append(TransformerBlock(d_model, d_k, d_v, d_f_f, h))
W_E = torch.randn(token_count, d_model) * 0.1
W_E.requires_grad=True

# Training

In [118]:
input_ = torch.stack(one_hot_encoded[0:500])
for i in range(num_epochs):
    #Output Embedding Layer
    embedding = input_ @ W_E 

    # Positional Encoding
    embedding += positional_encoding(embedding)

    # Decoder Block with n decoders
    E_run = embedding
    for tran_block in tran_blocks:
        E_run = tran_block.foward(E_run)
    tran_out = E_run

    #Final Linear Layer reusing embedding matrix
    final_linear = tran_out @ W_E.T

    #Softmax
    max_final = torch.max(final_linear, 1, keepdim=True)[0]
    exp_softmax = torch.exp(final_linear-max_final)
    sum_softmax = torch.sum(exp_softmax, 1, keepdim=True)
    final_output = exp_softmax/sum_softmax

    #Cross Entropy
    cross_entropy = -1*((input_*torch.log(final_output + 1e-9)).sum())
    cross_entropy.backward()
    
    #Stochastic gradient descent step
    W_E.data -= learning_rate * W_E.grad
    W_E.grad.zero_()
    for tran_block in tran_blocks:
        tran_block.step(learning_rate)

# Test after some training

In [122]:
def foward_tran(input_):
    embedding = input_ @ W_E 

    embedding += positional_encoding(embedding)

    E_run = embedding
    for tran_block in tran_blocks:
        E_run = tran_block.foward(E_run)

    tran_out = E_run

    final_linear = tran_out @ W_E.T

    max_final = torch.max(final_linear, 1, keepdim=True)[0]
    exp_softmax = torch.exp(final_linear-max_final)
    sum_softmax = torch.sum(exp_softmax, 1, keepdim=True)
    final_output = exp_softmax/sum_softmax
    print(list(words)[torch.argmax(final_output[-1])])
    return word_dict[list(words)[torch.argmax(final_output[-1])]]
prompt = torch.stack([word_dict["black"], word_dict["and"], word_dict["yellow"]])
for i in range(10):
    new_word = foward_tran(prompt)
    prompt = torch.stack([*prompt,new_word])

possession
possession
possession
possession
possession
possession
possession
possession
possession
possession
