In [1]:
# Attention Mechanism : Transformer 

In [3]:
import math
import random

# maths 
def dot(v1, v2):
    return sum([x * y for x, y in zip(v1, v2)])

def matmul(A, B):
    return [[dot(row, col) for col in zip(*B)] for row in A]

def relu(x):
    return [max(0, val) for val in x]

def softmax(x):
    max_val = max(x)
    exps = [math.exp(i - max_val) for i in x]
    sum_exps = sum(exps)
    return [j / sum_exps for j in exps]

def add_vectors(a, b):
    return [x + y for x, y in zip(a, b)]

def transpose(m):
    return list(map(list, zip(*m)))

# encoding
def positional_encoding(length, d_model):
    pe = []
    for pos in range(length):
        row = []
        for i in range(d_model):
            angle = pos / (10000 ** (2 * (i // 2) / d_model))
            if i % 2 == 0:
                row.append(math.sin(angle))
            else:
                row.append(math.cos(angle))
        pe.append(row)
    return pe


def linear(input_vec, weight, bias):
    output = []
    for w_row, b in zip(weight, bias):
        output.append(dot(input_vec, w_row) + b)
    return output


def scaled_dot_attention(Q, K, V):
    d_k = len(Q[0])
    scores = matmul(Q, transpose(K))
    scaled_scores = [[x / math.sqrt(d_k) for x in row] for row in scores]
    attn_weights = [softmax(row) for row in scaled_scores]
    output = matmul(attn_weights, V)
    return output


def feed_forward(x, w1, b1, w2, b2):
    x1 = [relu(linear(vec, w1, b1)) for vec in x]
    x2 = [linear(vec, w2, b2) for vec in x1]
    return x2


def transformer_encoder(x, QW, KW, VW, OW, OB, FF1W, FF1B, FF2W, FF2B):
    # Self Attention
    Q = [linear(vec, QW, [0]*len(QW)) for vec in x]
    K = [linear(vec, KW, [0]*len(KW)) for vec in x]
    V = [linear(vec, VW, [0]*len(VW)) for vec in x]
    attn_out = scaled_dot_attention(Q, K, V)
    
    # Add & Norm (skipped real normalization for simplicity)
    x = [add_vectors(a, b) for a, b in zip(x, attn_out)]
    
    # Feedforward
    ff_out = feed_forward(x, FF1W, FF1B, FF2W, FF2B)
    
    # Add & Norm
    x = [add_vectors(a, b) for a, b in zip(x, ff_out)]
    return x



if __name__ == "__main__":
    seq_len = 3
    d_model = 4

    # Dummy input (sequence of 3 vectors of size 4)
    x = [[random.random() for _ in range(d_model)] for _ in range(seq_len)]

    # Random weights
    QW = [[random.random() for _ in range(d_model)] for _ in range(d_model)]
    KW = [[random.random() for _ in range(d_model)] for _ in range(d_model)]
    VW = [[random.random() for _ in range(d_model)] for _ in range(d_model)]
    OW = [[random.random() for _ in range(d_model)] for _ in range(d_model)]
    OB = [0] * d_model

    FF1W = [[random.random() for _ in range(d_model)] for _ in range(8)]
    FF1B = [0] * 8
    FF2W = [[random.random() for _ in range(8)] for _ in range(d_model)]
    FF2B = [0] * d_model

    output = transformer_encoder(x, QW, KW, VW, OW, OB, FF1W, FF1B, FF2W, FF2B)
    print("Transformer Output:")
    for row in output:
        print([round(val, 3) for val in row])


Transformer Output:
[22.299, 7.609, 12.649, 15.634]
[22.256, 8.09, 12.248, 15.595]
[21.44, 7.615, 12.711, 15.377]
