In [19]:
import torch
import torch.nn.functional as F

# The mathematical trick in self-attention

In [2]:
# consider the following toy example:
torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch size, sequence length, and number of classes
x = torch.randn(B, T, C) # input sequence
x.shape

torch.Size([4, 8, 2])

In [4]:
# version 1
# We want x[b,t] = mean_{i<=t} x[b,i] for all b,t
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C)
        xbow[b,t] = torch.mean(xprev, dim=0) # (C,)

In [13]:
# version 2
wei = torch.tril(torch.ones((T, T))) # (T, T)
wei = wei / wei.sum(dim=1, keepdim=True) # (T, T)
xbow2 = wei @ x # (T, C) @ (B, T, C) -> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [22]:
# version 3: use Softmax
tril = torch.tril(torch.ones((T, T))) # (T, T)
wei = torch.zeros((T, T)) # (T, T)
wei = wei.masked_fill(tril == 0, float('-inf')) # (T, T)
wei = F.softmax(wei, dim=-1) # (T, T)
xbow3 = wei @ x # (T, C) @ (B, T, C) -> (B, T, C)

In [12]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [27]:
import torch.nn as nn

# version 4: self-attention!
torch.manual_seed(1337)
B, T, C = 4, 8, 32 # batch size, sequence length, and emb dim
x = torch.randn(B, T, C) # input sequence

# let's implement a single Head of self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)

wei = q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)



tril = torch.tril(torch.ones((T, T))) # (T, T)
wei = torch.zeros((T, T)) # (T, T)
wei = wei.masked_fill(tril == 0, float('-inf')) # (T, T)
wei = F.softmax(wei, dim=-1) # (T, T)

v = value(x) # (B, T, head_size)

out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [28]:
k.var()

tensor(0.3164, grad_fn=<VarBackward0>)

In [29]:
wei.var()

tensor(0.0273)

In [3]:
quotes = [
    1.2,
    6.5,
    12.0
]

In [5]:
# Compute the best possible return on investment (ROI) for the given quotes list
# The amount to invest is 50 dollars
# The best possible ROI is 12.0
# The dollars can be split in any way among the quotes
import random

for i in range(100):
    random.seed(i)
    quotes = [random.random() * 10 for _ in range(100)]
    amount = 50
    best_roi = 0
    for i in range(len(quotes)):
        for j in range(i+1, len(quotes)):
            roi = quotes[j] / quotes[i]
            if roi > best_roi:
                best_roi = roi
    print(best_roi)

871.754464398402
471.28122924930483
41.8593451006471
75.8284922354904
83.40855450323167
561.5798303176996
2213.5170424448693
42.4471150909322
82.24714247519591
155.08249939894955
238.87639026410386
1815.4421458356044
149.60404951885167
68.29644870284929
51.44487638462605
85.78657251727243
102.9197389742889
101.07086503018294
149.2374891936262
137.81200078275265
476.31533572953214
313.63629766994995
42.28592614208186
42.298761032962716
52.64022282209242
58.53996822573236
128.7984217845519
258.18927818034996
276.792449647354
128.28941384112343
126.79705846781744
81.1470610829842
1323.2338005636861
150.9131498228845
17.24824602849145
241.61472662632283
2481.106179373472
393.7196541019415
85.15148844801142
1859.9063923681458
49.624388183978844
44.66376257797107
153.4966141097154
392.615986809046
142.58727766478265
88.03559102589642
966.9462112080839
270.80435402425474
1320.1805188457563
21.759496992843324
145.07688928126015
64.46665259612502
231.96086163318336
29.790848096992697
77.7851836