In [1]:
import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
names = open('../data/names.txt', 'r').read().splitlines()
names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
chs = sorted(set(''.join(names)))
stoi = {s: i+1 for i, s in enumerate(chs)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}

In [4]:
#create data set
X, y = [], []

block_size = 3
emb_size = 2
emb_cat_size = block_size * emb_size


for name in names[:5]:
    
    context = [0] * block_size
    
    for ch in name + '.':
        
        idx = stoi[ch]
        
        X.append(context)
        y.append(idx)
        
        
        context = context[1:] + [idx]      
        
X = torch.tensor(X)
y = torch.tensor(y)

# X

In [5]:
C = torch.randn(27, emb_size)

In [6]:
X.shape

torch.Size([32, 3])

In [7]:
emb = C[X]
emb[:1]

tensor([[[1.1661, 0.6062],
         [1.1661, 0.6062],
         [1.1661, 0.6062]]])

In [50]:
emb = C[X].view(-1, emb_cat_size)

In [51]:
W1 = torch.randn(emb_cat_size, 100) #6 x 100
b1 = torch.randn(100) #1 x 100

In [52]:
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

In [53]:
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True

In [54]:
h = torch.tanh(emb @ W1 + b1) # 32 x 100

In [31]:
f"# of parameters = {sum(p.nelement() for p in parameters)}"

'# of parameters = 3481'

In [55]:
logits = h @ W2 + b2

loss = F.cross_entropy(logits, y)
loss

tensor(17.8374, grad_fn=<NllLossBackward0>)

In [56]:
for p in parameters:
    p.grad = None

In [57]:
loss.backward()

In [58]:
for p in parameters:
    p.data += -0.01 * p.grad

In [59]:
print(loss.item())

17.837352752685547


In [48]:
#mine combined - works, but above same code doesn't. Because defining the weights?
emb = C[X].view(-1, emb_cat_size)
W1 = torch.randn(emb_cat_size, 100) #6 x 100
b1 = torch.randn(100) #1 x 100
h = torch.tanh(emb @ W1 + b1) # 32 x 100
W2 = torch.randn(100, 27)
b2 = torch.randn(27)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, y)
print(loss.item())
for p in parameters:
    p.grad = None
loss.backward()
for p in parameters:
    p.data += -0.01 * p.grad

13.807954788208008


TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [142]:
W1 = torch.randn(emb_cat_size, 100) #6 x 100
b1 = torch.randn(100) #1 x 100

W2 = torch.randn(100, 27)
b2 = torch.randn(27)

parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad = True
    
for p in parameters:
    p = p.cpu()

In [143]:
#mine combined - works, but above same code doesn't. Because defining the weights?
for i in range(10000):
    
    emb = C[X].view(-1, emb_cat_size)

    h = torch.tanh(emb @ W1 + b1) # 32 x 100

    logits = h @ W2 + b2

    loss = F.cross_entropy(logits, y)
    if i % 1000 == 0:
        print(loss.item())

    for p in parameters:
        p.grad = None

    loss.backward()

    for p in parameters:
        p.data += -0.01 * p.grad

14.539395332336426
0.2939387559890747
0.26897290349006653
0.2625206708908081
0.25955408811569214
0.2578466832637787
0.2567363977432251
0.2559562027454376
0.25537773966789246
0.2549316883087158


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

C = torch.randn(27, emb_size, device=device)

W1 = torch.randn(emb_cat_size, 100, device=device) #6 x 100
b1 = torch.randn(100, device=device) #1 x 100

W2 = torch.randn(100, 27, device=device)
b2 = torch.randn(27, device=device)

parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad = True

In [146]:
for i in range(10000):
    
    emb = C[X].view(-1, emb_cat_size).to(device)

    h = torch.tanh(emb @ W1 + b1) # 32 x 100

    logits = h @ W2 + b2

    loss = F.cross_entropy(logits, y.to(device))
    if i % 1000 == 0:
        print(loss.item())

    for p in parameters:
        p.grad = None

    loss.backward()

    for p in parameters:
        p.data += -0.01 * p.grad.to(device)

0.2552911341190338
0.25491511821746826
0.2546055018901825
0.2543465793132782
0.2541268765926361
0.25393787026405334
0.253773957490921
0.2536304295063019
0.253503680229187
0.25339099764823914


In [36]:
# #mine combined - works, but above same code doesn't. Because defining the weights?
# emb = C[X].view(-1, emb_cat_size)
# h = torch.tanh(emb @ W1 + b1) # 32 x 100
# logits = h @ W2 + b2
# loss = F.cross_entropy(logits, y)
# print(loss.item())
# for p in parameters:
#     p.grad = None
# loss.backward()
# for p in parameters:
#     p.data += -0.01 * p.grad

14.507288932800293


In [None]:
# for _ in range(1):
#     #forward pass
#     emb = C[X] # (32, 3, 2)
#     h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
#     logits = h @ W2 + b2
#     loss = F.cross_entropy(logits, y)
#     print(loss.item())
#     #backward pass
#     for p in parameters:
#         p.grad = None
#     loss.backward()
#     for p in parameters:
#         p.data += -0.1 * p.grad