In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
class CustomEmbeddingLayer(nn.Module):
    """Embedding table stored as a single learnable weight matrix.

    The table has ``vocab_size`` rows and ``embed_dim`` columns and is
    initialised with samples from a standard normal distribution.

    Args:
        vocab_size: Number of rows in the table (one per token).
        embed_dim: Length of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Parameter(torch.randn(vocab_size, embed_dim))

    def forward(self, x):
        """Map tokens to their embedding vectors.

        Args:
            x: Either a floating-point tensor whose last dimension is
                ``vocab_size`` (for example one-hot rows), or an integer
                tensor (``torch.int32`` / ``torch.int64``) of token ids.

        Returns:
            For floating-point input, the matrix product ``x @ embedding``.
            For integer input, the table rows selected by the ids, with
            shape ``x.shape + (embed_dim,)``. Both forms give the same
            vectors when the float input is the one-hot encoding of the ids.
        """
        if x.dtype in (torch.int32, torch.int64):
            # Integer ids: gather the rows directly instead of multiplying by
            # a dense one-hot matrix. matmul would reject an int/float dtype
            # mix, so this path only adds behaviour that used to raise.
            return self.embedding[x.long()]
        return torch.matmul(x, self.embedding)

# sample model: a custom embedding layer followed by a linear layer
class SimpleModel(nn.Module):
    """Toy network: an embedding layer followed by a linear projection.

    Args:
        vocab_size: Vocabulary size; used for the embedding table and as the
            output width of the linear layer.
        embed_dim: Embedding width; also the input width of the linear layer.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Embedding first, then the projection (same creation order as before,
        # so parameter initialisation draws from the RNG in the same sequence).
        self.embedding_layer = CustomEmbeddingLayer(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Run ``x`` through the embedding layer and then the linear layer.

        Args:
            x: Input passed unchanged to ``CustomEmbeddingLayer``.

        Returns:
            The linear layer's output; its last dimension is ``vocab_size``.
        """
        return self.linear(self.embedding_layer(x))

# Model parameters.
# The token -> id vocabulary is taken from the BPE tokenizer we coded in
# /tokenizer/BPE.ipynb.
vocabs = {
    'w': 0, 'f': 1, 's</w>': 2, 'd': 3, 'h': 4,
    'the': 5, 'c': 6, 'a': 7, 'r': 8, 'o': 9,
    'k': 10, 'in': 11, 'u': 12, 'b': 13, '.</w>': 14,
    'n': 15, 'l': 16, 'e</w>': 17, 'e': 18, 'g': 19,
    'x': 20, '-': 21, 'an': 22, 'he': 23, 'm': 24,
    '</w>': 25, ',</w>': 26, 'p': 27, 'v': 28, 'i': 29,
    'd</w>': 30, 'y': 31, 's': 32, 't': 33, 'er': 34,
}
# Number of distinct tokens in the vocabulary.
vocab_size = len(vocabs)
# Width of each embedding vector.
embed_dim = 4

# Seed PyTorch's global RNG before any weights are created, so the randomly
# initialised parameters (and therefore the training run that follows) are
# repeatable on every Restart & Run All with the same PyTorch build.
torch.manual_seed(0)

# initiate model
model = SimpleModel(vocab_size, embed_dim)

# loss function and optimizer
# CrossEntropyLoss takes raw logits plus integer class targets.
criterion = nn.CrossEntropyLoss()
# Plain SGD over every trainable parameter of the model.
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Toy training batch: 2 sequences of 4 tokens each.
# Inputs are one-hot encoded for simplicity: indexing the identity matrix with
# a (2, 4) tensor of token ids yields a (2, 4, vocab_size) one-hot tensor.
sample_input = torch.eye(vocab_size)[
    torch.tensor([[1, 2, 3, 4], [4, 3, 2, 1]])
]
# Targets stay as integer class ids, one per input position.
sample_target = torch.tensor(
    [[2, 3, 4, 5], [5, 4, 3, 2]],
    dtype=torch.long,
)

# training loop
num_epochs = 100  # used for both the loop bound and the progress label
log_every = 10    # print the loss once every this many epochs

# The train/eval mode never changes inside the loop, so set it once up front.
model.train()

for epoch in range(num_epochs):
    # Clear gradients accumulated by the previous step.
    optimizer.zero_grad()

    # forward pass
    output = model(sample_input)

    # compute loss: flatten to (tokens, vocab_size) logits and (tokens,)
    # class ids, the layout CrossEntropyLoss expects
    loss = criterion(output.view(-1, vocab_size), sample_target.view(-1))

    # backward pass and parameter update
    loss.backward()
    optimizer.step()

    if epoch % log_every == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 3.7782
Epoch [11/100], Loss: 3.7021
Epoch [21/100], Loss: 3.6275
Epoch [31/100], Loss: 3.5542
Epoch [41/100], Loss: 3.4820
Epoch [51/100], Loss: 3.4110
Epoch [61/100], Loss: 3.3410
Epoch [71/100], Loss: 3.2718
Epoch [81/100], Loss: 3.2036
Epoch [91/100], Loss: 3.1360


In [3]:
# Take the trained embedding table out of the autograd graph for inspection.
# `.detach()` is used instead of the legacy `.data` accessor: both return a
# tensor sharing storage with the parameter, but in-place edits made through
# `.data` are invisible to autograd's correctness checks.
learned_embeddings = model.embedding_layer.embedding.detach()
print("Learned Embeddings:", learned_embeddings)
print(f"Learned Embedding size: {learned_embeddings.shape}")

Learned Embeddings: tensor([[-1.0888, -0.2824,  0.4090,  0.6687],
        [ 0.9902,  0.1060, -0.1460,  0.9678],
        [ 1.1455,  0.5316, -0.0126,  0.1891],
        [-0.4129, -0.5683,  0.2142, -0.6977],
        [-1.2059,  0.6726, -0.5572,  0.1176],
        [ 0.7144, -2.4413,  0.4394,  1.0270],
        [-1.6094, -0.5242,  0.4525, -0.1211],
        [-0.1593, -0.2036,  0.9511, -0.4252],
        [ 0.7152, -0.6197,  0.7950,  1.1820],
        [-0.0077, -1.6696, -1.3080,  0.3963],
        [-0.8808,  0.1096,  1.1021,  2.3211],
        [-1.5365,  1.3074,  2.4265, -1.3851],
        [-0.3820,  2.0571,  0.5493, -0.5722],
        [ 1.8188,  0.0133, -0.3563, -0.2530],
        [ 1.5999,  0.5760,  0.3622, -1.4171],
        [ 1.0916, -1.0916, -0.5442, -0.4348],
        [ 1.9268, -0.1019, -0.1726, -0.9178],
        [ 0.2730,  0.1792, -0.5092, -0.1255],
        [-1.0621,  0.8124, -1.2130, -1.7982],
        [ 0.5199,  0.4318, -0.6762, -0.2020],
        [-1.1380, -0.7472,  0.6408,  0.0838],
        [-1.03

In [4]:
# Look up the learned vector for token 'w' through its vocabulary id.
w_embedding = learned_embeddings[vocabs['w']]
print(f"Embedding for 'w': {w_embedding}")

Embedding for 'w': tensor([-1.0888, -0.2824,  0.4090,  0.6687])


In [5]:
# Look up the learned vector for the merged token 'the' through its vocabulary id.
the_embedding = learned_embeddings[vocabs['the']]
print(f"Embedding for 'the': {the_embedding}")

Embedding for 'the': tensor([ 0.7144, -2.4413,  0.4394,  1.0270])
