In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

In [15]:
class CustomEmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(CustomEmbeddingLayer, self).__init__()
        self.embedding = nn.Parameter(torch.randn(vocab_size, embed_dim))

    def forward(self, x):
        return torch.matmul(x, self.embedding)

# samlpe simple linear layer
class SimpleModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(SimpleModel, self).__init__()
        self.embedding_layer = CustomEmbeddingLayer(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        embeds = self.embedding_layer(x)
        out = self.linear(embeds)
        return out

# define parameters
## vocabs taken from BPE we coded in /tokenizer/BPE.ipynb
vocabs = {'w': 0, 'f': 1, 's</w>': 2, 'd': 3, 'h': 4, 'the': 5, 'c': 6, 'a': 7, 'r': 8, 'o': 9, 'k': 10, 'in': 11, 'u': 12, 'b': 13, '.</w>': 14, 'n': 15, 'l': 16, 'e</w>': 17, 'e': 18, 'g': 19, 'x': 20, '-': 21, 'an': 22, 'he': 23, 'm': 24, '</w>': 25, ',</w>': 26, 'p': 27, 'v': 28, 'i': 29, 'd</w>': 30, 'y': 31, 's': 32, 't': 33, 'er': 34}
vocab_size = len(vocabs)
embed_dim = 3

# iiitiate model
model = SimpleModel(vocab_size, embed_dim)

# loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# sample training data: batch of 2 sequences, each with 4 token indices
# for simplicity, using one-hot encoded inputs
sample_input = torch.eye(vocab_size)[[1, 2, 3, 4, 4, 3, 2, 1]].reshape(2, 4, vocab_size)
sample_target = torch.tensor([[2, 3, 4, 5], [5, 4, 3, 2]], dtype=torch.long)

# training loop
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # forward pass
    output = model(sample_input)
    # compute loss
    loss = criterion(output.view(-1, vocab_size), sample_target.view(-1))
    # backward and optimizer
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 3.7846
Epoch [11/100], Loss: 3.6986
Epoch [21/100], Loss: 3.6135
Epoch [31/100], Loss: 3.5292
Epoch [41/100], Loss: 3.4458
Epoch [51/100], Loss: 3.3631
Epoch [61/100], Loss: 3.2810
Epoch [71/100], Loss: 3.1996
Epoch [81/100], Loss: 3.1187
Epoch [91/100], Loss: 3.0385


In [19]:
learned_embeddings = model.embedding_layer.embedding.data
print("Learned Embeddings:", learned_embeddings)
print(f"Learned Embedding size: {learned_embeddings.shape}")

Learned Embeddings: tensor([[ 1.1371,  0.9943,  0.0375],
        [-1.6018, -0.3988, -0.9190],
        [ 0.5137,  1.5477,  0.5030],
        [-0.4879, -0.9742, -0.0280],
        [ 0.2598,  1.1345,  0.4362],
        [ 1.4705, -0.6515,  0.0917],
        [-0.0590, -1.2082,  0.1376],
        [ 0.1255,  0.7336, -1.5637],
        [-0.2186,  0.9550,  1.5034],
        [ 1.6238, -1.2249, -1.6470],
        [ 0.4653,  0.1373, -2.0911],
        [-2.1927,  0.7636,  0.2861],
        [ 0.6090, -0.2787, -0.2165],
        [ 0.2153, -0.9786, -0.9581],
        [-0.3860, -0.5954, -0.8105],
        [-0.8320, -1.4884,  0.5985],
        [ 0.7002,  0.6555,  0.8853],
        [ 0.0209, -0.3941,  0.8519],
        [ 0.7330,  1.8007,  0.9618],
        [ 0.1652, -0.8729, -0.4493],
        [ 2.4194,  0.1464, -0.9133],
        [ 0.4371, -0.5891, -0.4020],
        [ 1.4615,  1.9300,  1.3794],
        [-1.3398,  1.9100,  0.1663],
        [ 0.4614,  0.6157, -0.3563],
        [ 1.4832,  0.1718, -1.1843],
        [ 1.0340, 

In [17]:
print(f"Embedding for 'w': {learned_embeddings[vocabs['w']]}")

Embedding for 'w': tensor([1.1371, 0.9943, 0.0375])


In [21]:
print(f"Embedding for 'the': {learned_embeddings[vocabs['the']]}")

Embedding for 'the': tensor([ 1.4705, -0.6515,  0.0917])
