In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

def tokenize(text, vocab):
  """Map whitespace-separated words in ``text`` to their vocabulary ids.

  Args:
    text: Input string; it is split with ``str.split()`` (any whitespace).
    vocab: Mapping from word to id. It must contain the key ``"<UNK>"``
      whenever ``text`` holds at least one word, because the fallback id is
      looked up once per word.

  Returns:
    A list with one id per word, in order. Words that are not keys of
    ``vocab`` are replaced by ``vocab["<UNK>"]``.

  Raises:
    KeyError: If ``text`` is non-empty and ``vocab`` has no ``"<UNK>"`` entry.
  """
  token_ids = []
  for word in text.split():
    # The "<UNK>" lookup stays inside the loop so an empty text never needs it.
    token_ids.append(vocab.get(word, vocab["<UNK>"]))
  return token_ids

In [2]:
class Embedding(nn.Module):
  """Token-embedding layer: a thin wrapper around ``nn.Embedding``."""

  def __init__(self, vocab_size, embedding_dim):
    """Create the lookup table.

    Args:
      vocab_size: Number of rows in the table (one per token id).
      embedding_dim: Length of each embedding vector.
    """
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

  def forward(self, x):
    """Return the embedding vector for every id in ``x``.

    Args:
      x: Integer tensor of token ids.

    Returns:
      Tensor shaped like ``x`` with a trailing ``embedding_dim`` dimension.
    """
    vectors = self.embedding(x)
    return vectors



In [3]:
class PositionalEncoding(nn.Module):
  """Fixed sinusoidal positional encoding added to a sequence-first tensor.

  The table is precomputed once and stored in the ``pe`` buffer with shape
  ``(max_seq_len, 1, embedding_dim)``; the singleton middle dimension lets it
  broadcast over dimension 1 of the input.
  """

  def __init__(self, embedding_dim, max_seq_len=5000):
    """Precompute the sine/cosine table.

    Args:
      embedding_dim: Size of the last dimension of the inputs. Both even and
        odd values are supported.
      max_seq_len: Number of positions to precompute.
    """
    super(PositionalEncoding, self).__init__()
    self.embedding_dim = embedding_dim
    pe = torch.zeros(max_seq_len, embedding_dim)
    position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
    angles = position * div_term
    # Even columns take the sine of every frequency.
    pe[:, 0::2] = torch.sin(angles)
    # Odd columns take the cosine. When embedding_dim is odd there is one
    # fewer odd column than even column, so drop the surplus frequency to
    # avoid a shape mismatch; for even embedding_dim the slice is a no-op.
    pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
    pe = pe.unsqueeze(0).transpose(0, 1)
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Add the positional encoding for the first ``x.size(0)`` positions.

    Args:
      x: Tensor whose dimension 0 indexes sequence position and whose last
        dimension has size ``embedding_dim``.

    Returns:
      ``x`` plus the matching slice of the ``pe`` buffer (same shape as ``x``).
    """
    return x + self.pe[:x.size(0), :]


In [4]:
from torch._C import Value
class SelfAttention(nn.Module):
  """Single-head scaled dot-product self-attention over a batch of sequences."""

  def __init__(self, embedding_dim):
    """Create the query, key and value projections.

    Args:
      embedding_dim: Size of the last dimension of the input; all three
        projections map ``embedding_dim -> embedding_dim``.
    """
    super(SelfAttention, self).__init__()
    self.query = nn.Linear(embedding_dim, embedding_dim)
    self.key = nn.Linear(embedding_dim, embedding_dim)
    self.value = nn.Linear(embedding_dim, embedding_dim)

  def forward(self, x):
    """Attend every position of each sequence to every position of the same sequence.

    Args:
      x: 3-D tensor ``(batch, seq_len, embedding_dim)`` (``torch.bmm``
        requires exactly three dimensions).

    Returns:
      Tensor of the same shape as ``x`` holding the attention-weighted values.
    """
    queries = self.query(x)
    keys = self.key(x)
    values = self.value(x)
    # Swap the sequence and feature dimensions of the keys so that
    # queries @ keys^T yields a (batch, seq_len, seq_len) score matrix,
    # then scale by sqrt(feature size) to keep the softmax well-conditioned.
    scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(x.size(-1))
    attention_weights = torch.softmax(scores, dim=-1)
    attention_values = torch.bmm(attention_weights, values)
    return attention_values

In [5]:
class TransformerBlock(nn.Module):
  """Self-attention followed by a two-layer feed-forward network.

  Each sub-layer is wrapped as ``LayerNorm(input + sublayer(input))``, i.e. the
  normalisation is applied after the residual addition.
  """

  def __init__(self, embedding_dim, hidden_dim):
    """Build the attention, feed-forward and normalisation sub-modules.

    Args:
      embedding_dim: Feature size of the block's input and output.
      hidden_dim: Width of the feed-forward network's inner layer.
    """
    super().__init__()
    self.attention = SelfAttention(embedding_dim)
    expand = nn.Linear(embedding_dim, hidden_dim)
    contract = nn.Linear(hidden_dim, embedding_dim)
    self.feed_forward = nn.Sequential(expand, nn.ReLU(), contract)
    self.norm1 = nn.LayerNorm(embedding_dim)
    self.norm2 = nn.LayerNorm(embedding_dim)

  def forward(self, x):
    """Apply the attention and feed-forward sub-layers to ``x``.

    Args:
      x: Input tensor accepted by ``SelfAttention``; its last dimension must
        be ``embedding_dim``.

    Returns:
      Tensor of the same shape as ``x``.
    """
    attended = self.norm1(x + self.attention(x))
    return self.norm2(attended + self.feed_forward(attended))


In [10]:
class SimpleLLM(nn.Module):
  """Small transformer language model: embed, add positions, run blocks, project to vocabulary."""

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
    """Assemble the model.

    Args:
      vocab_size: Number of token ids; also the size of the output logits.
      embedding_dim: Feature size used throughout the model.
      hidden_dim: Inner width of each block's feed-forward network.
      num_layers: Number of ``TransformerBlock`` instances stacked in sequence.
    """
    super().__init__()
    self.embedding = Embedding(vocab_size, embedding_dim)
    self.positional_encoding = PositionalEncoding(embedding_dim)
    blocks = []
    for _ in range(num_layers):
      blocks.append(TransformerBlock(embedding_dim, hidden_dim))
    self.transformer_blocks = nn.Sequential(*blocks)
    self.output = nn.Linear(embedding_dim, vocab_size)

  def forward(self, x):
    """Compute per-position vocabulary logits.

    Args:
      x: Integer tensor of token ids, ``(batch, seq_len)``.

    Returns:
      Tensor ``(batch, seq_len, vocab_size)`` of unnormalised logits.
    """
    embedded = self.embedding(x)
    # PositionalEncoding indexes positions along dimension 0, so switch to
    # sequence-first for that call and switch back immediately afterwards.
    positioned = self.positional_encoding(embedded.transpose(0, 1)).transpose(0, 1)
    hidden = self.transformer_blocks(positioned)
    return self.output(hidden)

In [15]:
# Toy vocabulary and model hyperparameters.
vocab = {"hello":0,"world":1,"how":2,"are":3,"you":4,"<UNK>":5}
vocab_size = len(vocab)
embedding_dim = 16
hidden_dim = 32
num_layer = 2

# Seed the RNG before the model is built so that weight initialisation, and
# therefore the whole loss curve, is reproducible on Restart & Run All.
torch.manual_seed(0)

model = SimpleLLM(vocab_size, embedding_dim, hidden_dim, num_layer)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
data = ["hello world how are you","how are you hello world"]
tokenized_data = [tokenize(sentence, vocab) for sentence in data]

# Next-token training: for every sentence, feed each proper prefix and ask the
# model to predict the token that follows it. One optimiser step per prefix.
for epoch in range(100):
    total_loss = 0

    for sentence in tokenized_data:
        for i in range(1, len(sentence)):
            # Batch of one: input is (1, i) token ids, target is (1,) id.
            input_seq = torch.tensor(sentence[:i]).unsqueeze(0)
            target = torch.tensor(sentence[i]).unsqueeze(0)

            optimizer.zero_grad()
            output = model(input_seq)

            # Only the logits at the last position predict the next token.
            loss = criterion(output[:, -1, :], target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

    # total_loss is the sum (not the mean) over all prefixes in the epoch.
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")


Epoch 0, Loss: 15.0987
Epoch 10, Loss: 10.7659
Epoch 20, Loss: 6.0740
Epoch 30, Loss: 3.0810
Epoch 40, Loss: 1.6915
Epoch 50, Loss: 0.7919
Epoch 60, Loss: 0.4311
Epoch 70, Loss: 0.2767
Epoch 80, Loss: 0.1944
Epoch 90, Loss: 0.1445


In [17]:
# Predict the word that follows a short prompt using the trained model.
input_text = "hello world how"
input_tokens = tokenize(input_text, vocab)
input_tensor = torch.tensor(input_tokens).unsqueeze(0)  # add the batch dimension

# Inference only: switch to eval mode and skip autograd bookkeeping so no
# computation graph is built for a forward pass that is never back-propagated.
model.eval()
with torch.no_grad():
    output = model(input_tensor)
# The logits at the last position score the next token.
predicted_token = torch.argmax(output[:, -1, :], dim=-1).item()

# Invert the vocabulary to turn the predicted id back into a word.
inv_vocab = {v: k for k, v in vocab.items()}
predicted_word = inv_vocab[predicted_token]

print(f"Input: {input_text}, Predicted: {predicted_word}")


Input: hello world how, Predicted: are
