In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torchvision.transforms import v2
from datasets import load_dataset
import tiktoken

In [2]:
# Seed PyTorch's global RNG, then pick the GPU when one is available and
# fall back to the CPU otherwise.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
# Load the `roneneldan/TinyStories` dataset and keep the raw text column of
# its "train" split; `dataset` itself stays available for later cells.
dataset = load_dataset(path="roneneldan/TinyStories")
train_split = dataset["train"]
train = train_split["text"]

In [4]:
# Tokenizer used to turn story text into integer token ids (and back);
# its `n_vocab` also fixes the size of the model's vocabulary below.
encoder = tiktoken.get_encoding("cl100k_base")

In [5]:
# Tokenize every training story and append its ids, as raw int32 values, to
# one flat binary file. Stories are written back-to-back: nothing in this
# loop writes a separator between consecutive stories.
n_stories = len(train)
with open("train_data.bin", "wb") as file:
    for i, text in enumerate(train):
        token_ids = np.array(encoder.encode(text), dtype=np.int32)
        token_ids.tofile(file)

        # "\r" rewinds to the start of the line so the progress text is
        # overwritten in place rather than printed once per story.
        print(f"\rSaving data {(i/n_stories)*100:.2f}% complete.", end="")



Saving data 100.00% complete.

In [5]:
# Read the flat int32 token file back into memory, move it to the selected
# device, and add a leading batch axis so the result has shape (1, n_tokens).
token_array = np.fromfile("train_data.bin", dtype=np.int32)
train_tokens = torch.tensor(token_array, dtype=torch.int32).to(device)
train_tokens = train_tokens[None, :]

In [6]:
# Model hyperparameters; these are passed to `Transformer(...)` further down.
d_model  = 128              # embedding width (last dimension of every hidden state)
d_query  = 64               # output width of the query and key projections
n_heads  = 4                # forwarded to SelfAttention, whose forward pass does not read it
n_vocab  = encoder.n_vocab  # vocabulary size reported by the tokenizer
n_layers = 4                # requested number of layers

In [7]:
# Show the vocabulary size; it sets the row count of the embedding table and
# the output width of the unembedding layer.
print(n_vocab)

100277


In [19]:
# taken from https://pytorch-tutorials-preview.netlify.app/beginner/transformer_tutorial.html
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``[1, max_len, d_model]`` is precomputed once and stored
    as the buffer ``pe``: even feature columns hold ``sin`` terms and odd
    feature columns hold ``cos`` terms of the same angles. ``forward`` adds
    the first ``seq_len`` rows to its input and then applies dropout.

    Arguments:
        d_model: size of the embedding (last) dimension; may be odd.
        dropout: dropout probability applied after the addition.
        max_len: longest sequence length the table supports.
    """

    def __init__(self, 
                 d_model: int, 
                 dropout: float = 0.1, 
                 max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even column,
        # so drop the surplus angle instead of failing on a shape mismatch.
        pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` the table was
                built with.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds positional-encoding max_len {max_len}"
            )
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)

In [20]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries and keys are projected to ``d_query`` features and values to
    ``d_model`` features, so the output has the same shape as the input.
    Attention scores are multiplied by ``1 / sqrt(d_query)`` before the
    softmax. With ``causal=True`` (the default) every position may attend
    only to itself and to earlier positions, which is what next-token
    prediction requires.

    Arguments:
        d_model: size of the last dimension of the input and of the output.
        d_query: size of the query/key projections.
        n_heads: stored for reference only; the layer computes one head.
        causal: when True, mask out attention to later positions.
    """

    def __init__(self, 
                 d_model: int, 
                 d_query: int = 128, 
                 n_heads: int = 8,
                 causal: bool = True):
        super().__init__()

        self.n_heads = n_heads
        self.causal = causal
        # Without this factor the dot products grow with d_query and push
        # the softmax towards a one-hot distribution.
        self.scale = d_query ** -0.5

        self.W_q = nn.Linear(d_model, d_query)
        self.W_k = nn.Linear(d_model, d_query)
        self.W_v = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, d_model]``

        Returns:
            Tensor, shape ``[batch_size, seq_len, d_model]``
        """
        q = self.W_q(x)
        k = self.W_k(x)
        v = self.W_v(x)

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale

        if self.causal:
            # future[i, j] is True when key position j lies after query
            # position i; those scores become -inf so softmax gives them 0.
            positions = torch.arange(x.size(-2), device=x.device)
            future = positions.unsqueeze(0) > positions.unsqueeze(1)
            scores = scores.masked_fill(future, float("-inf"))

        attention_pattern = self.softmax(scores)

        output = torch.matmul(attention_pattern, v)
        
        return output



In [21]:
class MultilayerPerceptron(nn.Module):
    """Two-layer feed-forward block with a built-in residual connection.

    The input is projected from ``d_model`` up to ``d_up`` features, passed
    through a ReLU, projected back down to ``d_model``, and finally added to
    the original input.

    Arguments:
        d_model: size of the last dimension of the input and of the output.
        d_up: size of the hidden (expanded) layer.
    """

    def __init__(self, d_model: int, d_up: int = 256):
        super().__init__()

        self.up = nn.Linear(d_model, d_up)
        self.relu = nn.ReLU()
        self.down = nn.Linear(d_up, d_model)

    def forward(self, x):
        """Return ``x + down(relu(up(x)))``; the shape of ``x`` is preserved."""
        hidden = self.relu(self.up(x))
        return x + self.down(hidden)

In [22]:
class Transformer(nn.Module):
    """Token-level language model: embedding, ``n_layers`` blocks, unembedding.

    Each block applies self-attention with a residual connection and then
    the MLP, which adds its own residual internally. The hidden state is
    finally projected to one score per vocabulary entry.

    Arguments:
        n_vocab: vocabulary size (rows of the embedding table and width of
            the output).
        d_model: embedding / hidden width.
        d_query: query/key projection width handed to each attention layer.
        n_heads: handed to each attention layer.
        n_layers: number of (attention, MLP) blocks to stack.
        d_up: hidden width of each MLP.
    """

    def __init__(self, 
                 n_vocab: int, 
                 d_model: int = 128, 
                 d_query: int = 128, 
                 n_heads: int = 8, 
                 n_layers: int = 4, 
                 d_up: int = 256):
        super().__init__()

        self.embedding = nn.Embedding(n_vocab, d_model)
        self.pe = PositionalEncoding(d_model, max_len=50000)

        # nn.ModuleList (not a plain list) so the layers' parameters are
        # registered and follow the model through .to(device), optimizers
        # and state_dict().
        self.attention_layers = nn.ModuleList(
            [SelfAttention(d_model, d_query, n_heads) for _ in range(n_layers)]
        )
        self.mlp_layers = nn.ModuleList(
            [MultilayerPerceptron(d_model, d_up) for _ in range(n_layers)]
        )

        self.unembedding = nn.Linear(d_model, n_vocab)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """
        Arguments:
            x: integer token ids, shape ``[batch_size, seq_len]``

        Returns:
            A ``(logits, probabilities)`` pair, each of shape
            ``[batch_size, seq_len, n_vocab]``; ``probabilities`` is the
            softmax of ``logits`` over the vocabulary dimension.
        """
        x = self.embedding(x)
        x = self.pe(x)

        for attention, mlp in zip(self.attention_layers, self.mlp_layers):
            # Residual around attention keeps each token's own
            # representation alongside the attended context.
            x = x + attention(x)
            # The MLP adds its residual internally.
            x = mlp(x)

        x = self.unembedding(x)

        output = self.softmax(x)
        
        return x, output

In [23]:
# Smoke test: build the model on the selected device and push the first
# 2048 training tokens through it to check the output shapes.
model = Transformer(
    n_vocab=n_vocab,
    d_model=d_model,
    d_query=d_query,
    n_heads=n_heads,
    n_layers=n_layers,
).to(device)
sample_tokens = train_tokens[:, :2048]
x, output = model(sample_tokens)
print(f"x: {x.shape}")
print(f"output: {output.shape}")

x: torch.Size([1, 2048, 100277])
output: torch.Size([1, 2048, 100277])


In [28]:
# Take the distribution at the last sequence position, pick the most likely
# token id, and decode it back to text.
last_position_probs = output[:, -1, :]
predicted_id = int(last_position_probs.argmax())
predicted_word = encoder.decode([predicted_id])
print(predicted_word)

 Markets
