In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torchvision.transforms import v2
from datasets import load_dataset
import tiktoken

In [2]:
torch.manual_seed(42)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [3]:
dataset = load_dataset("roneneldan/TinyStories")
train = dataset["train"]["text"]

In [4]:
encoder = tiktoken.get_encoding("cl100k_base")

In [5]:
with open("train_data.bin", "wb") as file:
    for i, text in enumerate(train):
        encoding = encoder.encode(text)
        np.array(encoding, dtype=np.int32).tofile(file)
        
        print(f"\rSaving data {(i/len(train))*100:.2f}% complete.", end="")



Saving data 100.00% complete.

In [13]:
train_tokens = torch.tensor(np.fromfile("train_data.bin", dtype=np.int32), dtype=torch.int32).to(device)

In [14]:
print(train_tokens.shape)
print(torch.unique(train_tokens).shape)

torch.Size([451148699])
torch.Size([32339])


In [15]:
d_model  = 128
n_heads  = 4
n_vocab  = encoder.n_vocab
n_layers = 4

In [16]:
print(n_vocab)

100277


In [17]:
# taken from https://pytorch-tutorials-preview.netlify.app/beginner/transformer_tutorial.html
import math

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [36]:
class Transformer(nn.Module):
    def __init__(self, n_vocab: int, d_model: int = 128, n_heads: int = 8, n_layers: int = 4):
        super().__init__()

        self.embedding = nn.Embedding(n_vocab, d_model)
        self.positional_encoding = PositionalEncoding(d_model)

    def forward(self, x):
        x = self.embedding(x)
        x = self.positional_encoding(x)
        return x

In [42]:
model = Transformer(n_vocab, d_model, n_heads, n_layers).to(device)
output = model(train_tokens[:1000])
print(output.shape)
print(output)

torch.Size([1000, 1000, 128])
tensor([[[-1.8989,  0.7867,  0.9190,  ..., -0.7403,  1.4137, -0.4363],
         [ 1.2443,  0.4770, -1.3367,  ...,  0.0000, -2.0316,  3.1582],
         [-0.3809,  1.9074,  0.5709,  ...,  0.0331,  0.5109,  1.1419],
         ...,
         [-1.0466,  0.3943,  0.4718,  ...,  1.2156,  0.4435,  0.7726],
         [-0.0000,  1.9956, -0.5812,  ...,  0.0000,  1.4642,  0.4254],
         [-0.1508, -0.2153,  0.6706,  ...,  1.2269, -0.9687,  2.2067]],

        [[-0.9640,  0.2759,  0.0000,  ..., -0.7403,  0.0000, -0.4363],
         [ 0.0000, -0.0338, -0.4903,  ...,  1.4444, -2.0315,  3.1582],
         [ 0.5540,  0.0000,  1.4172,  ...,  0.0331,  0.5110,  1.1419],
         ...,
         [-0.1116, -0.0000,  1.3181,  ...,  1.2156,  0.4437,  0.7726],
         [-0.1924,  1.4848,  0.0000,  ...,  2.0610,  1.4643,  0.4254],
         [ 0.7841, -0.7261,  1.5170,  ...,  1.2269, -0.9685,  2.2067]],

        [[-0.8886, -0.7868,  2.0157,  ..., -0.7403,  1.4140, -0.4363],
         [ 2.25