In [1]:
import numpy as np

# Memory-map the token files read-only (mode='r') as flat uint16 arrays, so
# slices are read from disk on demand rather than loading the whole file.
# NOTE(review): presumably each element is a token id produced by a separate
# preprocessing step — confirm against whatever writes ../data/*.bin.
train_data = np.memmap('../data/train.bin', dtype=np.uint16, mode='r')
val_data = np.memmap('../data/val.bin', dtype=np.uint16, mode='r')

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [8]:
# Run on the GPU when PyTorch reports CUDA support, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [7]:
# block_size: number of consecutive tokens in each sampled window.
# batch_size: number of windows stacked into one batch.
block_size, batch_size = 32, 16

In [9]:
def get_batch(split):
    """Sample a random batch of input/target token windows.

    Reads the module-level ``train_data``/``val_data`` arrays together with
    ``block_size``, ``batch_size`` and ``device``.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of int64 tensors on ``device``, each of shape
        ``(batch_size, block_size)``. Row ``k`` of ``y`` is the same window as
        row ``k`` of ``x`` shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data

    # One random start offset per batch row. The exclusive upper bound leaves
    # room for the target window, which ends one element past the input window.
    ix = torch.randint(len(source) - block_size, (batch_size,))

    def window(start):
        # Copy a block_size-long slice out of the array as an int64 tensor.
        return torch.from_numpy(source[start:start + block_size].astype(np.int64))

    x = torch.stack([window(i) for i in ix])
    y = torch.stack([window(i + 1) for i in ix])

    if device != 'cuda':
        return x.to(device), y.to(device)

    # Pinned host memory lets the transfer be requested with non_blocking=True.
    x = x.pin_memory().to(device, non_blocking=True)
    y = y.pin_memory().to(device, non_blocking=True)
    return x, y

In [10]:
# Draw one batch from the training split as a quick sanity check.
x, y = get_batch('train')

In [11]:
# Display the input half of the sampled batch.
x

tensor([[51074, 14950,  3433, 44758, 34716, 61692, 10520,  4345, 61349, 41060,
            43,     0,   686, 15240, 47569, 61692, 57051, 21346,  8912, 46082,
            43,     0, 27979, 60398, 36841, 61692, 41204,  4495, 30576,    43,
             0,     0],
        [28992, 37126,  2807,    43,     0, 25578,  1461, 46574, 13906, 61692,
         51531,    42, 23150,  5469, 45151,    43,     0, 26688,   112, 32708,
          4800,    43,     0, 16136, 13023,    43,     0,     0, 37651, 64054,
         30871, 61692],
        [    0, 25916, 20450, 54116, 60904, 61692,  7053, 52273,  7365, 20039,
         28585,    43,     0,     0, 39440, 31169,  1778, 16719, 61692, 47588,
         64852,  3124, 63630,    43,     0, 10869, 45115, 21605, 16515, 61692,
         38578, 40827],
        [   43,     0,     0, 28988, 35002, 13872, 61692, 23291,    43,     0,
         34007, 55541, 45623, 61692, 55206, 17933, 50899,    43,     0, 24854,
         28388, 33048, 61692,  6332, 44038, 55785,    43,  

In [13]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with an optional bias.

    The scale (``weight``) is always learnable and starts at ones. The shift
    (``bias``) is learnable and starts at zeros only when requested;
    otherwise it is ``None`` and no shift is applied.
    """

    def __init__(self, n_dim, bias) -> None:
        """Create the parameters.

        Args:
            n_dim: Size of the trailing dimension that gets normalized.
            bias: If truthy, add a learnable bias of size ``n_dim``.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(n_dim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(n_dim))
        else:
            self.bias = None

    def forward(self, input):
        """Normalize ``input`` over its last dimension, then scale and shift."""
        # The weight's own shape, (n_dim,), doubles as the normalized shape.
        normalized_shape = self.weight.shape
        return F.layer_norm(
            input,
            normalized_shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )