In [13]:
import torch
import torchsummary

from language_models import TransformerLM, configure_optimizers
import time

In [14]:
# Performance toggles read by later cells: TF32-style matmul precision,
# torch.compile, and the fused AdamW kernel. All start disabled.
set_matmul_precision, compile_model, fused_optim = False, False, False
# use_flash_attention = True

# Resolve the backend name once, then derive the torch.device from it so the
# string form and the device object can never disagree.
device_type = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_type)

In [15]:
import torch

# Report which CUDA devices (if any) this PyTorch build can see.
cuda_available = torch.cuda.is_available()

if not cuda_available:
    print("CUDA is not available")
else:
    # Device indices run from 0 to device_count() - 1; look up each name once.
    num_gpus = torch.cuda.device_count()
    gpu_types = list(map(torch.cuda.get_device_name, range(num_gpus)))

    print("CUDA is available")
    print(f"Number of available GPUs: {num_gpus}")
    print("GPU Types:")
    for gpu_index, gpu_name in enumerate(gpu_types):
        print(f"GPU {gpu_index}: {gpu_name}")

CUDA is available
Number of available GPUs: 1
GPU Types:
GPU 0: NVIDIA GeForce RTX 4070 Laptop GPU


In [16]:
import tiktoken
# Path to the training corpus. Despite the `_dir` suffix this is a single
# text file, not a directory; DataLoaderLite opens it when constructed.
data_dir = 'data/shakespeare.txt'


class DataLoaderLite:
    """Minimal sequential batch loader over one tokenized text file.

    The whole file is tokenized once with tiktoken's ``'gpt2'`` encoding and
    kept in memory as a 1-D tensor. ``next_batch`` walks that tensor in
    contiguous chunks of ``B * T`` tokens and wraps back to the start once a
    full chunk (plus one look-ahead token) no longer fits.
    """

    def __init__(self, B, T, data_path=None):
        """Read, tokenize and store the corpus.

        Args:
            B: number of sequences per batch (rows of the returned tensors).
            T: number of tokens per sequence (columns of the returned tensors).
            data_path: text file to load. When omitted, the notebook-level
                ``data_dir`` is used, so ``DataLoaderLite(B, T)`` behaves as
                before.

        Raises:
            ValueError: if ``B`` or ``T`` is not positive, or if the file
                yields fewer than ``B * T + 1`` tokens (not enough for even
                one batch of inputs plus shifted targets).
        """
        if B <= 0 or T <= 0:
            raise ValueError(f"B and T must be positive, got B={B}, T={T}")
        self.B = B
        self.T = T

        # at init load tokens from disk and store them in memory
        path = data_dir if data_path is None else data_path
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)

        # Fail here with a clear message instead of later inside next_batch,
        # where a short buffer surfaces as an opaque .view() shape error.
        tokens_needed = B * T + 1
        if len(tokens) < tokens_needed:
            raise ValueError(
                f"{path!r} has {len(tokens)} tokens but one batch needs "
                f"{tokens_needed} (B*T + 1)"
            )

        self.tokens = torch.tensor(tokens, dtype=torch.long)
        print(f"loaded {len(self.tokens)} tokens")
        print(f"1 epoch = {len(self.tokens) // (B * T)} batches")

        # state: index of the first token of the next batch
        self.current_position = 0

    def next_batch(self):
        """Return the next ``(x, y)`` pair and advance the read position.

        Returns:
            A tuple ``(x, y)`` of ``(B, T)`` tensors, where ``y`` holds the
            same token stream as ``x`` shifted one position to the right
            (i.e. the next-token targets).
        """
        B, T = self.B, self.T
        start = self.current_position
        # Take one token more than B*T so the targets can be the inputs
        # shifted by one position.
        buf = self.tokens[start : start + B * T + 1]
        x = buf[:-1].view(B, T)  # inputs
        y = buf[1:].view(B, T)   # targets
        # advance the position in the tensor
        self.current_position = start + B * T
        # if loading the next batch would be out of bounds, reset
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y

# Each batch is 16 sequences of 1024 tokens (16 * 1024 = 16384 tokens per
# step); T matches the max_block_size=1024 the model is built with.
train_loader = DataLoaderLite(B=16, T=1024)

loaded 338024 tokens
1 epoch = 20 batches


In [21]:
# Architecture hyper-parameters, gathered in one place so they can be
# inspected or logged. max_block_size equals the loader's sequence length
# (T=1024). NOTE(review): the sizes look like the GPT-2 small layout and
# vocab_size presumably matches tiktoken's 'gpt2' encoding — confirm.
model_config = dict(
    vocab_size=50257,
    d_model=768,
    n_layers=12,
    n_heads=12,
    dff=None,
    activation='relu',
    dropout_rate=0.,
    norm_first=True,
    max_block_size=1024,
    bias=False,
    pos_enc_type='pos_emb',
)

# Build the model and move its parameters to the selected device.
model = TransformerLM(**model_config).to(device)

In [None]:
# Print a summary of the model.
# The original call used `torchinfo` without importing it (the imports cell
# brings in `torchsummary`), so this cell raised NameError. Import it here,
# and fall back to a plain parameter count when the package is not installed.
try:
    import torchinfo
except ImportError:
    torchinfo = None

if torchinfo is not None:
    # verbose=0 stops summary() from printing on its own; the returned
    # statistics object is printed explicitly so the table shows up the same
    # way inside and outside a notebook.
    model_summary = torchinfo.summary(model, verbose=0)
else:
    n_params = sum(p.numel() for p in model.parameters())
    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    model_summary = (
        f"total parameters: {n_params:,}\n"
        f"trainable parameters: {n_trainable:,}"
    )
print(model_summary)

In [22]:
# Optional speed-ups; each one only takes effect when its config flag is set.
if set_matmul_precision:
    # Allow lower-precision float32 matmul kernels.
    torch.set_float32_matmul_precision('high')

# torch.compile returns a wrapped module, so rebind `model` to it when the
# flag is on and leave it unchanged otherwise.
model = torch.compile(model) if compile_model else model

In [23]:
# Stand-alone cross-entropy criterion.
# NOTE(review): the training loop uses the loss returned by model(x, y), so
# this object appears unused in the visible cells — confirm before removing.
loss_fn = torch.nn.CrossEntropyLoss()

# AdamW over all model parameters; `fused` follows the config flag.
# (The original line had an accidental doubled target: `optimizer = optimizer = ...`.)
# optimizer = configure_optimizers(model, weight_decay=0.1, learning_rate=1e-3)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, fused=fused_optim)

In [24]:
# Short timing/training run: n_steps optimizer updates, reporting loss,
# wall-clock time and token throughput for each step.
n_steps = 50
tokens_per_step = train_loader.B * train_loader.T  # constant across steps

# The sampling cell further down calls model.eval(); switch back to training
# mode so re-running this cell afterwards still trains in train mode.
model.train()

for i in range(n_steps):
    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted.
    t0 = time.perf_counter()
    x, y = train_loader.next_batch()
    x, y = x.to(device), y.to(device)
    optimizer.zero_grad()

    # with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
    logits, loss = model(x, y)

    loss.backward()
    optimizer.step()
    # CUDA kernels run asynchronously, so wait for the GPU before stopping
    # the clock. Skip this on CPU, where torch.cuda.synchronize() raises.
    if device.type == "cuda":
        torch.cuda.synchronize()
    t1 = time.perf_counter()
    dt = t1 - t0 # time difference in seconds
    tokens_processed = tokens_per_step
    tokens_per_sec = tokens_processed / dt
    print(f"step {i:4d} | loss: {loss.item():.6f} | dt: {dt*1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}")

In [None]:
# Sample `num_return_sequences` continuations of a fixed prompt with top-k
# sampling, growing every sequence to `max_length` tokens.

# prefix tokens
model.eval()
num_return_sequences = 5
max_length = 30
top_k = 50  # top-k sampling of 50 (huggingface pipeline default)

# The tokenizer created inside DataLoaderLite.__init__ is local to that
# method, so build one here; previously `enc` was undefined in this cell.
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype=torch.long) # (prompt_len,)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (num_return_sequences, prompt_len)
x = tokens.to(device)

# generate! x is (B, T) with B = num_return_sequences
# set the seed to 42
torch.manual_seed(42)
torch.cuda.manual_seed(42)
with torch.no_grad():
    while x.size(1) < max_length:
        # forward the model to get the logits
        # NOTE(review): the training loop unpacks `logits, loss = model(x, y)`,
        # so model(x) presumably returns a tuple as well; accept either a
        # tuple or a bare tensor. Confirm that the targets argument is
        # optional in TransformerLM.forward.
        output = model(x)
        logits = output[0] if isinstance(output, (tuple, list)) else output
        # take the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)
        # get the probabilities (torch.softmax avoids the never-imported `F`)
        probs = torch.softmax(logits, dim=-1)
        # topk_probs and topk_indices are both (B, top_k)
        topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1)
        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1) # (B, 1)
        # map the sampled top-k slot back to a vocabulary index
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # append to the sequence
        x = torch.cat((x, xcol), dim=1)

# print the generated text
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(">", decoded)