In [1]:
import torch
from tqdm.auto import tqdm
from gptmodel import GPTModel
import numpy as np
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
# (An Apple "mps" branch existed here only as commented-out code and stays unused.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using {device} device.")

Using cpu device.


In [13]:
# Model hyperparameters, keyed by the names the rest of the notebook reads
# (e.g. GPT_CONFIG_124M["context_length"] when building the data loaders).
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # vocabulary size
    context_length=256,   # shortened context length (the original value is 1024)
    emb_dim=768,          # embedding dimension
    n_heads=12,           # number of attention heads
    n_layers=12,          # number of layers
    drop_rate=0.1,        # dropout rate
    qkv_bias=False,       # whether query/key/value projections use a bias
)

In [4]:
class GPTDataset(Dataset):
    """Fixed-length (input, target) windows cut from one token sequence.

    Windows start at offsets 0, stride, 2*stride, ... for every offset that is
    strictly smaller than ``len(data) - max_length``. Each input window holds
    ``max_length`` tokens, and its target is the same window shifted one
    position to the right. All windows are converted to tensors up front, in
    ``__init__``.

    Args:
        data: Indexable, sliceable sequence of token ids.
        max_length: Number of tokens in every window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, data, max_length, stride):
        # Start offsets of every window; the range is re-iterable, so it can
        # drive both list constructions below.
        window_starts = range(0, len(data) - max_length, stride)

        self.input_ids = [
            torch.tensor(data[start:start + max_length])
            for start in window_starts
        ]
        # Targets are the inputs shifted right by one token.
        self.target_ids = [
            torch.tensor(data[start + 1: start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [5]:
def create_dataloader(data, batch_size=4, max_length=256,
                      stride=128, shuffle=True,
                      drop_last=True, num_workers=0):
    """Wrap ``data`` in a ``GPTDataset`` and return a ``DataLoader`` over it.

    Args:
        data: Token sequence handed to ``GPTDataset``.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length handed to ``GPTDataset``.
        stride: Window step handed to ``GPTDataset``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        GPTDataset(data, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [6]:
# Fix the PyTorch RNG state before the loaders are created.
torch.manual_seed(123)

# Both token files are opened read-only as memory-mapped uint32 arrays.
train_data = np.memmap(
    '../preprocess_data/train.bin', dtype=np.uint32, mode='r'
)
val_data = np.memmap(
    '../preprocess_data/validation.bin', dtype=np.uint32, mode='r'
)

# Training loader: non-overlapping windows (stride == window length),
# shuffled, and the incomplete final batch is dropped.
train_loader = create_dataloader(
    train_data,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    batch_size=4,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

# Validation loader: same windowing, original order, final batch kept.
val_loader = create_dataloader(
    val_data,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    batch_size=4,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [7]:
# Model configuration used to build GPTModel below.
# It must stay identical to the configuration the data loaders above were
# created from: they cut windows of GPT_CONFIG_124M["context_length"] tokens,
# so redefining the context length to 1024 at this point would make a
# top-to-bottom run pair a 1024-position model with 256-token batches.
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 256,  # Shortened context length (orig: 1024)
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [8]:
# Instantiate the GPT model from the configuration dictionary.
# NOTE(review): the training cell further down re-seeds PyTorch and assigns a
# fresh GPTModel to `model`, so this instance is replaced before any training.
model = GPTModel(GPT_CONFIG_124M)

In [14]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Compute the cross-entropy loss of ``model`` on a single batch.

    Both tensors are moved to ``device`` and cast to ``torch.long`` before use.
    The first two dimensions of the logits are merged and the targets are
    flattened so that ``cross_entropy`` scores every position.

    Args:
        input_batch: Tensor of token ids fed to the model.
        target_batch: Tensor of token ids the model should predict.
        model: Callable mapping the input ids to logits.
        device: Device the tensors are moved to.

    Returns:
        The loss tensor returned by ``torch.nn.functional.cross_entropy``.
    """
    inputs = input_batch.to(device).long()
    targets = target_batch.to(device).long()

    logits = model(inputs)
    return torch.nn.functional.cross_entropy(
        logits.flatten(0, 1),
        targets.flatten(),
    )


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` means all
            of them; values larger than ``len(data_loader)`` are clamped.

    Returns:
        The mean per-batch loss as a float, or ``float("nan")`` when there is
        nothing to average (empty loader, or ``num_batches`` <= 0).
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    if num_batches is None:
        num_batches = available
    else:
        # Never ask for more batches than the loader can provide.
        num_batches = min(num_batches, available)

    # A non-positive batch budget leaves nothing to average; report NaN like
    # the empty-loader case instead of dividing by zero.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.
    # zip() consults range() first, so iteration stops after exactly
    # num_batches batches without fetching an extra one from the loader.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

In [15]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Each batch performs one optimizer update. Whenever the zero-based step
    counter is a multiple of ``eval_freq`` (including the very first step),
    ``evaluate_model`` is called with ``eval_iter`` and the resulting losses
    are recorded and printed.

    Args:
        model: Model to optimise.
        train_loader: Iterable of ``(input_batch, target_batch)`` training pairs.
        val_loader: Loader passed to ``evaluate_model`` for validation loss.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        device: Device forwarded to the loss helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate every ``eval_freq`` optimisation steps.
        eval_iter: Number of batches forwarded to ``evaluate_model``.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with one
        entry per evaluation; the last holds the running count of input
        elements processed at that point.
    """
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = -1  # becomes 0 after the first update, so step 0 is evaluated

    for epoch in range(num_epochs):
        model.train()  # make sure training mode is active for this epoch

        for input_batch, target_batch in train_loader:
            # One optimisation step: clear old gradients, backpropagate, update.
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            global_step += 1
            tokens_seen += input_batch.numel()

            # Skip the periodic evaluation on steps that are not due.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate training and validation loss with gradients disabled.

    The model is put in eval mode for the duration of the measurement and is
    always switched back to train mode afterwards, even when the evaluation is
    interrupted or raises.

    Args:
        model: Model to evaluate.
        train_loader: Loader used for the training-loss estimate.
        val_loader: Loader used for the validation-loss estimate.
        device: Device forwarded to ``calc_loss_loader``.
        eval_iter: Number of batches per loader (``num_batches`` of
            ``calc_loss_loader``).

    Returns:
        ``(train_loss, val_loss)`` as returned by ``calc_loss_loader``.
    """
    model.eval()
    try:
        with torch.no_grad():
            train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
            val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    finally:
        # Restore training mode on every exit path so an interrupted
        # evaluation cannot leave the model stuck in eval mode.
        model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate 50 new tokens from ``start_context`` and print them on one line.

    NOTE(review): this function calls ``text_to_token_ids``,
    ``generate_text_simple`` and ``token_ids_to_text``; none of them is defined
    or imported in the cells visible here, so they must be brought into scope
    before this function is called.

    The model is put in eval mode while generating and is always switched back
    to train mode afterwards, even if one of the helpers raises.

    Args:
        model: Model exposing ``pos_emb.weight``; the size of its first
            dimension is passed on as ``context_size``.
        tokenizer: Tokenizer handed to the text/token-id helpers.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text to continue.
    """
    model.eval()
    try:
        context_size = model.pos_emb.weight.shape[0]
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():
            token_ids = generate_text_simple(
                model=model, idx=encoded,
                max_new_tokens=50, context_size=context_size
            )
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))  # Compact print format
    finally:
        # Restore training mode on every exit path, including failures above.
        model.train()

In [16]:
# Optional timing: uncomment the two lines below to record the start time
# of the training run.
# import time
# start_time = time.time()

# Start from a fixed RNG state, then build the model and move it to `device`.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    weight_decay=0.1,
    lr=0.0004,
)

# Train for one epoch, evaluating on 5 batches every 5 optimisation steps.
num_epochs = 1
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    eval_iter=5, eval_freq=5, num_epochs=num_epochs,
)

Ep 1 (Step 000000): Train loss 5.730, Val loss 5.757
Ep 1 (Step 000005): Train loss 4.238, Val loss 4.197
Ep 1 (Step 000010): Train loss 3.572, Val loss 3.528
Ep 1 (Step 000015): Train loss 3.244, Val loss 3.209
Ep 1 (Step 000020): Train loss 3.268, Val loss 3.150
Ep 1 (Step 000025): Train loss 3.240, Val loss 3.133
Ep 1 (Step 000030): Train loss 3.221, Val loss 3.103
Ep 1 (Step 000035): Train loss 3.193, Val loss 3.089
Ep 1 (Step 000040): Train loss 3.148, Val loss 3.079
Ep 1 (Step 000045): Train loss 3.204, Val loss 3.083
Ep 1 (Step 000050): Train loss 3.122, Val loss 3.060
Ep 1 (Step 000055): Train loss 3.194, Val loss 3.049
Ep 1 (Step 000060): Train loss 3.130, Val loss 3.037
Ep 1 (Step 000065): Train loss 3.182, Val loss 3.036
Ep 1 (Step 000070): Train loss 3.093, Val loss 3.014
Ep 1 (Step 000075): Train loss 3.122, Val loss 2.995
Ep 1 (Step 000080): Train loss 3.107, Val loss 2.999
Ep 1 (Step 000085): Train loss 3.089, Val loss 3.006
Ep 1 (Step 000090): Train loss 3.080, Val loss

KeyboardInterrupt: 