In [1]:
import torch
from torch import nn
from models.recurrent_neuron_transformer import RecurrentNeuronTransformer
from tqdm import tqdm
DEVICE = "cuda"


In [2]:
def train_shakespeare_transformer(model, context_window, step_size, data_loader, optimizer, num_epochs, device='cuda', mask=False):
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        progress_bar = tqdm(data_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)

        for batch_idx, (input_chunk, target_chunk) in enumerate(progress_bar):
            # Initialize batch loss
            batch_loss = 0

            # Reset hidden layers at the start of each batch
            hidden_layers = dict()

            for i in range(0, input_chunk.size(1) - context_window, step_size):
                print(f"Chunk starting at position {i} in batch {batch_idx}")

                # Create input and target sequences
                input_seq = input_chunk[:, i:i+context_window].to(device)
                target_seq = target_chunk[:, i+1:i+context_window+1].to(device)

                # Forward pass
                optimizer.zero_grad()
                outputs, hidden_layers = model(inputs=input_seq, hidden_layers=hidden_layers)
                outputs = outputs.view(-1, outputs.size(-1))
                target_seq = target_seq.view(-1)

                # Calculate loss
                loss = nn.CrossEntropyLoss()(outputs, target_seq)
                loss.backward()  # Backpropagate on each loss
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

                batch_loss += loss.item()  # Accumulate the scalar loss

            # Update running loss for the epoch
            epoch_loss += batch_loss

            # Update progress bar
            progress_bar.set_postfix(loss=epoch_loss / (batch_idx + 1))

        print(f"Epoch {epoch+1}/{num_epochs} completed. Average batch loss: {epoch_loss / len(data_loader)}")


In [3]:
from transformers import GPT2Tokenizer
from torch.utils.data import TensorDataset, DataLoader
import os
from utils.datasets import TextDataLoader

# Define tokenizer used to convert text to tokens
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

file_path = os.path.join(os.getcwd(), 'data', 'shakespeare', 'tinyshakespeare.txt')
bpe_tokenizer = 'gpt2'
seq_length = 256
batch_size = 10
vocab_size = 50257
data_loader = TextDataLoader(file_path, seq_length, bpe_tokenizer, batch_size, vocab_size)
train_loader, test_loader = data_loader.create_loaders()

In [4]:
# Define the context window size k (defaulting to chunk_length / 2)
context_window = 128

# Define the model
transformer_model = RecurrentNeuronTransformer(input_size=tokenizer.vocab_size, output_size=tokenizer.vocab_size, device=DEVICE, max_length=context_window)
transformer_model.to(DEVICE)
# Define optimizer
transformer_optimizer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)

In [5]:
# Define the step size to use for the sliding window
step_size = 127

# Train the model
train_shakespeare_transformer(transformer_model, context_window, step_size, train_loader, 
                               optimizer=transformer_optimizer, num_epochs=2)

Epoch 1/2:   0%|          | 0/27016 [00:00<?, ?it/s]

Chunk starting at position 0 in batch 0
Chunk starting at position 127 in batch 0


                                                    

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.