In [1]:
import json
import os
import torch
from torch.utils.data import DataLoader, Dataset, Subset
from tqdm import tqdm
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

In [10]:
class CustomDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_seq_len=768):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.data = self.load_data(file_path)

    def load_data(self, file_path):
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        prompt = entry["prompt"]
        completion = entry["completion"]

        # Combine prompt and completion into a single input sequence
        input_text = f"{prompt} {self.tokenizer.sep_token} {completion}"

        # Tokenize and truncate input sequence
        input_ids = self.tokenizer.encode(input_text, max_length=self.max_seq_len, truncation=True)

        return torch.tensor(input_ids)

In [4]:
def prepare_dataset(dataset_name, file_path, tokenizer):
    # Check if the dataset file already exists
    if not os.path.exists(file_path):
        # Load the dataset from Hugging Face datasets library
        dataset = load_dataset(dataset_name)

        # Save the dataset to a file for future use
        with open(file_path, 'w') as file:
            json.dump(dataset, file)
    else:
        # Load the dataset from the file
        with open(file_path, 'r') as file:
            dataset = json.load(file)

    # Create a CustomDataset from the loaded dataset
    custom_dataset = CustomDataset(file_path, tokenizer)

    # Split the dataset into training, validation, and test sets
    num_samples = len(custom_dataset)
    train_split = int(0.8 * num_samples)
    val_split = int(0.1 * num_samples)
    
    train_dataset = Subset(custom_dataset, list(range(train_split)))
    val_dataset = Subset(custom_dataset, list(range(train_split, train_split + val_split)))
    test_dataset = Subset(custom_dataset, list(range(train_split + val_split, num_samples)))

    return train_dataset, val_dataset, test_dataset

In [15]:
#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [13]:
def train(
    train_dataset, val_dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    save_model_on_epoch=False,
):
    device = torch.device("cuda")
    model = model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")

        # Training loop
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, None, max_seq_len)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (idx + 1) % batch_size == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

        # Validation loop
        val_loss = 0
        model.eval()
        with torch.no_grad():
            for val_entry in tqdm(val_dataloader):
                val_input_tensor = val_entry.to(device)
                val_outputs = model(val_input_tensor, labels=val_input_tensor)
                val_loss += val_outputs[0].item()

        val_loss /= len(val_dataloader)

        print(f"Validation Loss: {val_loss}")

        # Save the model if it's the best so far
        if save_model_on_epoch and val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-best.pt"),
            )

    return model

In [6]:
def evaluate(model, dataloader, device, output_file, tokenizer):
    model.eval()
    total_loss = 0
    detokenized_outputs = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_tensor = batch.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            total_loss += outputs.loss.item()
            
            # Detach the logits before using them for decoding
            logits = outputs.logits.detach()
            detokenized_output = tokenizer.decode(logits[0], skip_special_tokens=True)
            detokenized_outputs.append(detokenized_output)
            output_file.write(detokenized_output + '\n')

    avg_loss = total_loss / len(dataloader)
    return avg_loss, detokenized_outputs

In [16]:
# Define your GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Specify the dataset name and file path
dataset_name = "propmts_and_completions"
file_path = 'prompts_v3.json'
output_file = 'detokenized_outputs.txt'

# Prepare the dataset
train_dataset, val_dataset, test_dataset = prepare_dataset(dataset_name, file_path, tokenizer)

# Train the model
trained_model = train(train_dataset, val_dataset, model, tokenizer)

# Evaluate on the validation set
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)
val_loss, detokenized_outputs = evaluate(trained_model, val_dataloader, device=torch.device("cuda"), output_file=output_file, tokenizer=tokenizer)
print(f"Validation Loss: {val_loss}")

# Evaluate on the test set
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
test_loss, detokenized_outputs_test = evaluate(trained_model, test_dataloader, device=torch.device("cuda"), output_file='detokenized_outputs_test.txt', tokenizer=tokenizer)
print(f"Test Loss: {test_loss}")

Training epoch 0


31it [00:04,  6.71it/s]
100%|██████████| 3/3 [00:00<00:00, 13.32it/s]


Validation Loss: 3.1705198287963867
Training epoch 1


31it [00:00, 476.49it/s]
100%|██████████| 3/3 [00:00<00:00, 68.12it/s]


Validation Loss: 3.1705198287963867
Training epoch 2


31it [00:00, 516.19it/s]
100%|██████████| 3/3 [00:00<00:00, 73.10it/s]


Validation Loss: 3.1705198287963867
Training epoch 3


31it [00:00, 499.55it/s]
100%|██████████| 3/3 [00:00<00:00, 68.12it/s]


Validation Loss: 3.1705198287963867
Training epoch 4


31it [00:00, 507.74it/s]
100%|██████████| 3/3 [00:00<00:00, 73.11it/s]


Validation Loss: 3.1705198287963867


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 68.12it/s]


Validation Loss: 3.1705198287963867


Evaluating: 100%|██████████| 5/5 [00:00<00:00, 67.51it/s]

Test Loss: 2.732644510269165



