In [None]:
import json
import os
import torch
from torch.utils.data import DataLoader, Dataset, Subset
from tqdm import tqdm
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset

In [None]:
class CustomDataset(Dataset):
    """Prompt/completion pairs read from a JSON file and served as token-id tensors.

    The file is parsed with ``json.load`` and must yield an indexable collection
    of records, each holding a "prompt" and a "completion" entry.

    Args:
        file_path: Path of the JSON file to read.
        tokenizer: Object exposing ``encode(text, max_length=..., truncation=...)``.
            Its ``sep_token`` (or, when that is unset, its ``eos_token``) is
            placed between prompt and completion.
        max_seq_len: Maximum number of token ids kept per example.
    """

    def __init__(self, file_path, tokenizer, max_seq_len=768):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        # Honour the caller's path instead of a hard-coded file name.
        self.data = self.load_data(file_path)

    def load_data(self, file_path):
        """Return the parsed JSON content of ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def __len__(self):
        """Number of records loaded from the file."""
        return len(self.data)

    def _separator(self):
        """Pick the token text used between prompt and completion, or None."""
        separator = getattr(self.tokenizer, "sep_token", None)
        if separator is None:
            # Tokenizers without a dedicated separator would otherwise inject
            # the literal text "None" into every example.
            separator = getattr(self.tokenizer, "eos_token", None)
        return separator

    def __getitem__(self, idx):
        """Return the token ids of record ``idx`` as a 1-D ``torch.long`` tensor."""
        entry = self.data[idx]
        prompt = entry["prompt"]
        completion = entry["completion"]

        # Combine prompt and completion into a single input sequence
        separator = self._separator()
        if separator is None:
            input_text = f"{prompt} {completion}"
        else:
            input_text = f"{prompt} {separator} {completion}"

        # Tokenize and truncate input sequence
        input_ids = self.tokenizer.encode(input_text, max_length=self.max_seq_len, truncation=True)

        # Explicit dtype keeps an empty encoding from becoming a float tensor.
        return torch.tensor(input_ids, dtype=torch.long)

In [None]:
def prepare_dataset(dataset_name, file_path, tokenizer):
    """Materialise a dataset as a JSON cache file and split it 80/10/10.

    When ``file_path`` does not exist, the dataset is fetched with
    ``load_dataset(dataset_name)``, flattened into one list of row dicts
    (all splits concatenated in iteration order) and written to ``file_path``.
    An existing file is reused untouched. The file is then wrapped in a
    ``CustomDataset``, which reads the "prompt" and "completion" entries of
    every record.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        file_path: Location of the JSON cache file.
        tokenizer: Tokenizer forwarded to ``CustomDataset``.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)`` as ``Subset`` views over
        contiguous index ranges covering the first 80%, the next 10% and the
        remaining records.
    """
    if not os.path.exists(file_path):
        # Load the dataset from Hugging Face datasets library
        dataset = load_dataset(dataset_name)

        # The loaded object itself is not JSON-serialisable; dump plain row
        # dicts instead. A dict result maps split names to splits.
        splits = dataset.values() if isinstance(dataset, dict) else [dataset]
        records = [dict(row) for split in splits for row in split]

        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Write to a sibling temp file and rename, so a failed dump never
        # leaves a truncated cache that later runs would try to reuse.
        tmp_path = f"{file_path}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(records, file)
        os.replace(tmp_path, file_path)

    # Create a CustomDataset from the cached file (it does its own loading)
    custom_dataset = CustomDataset(file_path, tokenizer)

    # Split the dataset into training, validation, and test sets
    num_samples = len(custom_dataset)
    train_end = int(0.8 * num_samples)
    val_end = train_end + int(0.1 * num_samples)

    train_dataset = Subset(custom_dataset, range(train_end))
    val_dataset = Subset(custom_dataset, range(train_end, val_end))
    test_dataset = Subset(custom_dataset, range(val_end, num_samples))

    return train_dataset, val_dataset, test_dataset

In [None]:
def train(
    train_dataset, val_dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``train_dataset`` and report validation loss per epoch.

    Examples are fed one sequence at a time; gradients are accumulated over
    ``batch_size`` sequences before each optimizer/scheduler step, and any
    gradients still pending at the end of an epoch are applied as well.

    Args:
        train_dataset: Dataset whose items are 1-D token-id tensors.
        val_dataset: Dataset of the same kind, used for the per-epoch validation pass.
        model: Module called as ``model(input_ids, labels=input_ids)`` whose
            first output is the loss.
        tokenizer: Unused; kept for interface compatibility.
        batch_size: Number of sequences accumulated per optimizer step.
        epochs: Number of passes over ``train_dataset``.
        lr: Learning rate handed to ``AdamW``.
        max_seq_len: Training sequences are truncated to this many tokens.
        warmup_steps: Warm-up steps of the linear schedule.
        gpt2_type: Unused; kept for interface compatibility.
        output_dir: Directory receiving the checkpoint.
        output_prefix: File-name prefix of the checkpoint.
        save_model_on_epoch: When true, the state dict is saved as
            ``{output_prefix}-best.pt`` each time validation loss improves.

    Returns:
        The trained model (left in eval mode by the final validation pass).
    """
    # Fall back to CPU so the function also runs on hosts without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    # The schedule needs the real number of optimizer steps: a non-positive
    # total makes the linear decay clamp the learning rate to 0 after warm-up.
    accumulation_steps = max(1, batch_size)
    steps_per_epoch = (len(train_dataloader) + accumulation_steps - 1) // accumulation_steps
    total_steps = max(1, steps_per_epoch * epochs)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    best_val_loss = float('inf')

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")

        # Validation switches to eval mode; switch back for every epoch.
        model.train()
        optimizer.zero_grad()
        pending = 0  # backward passes accumulated since the last optimizer step

        # Training loop
        for entry in tqdm(train_dataloader):
            input_tensor = entry[:, :max_seq_len].to(device)

            # A causal-LM loss needs at least one (input, next-token) pair.
            if input_tensor.size(1) < 2:
                continue

            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            # Gradients are summed (not averaged) across the accumulated sequences.
            loss.backward()
            pending += 1

            if pending == accumulation_steps:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                pending = 0

        # Apply what is left of a partially filled accumulation window.
        if pending:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Validation loop
        val_loss = 0.0
        model.eval()
        with torch.no_grad():
            for val_entry in tqdm(val_dataloader):
                val_input_tensor = val_entry.to(device)
                val_outputs = model(val_input_tensor, labels=val_input_tensor)
                val_loss += val_outputs[0].item()

        num_val_batches = len(val_dataloader)
        # An empty validation set yields NaN instead of a ZeroDivisionError.
        val_loss = val_loss / num_val_batches if num_val_batches else float('nan')

        print(f"Validation Loss: {val_loss}")

        # Save the model if it's the best so far
        if save_model_on_epoch and val_loss < best_val_loss:
            best_val_loss = val_loss
            os.makedirs(output_dir, exist_ok=True)
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-best.pt"),
            )

    return model

In [None]:
def evaluate(model, dataloader, device):
    """Compute the mean loss of ``model`` over every batch of ``dataloader``.

    The model is put in eval mode and run without gradient tracking. Each
    batch is moved to ``device`` and passed as both input and labels; the
    first element of the model output is taken as the batch loss.

    Args:
        model: Callable as ``model(input_ids, labels=input_ids)``.
        dataloader: Iterable yielding token-id tensors.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The average batch loss as a float, or NaN when ``dataloader`` yields
        no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0  # counted while iterating, so ``len(dataloader)`` is not required

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_tensor = batch.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            total_loss += outputs[0].item()
            num_batches += 1

    if num_batches == 0:
        # Nothing to average; avoid the ZeroDivisionError of a bare division.
        return float('nan')

    return total_loss / num_batches

In [None]:
# Define your GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Specify the dataset name and file path
# NOTE(review): CustomDataset reads "prompt" and "completion" from every
# record — confirm the dataset selected here provides those fields.
dataset_name = "imdb"
file_path = 'path/to/imdb_dataset.json'

# Prepare the dataset
train_dataset, val_dataset, test_dataset = prepare_dataset(dataset_name, file_path, tokenizer)

# Train the model
trained_model = train(train_dataset, val_dataset, model, tokenizer)

# Evaluate on the device the trained model actually lives on, rather than
# assuming CUDA is present.
device = next(trained_model.parameters()).device

# Evaluate on the validation set
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)
val_loss = evaluate(trained_model, val_dataloader, device=device)
print(f"Validation Loss: {val_loss}")

# Evaluate on the test set
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
test_loss = evaluate(trained_model, test_dataloader, device=device)
print(f"Test Loss: {test_loss}")