In [None]:
"""
Fine-tuning GPT-2 on a custom dataset to create a virtual clone.
Author: Bohdan Chuprynka
Start Date: 2024-11-08
Finish Date: 2024-__-__
"""

import os
import logging
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    AdamW,
    get_linear_schedule_with_warmup,
)
from tqdm.auto import tqdm
import random
import numpy as np

In [None]:
# Logging configuration: INFO level and above, each record carrying a
# timestamp, the severity, the emitting logger's name and the message.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)

# Module-level logger used by the functions defined in the cells below.
logger = logging.getLogger(__name__)

# Set random seeds for reproducibility
def set_seed(seed: int):
    """Seed every random number generator this notebook relies on.

    The same value is applied to Python's ``random`` module, NumPy's global
    generator, PyTorch's default generator and all CUDA devices.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
logger.info(f"Using device: {device}")

# Seed all generators once at notebook start.
set_seed(42)

In [None]:
# Custom Dataset Class
class ConstructDataset(Dataset):
    """Turns context/question/answer rows into fixed-length LM training examples.

    Each row is rendered as a short dialogue (the context, then a ``User:``
    line with the question, then an ``Assistant:`` line with the answer),
    tokenized, and padded/truncated to ``max_length`` tokens.

    Args:
        dataframe: pandas DataFrame with 'context', 'question' and 'answer'
            columns; rows are accessed positionally.
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors with a leading batch dimension of 1.
        max_length: Number of tokens every example is padded/truncated to.
    """

    # Label value for positions that must not contribute to the loss; the
    # Hugging Face language-modeling loss skips targets equal to -100.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the training example for the row at position ``idx``.

        Returns:
            Dict with 1-D tensors 'input_ids', 'attention_mask' and 'labels',
            each of length ``max_length``. 'labels' equals 'input_ids' except
            at padding positions, which hold ``IGNORE_INDEX``.
        """
        row = self.data.iloc[idx]
        input_text = self.construct_input(row)

        # Close the example with an explicit end-of-sequence token so the model
        # has a real target that teaches it where an answer stops. Padding
        # cannot play that role because padded positions are masked out below.
        eos_token = getattr(self.tokenizer, 'eos_token', None)
        if eos_token and not input_text.endswith(eos_token):
            input_text += eos_token

        encoded_dict = self.tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop only the batch dimension; a bare squeeze() would also collapse
        # the sequence dimension when max_length == 1.
        input_ids = encoded_dict['input_ids'].squeeze(0)
        attention_mask = encoded_dict['attention_mask'].squeeze(0)

        # Language modeling objective: predict the input itself, but exclude
        # padding so the loss is computed on real tokens only.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

    def construct_input(self, row):
        """Render one row as the conversational text fed to the tokenizer.

        Args:
            row: Mapping-like row providing 'context', 'question' and 'answer'.

        Returns:
            The formatted dialogue string.
        """
        context = row['context']
        question = row['question']
        answer = row['answer']
        # Construct the input in a conversational format
        input_text = f"{context}\nUser: {question}\nAssistant: {answer}"
        return input_text


In [None]:
# Data Loading and Preprocessing Function
def load_data(file_path, train_ratio=0.8):
    """Load the CSV dataset and split it chronologically into train/validation.

    Rows are sorted by the 'timestamp' column; the earliest ``train_ratio``
    share becomes the training set and the remaining rows the validation set.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.
        train_ratio: Fraction of rows, between 0 and 1 inclusive, assigned to
            the training split. Defaults to 0.8.

    Returns:
        Tuple ``(df_train, df_val)`` of DataFrames.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
        KeyError: If the CSV has no 'timestamp' column.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio!r}")

    logger.info("Loading dataset...")
    df = pd.read_csv(file_path)
    if 'timestamp' not in df.columns:
        raise KeyError("Dataset must contain a 'timestamp' column for the chronological split")
    df = df.sort_values(by='timestamp').reset_index(drop=True)

    # Positional split: everything before train_size is training data.
    train_size = int(train_ratio * len(df))
    df_train = df.iloc[:train_size]
    df_val = df.iloc[train_size:]

    logger.info(f"Dataset loaded. Train size: {len(df_train)}, Val size: {len(df_val)}")

    return df_train, df_val

# Collate Function for DataLoader
def collate_fn(batch):
    """Stack per-example tensors into batch tensors on the module-level ``device``.

    Args:
        batch: Sequence of dicts, each holding 'input_ids', 'attention_mask'
            and 'labels' tensors of matching shape across examples.

    Returns:
        Dict with the same three keys; each value is the examples stacked
        along a new leading dimension and moved to ``device``.
    """
    field_names = ('input_ids', 'attention_mask', 'labels')
    return {
        name: torch.stack([example[name] for example in batch]).to(device)
        for name in field_names
    }

# Main Function
def main():
    """Fine-tune GPT-2 on the conversation dataset and save the result.

    Steps: seed the RNGs, load and split the CSV, build the tokenizer, model
    and data loaders, train for ``EPOCHS`` epochs with AdamW and a linear
    warmup/decay schedule (reporting training and validation loss after each
    epoch), then write the model and tokenizer to ``OUTPUT_DIR``.

    Raises:
        ValueError: If the training split is empty.
    """
    # Paths and Hyperparameters
    root_folder = os.path.abspath(os.getcwd())
    DATA_PATH = os.path.join(root_folder, "Datasets/final_result.csv")  # Update this path
    MODEL_NAME = 'gpt2-medium'  # You can choose 'gpt2', 'gpt2-medium', etc.
    OUTPUT_DIR = os.path.join(root_folder, "Models/1.0v_PersonaGPT")
    MAX_LENGTH = 256
    BATCH_SIZE = 32  # Adjusted for potential GPU memory constraints
    EPOCHS = 3
    LEARNING_RATE = 5e-5
    WARMUP_RATIO = 3  # warmup covers the first (total_steps / WARMUP_RATIO) optimizer steps
    SEED = 42

    set_seed(SEED)

    # Load and preprocess data
    train_data, val_data = load_data(DATA_PATH)
    if len(train_data) == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # averaging the epoch loss over zero batches.
        raise ValueError(f"No training rows found in {DATA_PATH}")

    # Initialize tokenizer and model
    logger.info("Initializing tokenizer and model...")
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a padding token

    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Prepare datasets and dataloaders.
    # pin_memory is intentionally not enabled: collate_fn already returns
    # tensors on `device`, and the DataLoader can only pin CPU tensors.
    # NOTE(review): training batches follow timestamp order (shuffle=False);
    # confirm this is intended rather than shuffled batches.
    dataset = ConstructDataset(train_data, tokenizer, max_length=MAX_LENGTH)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
    )
    val_dataset = ConstructDataset(val_data, tokenizer, max_length=MAX_LENGTH)
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
    )

    # Set up optimizer and scheduler
    logger.info("Setting up optimizer and scheduler...")
    # torch.optim.AdamW replaces the deprecated transformers.AdamW;
    # weight_decay=0.0 keeps the decay-free default of the old optimizer.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=0.0
    )
    total_steps = len(dataloader) * EPOCHS
    # WARMUP_RATIO is a divisor of the total step count, not a step count.
    warmup_steps = total_steps // WARMUP_RATIO
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    # Training Loop
    logger.info("Starting training...")
    for epoch in range(EPOCHS):
        logger.info(f"Epoch {epoch + 1}/{EPOCHS}")
        model.train()  # re-enable training mode after the validation pass
        epoch_loss = 0.0
        progress_bar = tqdm(dataloader, desc="Training", leave=False)
        for batch in progress_bar:
            optimizer.zero_grad()

            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels'],
            )

            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            progress_bar.set_postfix({'loss': batch_loss})

        avg_epoch_loss = epoch_loss / len(dataloader)
        logger.info(f"Average Epoch Loss: {avg_epoch_loss:.4f}")

        # Validation pass on the held-out split (skipped when it is empty).
        if len(val_dataloader) > 0:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for batch in val_dataloader:
                    val_outputs = model(
                        input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels'],
                    )
                    val_loss += val_outputs.loss.item()
            avg_val_loss = val_loss / len(val_dataloader)
            logger.info(f"Validation Loss: {avg_val_loss:.4f}")

    # Save the fine-tuned model
    logger.info(f"Saving model to {OUTPUT_DIR}...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    logger.info("Model saved successfully.")

if __name__ == '__main__':
    main()

Further options to consider (for example when GPU memory is tight): gradient accumulation, mixed-precision training (with `torch.cuda.amp`), or a smaller model variant.