In [None]:
import os
import logging
import warnings
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    AdamW,
    get_linear_schedule_with_warmup,
)
from tqdm.auto import tqdm
import random
import numpy as np

from config import training_parameters

In [None]:
# Set up logging
# Root logging configuration: timestamped INFO-level records for the whole notebook.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

# Module-level logger used by every cell below.
logger = logging.getLogger(__name__)

# Set random seeds for reproducibility
def set_seed(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds, in order: Python's `random`, NumPy's legacy global generator,
    PyTorch's default generator, and PyTorch's CUDA generators on all devices.

    Args:
        seed: Integer seed handed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Check for GPU availability
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
logger.info(f"Using device: {device}")


In [None]:
logger.info("Initializing parameters from config...")

# Default paths and hyperparameters. They are always defined first so that a
# partial `training_parameters` dict cannot leave a name undefined for main().
root_folder = os.path.abspath(os.getcwd())
DATA_PATH = os.path.join(root_folder, "Datasets/final_result.csv")  # Update this path
MODEL_NAME = 'gpt2-medium'  # You can choose 'gpt2', 'gpt2-medium', etc.
OUTPUT_DIR = os.path.join(root_folder, "Models/1.0v_PersonaGPT")
train_size = 0.6  # Fraction of the (time-ordered) rows used for training
MAX_LENGTH = 256
BATCH_SIZE = 24
EPOCHS = 3
LEARNING_RATE = 5e-5
WARMUP_RATIO = 5  # NOTE: despite the name, main() hands this to the scheduler as warmup steps
SEED = 42

# Overlay the custom configuration, one module-level variable per key.
# `globals()` is used explicitly: functions such as main() resolve these names
# through the module globals, and mutating `locals()` is only reliable when it
# happens to be the very same dict.
if training_parameters:
    globals().update(training_parameters)
else:
    warnings.warn("Custom parameters are not available. Using default values...")

In [None]:
# Custom Dataset Class
class ConstructDataset(Dataset):
    """Dataset turning (context, question, answer) rows into causal-LM examples.

    Each row is rendered as a short conversation, tokenized, and padded or
    truncated to `max_length`.

    Args:
        dataframe: pandas DataFrame with 'context', 'question' and 'answer' columns.
        tokenizer: Callable tokenizer that returns a mapping containing
            'input_ids' and 'attention_mask' tensors with a leading batch
            dimension of size 1 when called with `return_tensors='pt'`.
        max_length: Fixed token length every example is padded/truncated to.
    """

    # Label value ignored by the cross-entropy loss of Hugging Face LM heads.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row `idx` and return its model inputs.

        Returns:
            dict with 1-D tensors 'input_ids', 'attention_mask' and 'labels'.
            'labels' equals 'input_ids' except at padded positions, which are
            set to `IGNORE_INDEX` so they do not contribute to the loss.
        """
        row = self.data.iloc[idx]
        input_text = self.construct_input(row)
        encoded_dict = self.tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop only the batch dimension; a bare squeeze() would also collapse
        # a length-1 sequence into a 0-D tensor.
        input_ids = encoded_dict['input_ids'].squeeze(0)
        attention_mask = encoded_dict['attention_mask'].squeeze(0)

        # Language modeling objective: predict the input itself. Padding is
        # masked via the attention mask (not the pad id), because the pad
        # token may share its id with a real token such as EOS.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

    def construct_input(self, row):
        """Render one row as '<context>\\nUser: <question>\\nAssistant: <answer>'."""
        context = row['context']
        question = row['question']
        answer = row['answer']
        # Construct the input in a conversational format
        input_text = f"{context}\nUser: {question}\nAssistant: {answer}"
        return input_text


In [None]:
class CheckpointScheduler():
    """Decides when to checkpoint a model/optimizer pair during training.

    Args:
        model: Object exposing `state_dict()`; its weights are checkpointed.
        optimizer: Object exposing `state_dict()`; its state is checkpointed.
        save_dir: Directory that receives `checkpoint_epoch_<epoch>.pth` files.
        monitor: 'val_loss' to track the loss; any other value tracks accuracy.
        mode: 'min' if a lower monitored value is better; any other value
            means higher is better.
        save_best_only: If True, save only when the monitored value improves
            on the best value seen so far (the first value always saves).
        save_freq: Only epochs divisible by this number are considered.
    """

    def __init__(self, model, optimizer, save_dir, monitor='val_loss', mode='min', save_best_only=True, save_freq=1):
        self.model = model
        self.optimizer = optimizer
        self.save_dir = save_dir
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.save_freq = save_freq
        self.best_metric = None  # Best monitored value seen so far

    def save_checkpoint(self, epoch, val_loss=None, val_accuracy=None):
        """Saves the model and optimizer state."""
        state = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
        }
        # torch.save does not create missing directories.
        os.makedirs(self.save_dir, exist_ok=True)
        save_path = os.path.join(self.save_dir, f'checkpoint_epoch_{epoch}.pth')
        torch.save(state, save_path)
        print(f'Checkpoint saved at {save_path}')

    def step(self, epoch, val_loss=None, val_acc=None):
        """
        Decision maker whether to save the model during training step.

        Args:
            epoch: Current epoch number; skipped unless divisible by `save_freq`.
            val_loss: Loss value, used when `monitor == 'val_loss'`.
            val_acc: Accuracy value, used for any other `monitor`.
        """
        if epoch % self.save_freq != 0:
            return

        current_metric = val_loss if self.monitor == 'val_loss' else val_acc
        if current_metric is None:
            # Nothing to compare against; never save on a missing metric.
            return

        if self.save_best_only:
            if self.best_metric is not None:
                # 'min' improves when the value drops, otherwise when it rises.
                if self.mode == 'min':
                    improved = current_metric < self.best_metric
                else:
                    improved = current_metric > self.best_metric
                if not improved:
                    return
            self.best_metric = current_metric

        self.save_checkpoint(epoch, val_loss, val_acc)


In [None]:
# Data Loading and Preprocessing Function
def load_data(file_path):
    """Read the CSV dataset and split it chronologically.

    Rows are ordered by the 'timestamp' column; the first `train_size`
    fraction (module-level setting) becomes the training frame and the
    remainder the validation frame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        Tuple `(df_train, df_val)` of DataFrames.
    """
    logger.info("Loading dataset...")
    ordered = (
        pd.read_csv(file_path)
        .sort_values(by='timestamp')
        .reset_index(drop=True)
    )

    split_at = int(train_size * len(ordered))
    return ordered[:split_at], ordered[split_at:]

# Collate Function for DataLoader
def collate_fn(batch):
    """Stack per-example tensors into batch tensors on the module-level `device`.

    Args:
        batch: Sequence of dicts, each holding 'input_ids', 'attention_mask'
            and 'labels' tensors of identical shape.

    Returns:
        dict with the same three keys, each a stacked tensor moved to `device`.
    """
    # NOTE(review): batches leave this function already on `device`; a
    # DataLoader using it with pin_memory=True would try to pin them, which
    # only CPU tensors support.
    fields = ('input_ids', 'attention_mask', 'labels')
    return {
        field: torch.stack([example[field] for example in batch]).to(device)
        for field in fields
    }

# Validation helper
def _evaluate(model, dataloader):
    """Return the mean LM loss of `model` over `dataloader` (None if it has no batches).

    The model is switched to eval mode; the caller is responsible for
    switching back to train mode afterwards.
    """
    if len(dataloader) == 0:
        return None

    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation", leave=False):
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels'],
            )
            total_loss += outputs.loss.item()
    return total_loss / len(dataloader)


# Main Function
def main():
    """Fine-tune the configured GPT-2 model and save it to OUTPUT_DIR.

    Steps: seed the RNGs, load and split the data, build tokenizer/model,
    train for EPOCHS epochs (evaluating on the validation split and letting
    the checkpoint scheduler decide whether to save after each epoch), then
    write the final model and tokenizer to OUTPUT_DIR.
    """
    set_seed(SEED)

    # Load and preprocess data
    train_data, val_data = load_data(DATA_PATH)

    # Initialize tokenizer and model
    logger.info("Initializing tokenizer and model...")
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a padding token

    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Prepare datasets and dataloaders.
    # pin_memory stays off: collate_fn already places batches on `device`,
    # and pinning is only defined for CPU tensors (it raises for CUDA ones).
    train_dataset = ConstructDataset(train_data, tokenizer, max_length=MAX_LENGTH)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
        pin_memory=False,
    )
    val_dataset = ConstructDataset(val_data, tokenizer, max_length=MAX_LENGTH)
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
        pin_memory=False,
    )

    # Set up optimizer and scheduler
    logger.info("Setting up optimizer and scheduler...")
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)
    total_steps = len(train_dataloader) * EPOCHS
    # WARMUP_RATIO historically held a step count (default 5). A value
    # strictly between 0 and 1 is interpreted as a fraction of all steps;
    # anything else keeps its meaning as an absolute number of steps.
    if 0 < WARMUP_RATIO < 1:
        num_warmup_steps = int(WARMUP_RATIO * total_steps)
    else:
        num_warmup_steps = int(WARMUP_RATIO)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=total_steps
    )

    checkpoint_dir = './checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)  # torch.save needs an existing directory
    checkpoint_scheduler = CheckpointScheduler(
        model=model,
        optimizer=optimizer,
        save_dir=checkpoint_dir,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_freq=1,  # Save every epoch if there is an improvement
    )

    # Training Loop
    logger.info("Starting training...")
    for epoch in range(EPOCHS):
        logger.info(f"Epoch {epoch + 1}/{EPOCHS}")
        model.train()  # _evaluate() leaves the model in eval mode
        epoch_loss = 0.0
        progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
        for batch in progress_bar:
            optimizer.zero_grad(set_to_none=True)

            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels'],
            )

            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            progress_bar.set_postfix({'loss': batch_loss})

        # Reported once per epoch, after every batch has been accumulated.
        avg_epoch_loss = epoch_loss / max(len(train_dataloader), 1)
        logger.info(f"Average Epoch Loss: {avg_epoch_loss:.4f}")

        # Checkpoint decisions are made once per epoch on the validation loss.
        val_loss = _evaluate(model, val_dataloader)
        if val_loss is None:
            logger.warning("Validation split is empty; monitoring the average training loss instead.")
            val_loss = avg_epoch_loss
        else:
            logger.info(f"Validation Loss: {val_loss:.4f}")
        checkpoint_scheduler.step(epoch, val_loss=val_loss, val_acc=None)

    # Save the fine-tuned model
    logger.info(f"Saving model to {OUTPUT_DIR}...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    logger.info("Model saved successfully.")


        

In [None]:
# Entry point: run the full fine-tuning pipeline when executed directly.
if __name__ == "__main__":
    main()