In [None]:
import os
import logging
import warnings
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,


    get_linear_schedule_with_warmup,
)
from tqdm.auto import tqdm
import torch.multiprocessing as mp
import random
import numpy as np

from config import training_parameters

# Attempt 2 
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import TrainingArguments, Trainer
from datasets import Dataset


In [None]:
# Set up logging: INFO-level records with a timestamp, level and logger name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
# Notebook-level logger; the cells and functions below log through it.
logger = logging.getLogger(__name__)

# Set random seeds for reproducibility
def set_seed(seed: int):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Value handed to Python's ``random``, NumPy, and PyTorch
            (both the default generator and all CUDA generators).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Check for GPU availability; fall back to the CPU when CUDA is not present.
# `device` is notebook-level state read by collate_fn, main and the GPT-2 cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")


In [None]:
# Load config and assign all the keys to variables

# Language tags that select the Ukrainian prompt labels. Defined ahead of the
# branch so later cells can test `dataset_language in uk_pronounces` whichever
# branch ran; "uk" is listed because it is the fallback value used below.
uk_pronounces = ['ukrainian', 'ukraine', 'ua', 'ukr', 'uk']

if training_parameters:
    logger.info("Initializing parameters from config...")
    # Every key of the config mapping becomes a notebook-level variable.
    globals().update(training_parameters)
else:
    # Paths and Hyperparameters
    warnings.warn("Custom parameters are not available. Using default values...")
    dataset_language = "uk"
    root_folder = os.path.abspath(os.getcwd())
    DATA_PATH = os.path.join(root_folder, "Datasets/final_result.csv")  # Update this path
    MODEL_NAME = 'gpt2-medium'  # You can choose 'gpt2', 'gpt2-medium', etc.
    OUTPUT_DIR = os.path.join(root_folder, "Models/1.0v_PersonaGPT")
    TRAIN_SIZE = 0.9
    BATCH_SIZE = 24
    EPOCHS = 3
    LEARNING_RATE = 5e-5
    SEED = 42

    # Training !
    # These must be plain scalars: a trailing comma after an assignment would
    # turn each value into a one-element tuple.
    NUM_TRAIN_EPOCHS = 3
    PER_DEVICE_TRAIN_BATCH_SIZE = 8
    PER_DEVICE_EVAL_BATCH_SIZE = 8
    EVAL_STEPS = 500
    SAVE_STEPS = 1000
    WARMUP_STEPS = 500
    EVALUATION_STRATEGY = 'steps'  # strategy names are lowercase
    LOGGING_DIR = 'MODELS/LOGS'
    LOGGING_STEPS = 100
    SAVE_TOTAL_LIMIT = 2
    FP16 = True
    # NOTE(review): MAX_LENGTH is read both by the GPT-2 dataset in main() and
    # by MT5 generation. Only one value can be in effect; 128 is the one that
    # won in the previous double assignment (256, then 128) -- confirm.
    MAX_LENGTH = 128          # Maximum length of the generated answer
    NUM_BEAMS = 5             # Beam search for better results

# Prompt labels used when building model inputs; chosen for both branches so
# the generation cells never hit an undefined name.
if dataset_language in uk_pronounces:  # type: ignore
    q_prompt = "Питання"
    c_prompt = "Контекст"
else:
    q_prompt = "Question"
    c_prompt = "Context"

# Dataset configuration 

In [None]:
# Load your CSV data
df = pd.read_csv(DATA_PATH)

if dataset_language in uk_pronounces:  # type: ignore
    # Replace the "Time Gap" placeholder with its Ukrainian translation.
    # The column is addressed as lowercase 'context', matching every other
    # cell that reads this data ('question', 'context', 'answer').
    # NOTE(review): confirm the CSV header really is lowercase.
    df["context"] = df["context"].replace("Time Gap", "Відсутній контекст")

# MT5 Model

In [None]:
# Load the MT5 checkpoint named by `model_name` together with its tokenizer.
# Both are bound to the notebook-level names `tokenizer` and `model`, which the
# tokenization, Trainer and generation cells read.
model_name = "google/mt5-base"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Combine question and context
def prepare_input(row, question_label=None, context_label=None):
    """Build the text fed to the model for one dataset row.

    The layout matches the one the generation cells use
    (``"<q>: <question> <c>: <context>"``), so the model sees the same prompt
    labels at training and inference time.

    Args:
        row: Mapping-like row exposing ``'question'`` and ``'context'``.
        question_label: Label placed before the question. Defaults to the
            notebook-level ``q_prompt``.
        context_label: Label placed before the context. Defaults to the
            notebook-level ``c_prompt``.

    Returns:
        The prompt string; the context part is omitted when the context is
        null (``None``/NaN).
    """
    # Resolved at call time so the notebook-level labels are always current.
    if question_label is None:
        question_label = q_prompt
    if context_label is None:
        context_label = c_prompt

    question = row['question']
    context = row['context']
    if pd.notnull(context):
        return f"{question_label}: {question} {context_label}: {context}"
    return f"{question_label}: {question}"

# Derive the model input and target columns from the raw frame.
df['input_text'] = df.apply(prepare_input, axis=1)
df['target_text'] = df['answer']

# Create Hugging Face Dataset
# NOTE: `Dataset` here is `datasets.Dataset`; that import comes after the
# torch `Dataset` import in the first cell and therefore shadows it.
dataset = Dataset.from_pandas(df[['input_text', 'target_text']])
# Hold out a (1 - TRAIN_SIZE) fraction for validation; no `seed` is passed.
dataset = dataset.train_test_split(test_size=(1-TRAIN_SIZE))
train_dataset = dataset['train']
val_dataset = dataset['test']

In [None]:
def tokenize(batch, max_input_length=256, max_target_length=128):
    """Tokenize a batch of input/target texts for seq2seq training.

    Uses the notebook-level ``tokenizer``. Inputs and targets are padded and
    truncated to fixed lengths. Padding positions in the labels are replaced
    by -100 so that the loss ignores them instead of training the model to
    emit pad tokens.

    Args:
        batch: Mapping with ``'input_text'`` and ``'target_text'`` lists.
        max_input_length: Fixed token length of the encoder inputs.
        max_target_length: Fixed token length of the labels.

    Returns:
        The same ``batch`` with ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'`` added.
    """
    tokenized_input = tokenizer(
        batch['input_text'],
        padding='max_length',
        truncation=True,
        max_length=max_input_length,
    )
    tokenized_target = tokenizer(
        batch['target_text'],
        padding='max_length',
        truncation=True,
        max_length=max_target_length,
    )

    pad_id = tokenizer.pad_token_id
    batch['input_ids'] = tokenized_input.input_ids
    batch['attention_mask'] = tokenized_input.attention_mask
    # -100 is the index the loss skips; real tokens are kept as they are.
    batch['labels'] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in tokenized_target.input_ids
    ]
    return batch

# Tokenize both splits batch-wise with the function defined in this cell.
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Set the format for PyTorch/TensorFlow
# Only the three columns listed here are exposed, as torch tensors.
columns = ['input_ids', 'attention_mask', 'labels']
train_dataset.set_format(type='torch', columns=columns)
val_dataset.set_format(type='torch', columns=columns)

In [None]:
# Trainer hyperparameters, read from the uppercase configuration variables.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    warmup_steps=WARMUP_STEPS,
    evaluation_strategy=EVALUATION_STRATEGY,
    logging_dir=LOGGING_DIR,
    logging_steps=LOGGING_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
   # Mixed precision is requested only when a CUDA device is available.
   fp16=FP16 if torch.cuda.is_available() else False
)
# Wire the notebook-level model, tokenized splits and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
# NOTE(review): no explicit save of the final model follows this call, while
# the generation section loads from a fixed directory -- confirm how the
# weights get there.
trainer.train()

**Metrics to consider:**

- BLEU: for text similarity.
- ROUGE: for overlap of phrases.
- Perplexity: for model confidence.

## Generating using MT5


In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Specify the path to your saved model directory
# NOTE(review): the fallback OUTPUT_DIR in the configuration cell ends in
# "Models/1.0v_PersonaGPT"; confirm this path points at the run you mean to load.
model_path = 'Models/1.1v_PersonaGPT'  # Replace with your actual path if different

# Load the tokenizer and model
# These rebind the notebook-level `tokenizer` and `model` that the generation
# functions below read.
tokenizer = MT5Tokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)

In [None]:
def generate(model, question, context, max_length=200, kwargs=None):
    """Generate and print an answer for a single question with the MT5 model.

    The prompt is built from the notebook-level ``q_prompt``/``c_prompt``
    labels and encoded with the notebook-level ``tokenizer``.

    Args:
        model: Model exposing ``to`` and ``generate``.
        question: Question text.
        context: Context text; when null (``None``/NaN) the context part of
            the prompt is omitted.
        max_length: Maximum length passed to ``model.generate``.
        kwargs: Optional dict of extra ``model.generate`` arguments; entries
            override the defaults (``max_length``, ``num_beams`` taken from
            ``NUM_BEAMS``, ``early_stopping=True``).

    Returns:
        The decoded answer string (also printed).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    if pd.notnull(context):
        input_text = f"{q_prompt}: {question} {c_prompt}: {context}"
    else:
        input_text = f"{q_prompt}: {question}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    generation_args = {
        'max_length': max_length,    # Maximum length of the generated answer
        'num_beams': NUM_BEAMS,      # Beam search for better results
        'early_stopping': True,
    }
    if kwargs:
        generation_args.update(kwargs)

    with torch.no_grad():
        output_ids = model.generate(input_ids=input_ids, **generation_args)

    # Decode the generated IDs to text
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Print the answer
    print("Answer:", answer)
    return answer

# Single-example smoke test of generate().
question = "що ти там?"
# NOTE(review): when the Ukrainian branch is active, the data-loading cell
# rewrites "Time Gap" to "Відсутній контекст" in the training frame -- confirm
# which placeholder the model should be prompted with here.
context = "Time Gap"
generate(model, question, context)

In [None]:
def batch_generate(model, questions, contexts, max_length=None, num_beams=5):
    """Generate and print answers for several questions in one batch.

    Prompts are built from the notebook-level ``q_prompt``/``c_prompt`` labels
    and encoded with the notebook-level ``tokenizer``.

    Args:
        model: Model exposing ``to`` and ``generate``.
        questions: Sequence of question strings.
        contexts: Sequence of contexts, paired with ``questions`` by position;
            a null or empty context leaves the context part out of the prompt.
        max_length: Maximum length passed to ``model.generate``. ``None``
            means "use the notebook-level ``MAX_LENGTH``", looked up at call
            time.
        num_beams: Beam count passed to ``model.generate``.

    Returns:
        List of decoded answers, one per question (also printed).
    """
    if max_length is None:
        max_length = MAX_LENGTH

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    input_texts = []
    for question, context in zip(questions, contexts):
        # Same null handling as generate(): NaN/None count as "no context".
        if pd.notnull(context) and context:
            input_texts.append(f"{q_prompt}: {question} {c_prompt}: {context}")
        else:
            input_texts.append(f"{q_prompt}: {question}")

    if not input_texts:
        return []

    # Tokenize the input texts
    input_encodings = tokenizer(input_texts, padding=True, truncation=True, return_tensors='pt')
    input_ids = input_encodings.input_ids.to(device)
    attention_mask = input_encodings.attention_mask.to(device)

    # Generate outputs
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True
        )

    # Decode outputs
    answers = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]

    # Print the answers
    for question, answer in zip(questions, answers):
        print(f"Питання: {question}")
        print(f"Відповідь: {answer}")
        print("-" * 50)

    return answers

# Example inputs for the batched generation helper.
questions = [
    "Яка столиця України?",
    "Хто написав 'Кобзар'?",
]
contexts = [
    "Україна розташована в Східній Європі та має багату історію.",
    "Тарас Шевченко був українським поетом і художником.",
]
# Lists go to batch_generate(); generate() handles a single question only.
batch_generate(model, questions, contexts)

# ChatGPT Section 

In [None]:
# Custom Dataset Class
class ConstructDataset(torch.utils.data.Dataset):
    """Map-style torch dataset that turns dataframe rows into LM examples.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is rebound to ``datasets.Dataset`` by a later import
    in this notebook.

    Args:
        dataframe: pandas DataFrame with ``'context'``, ``'question'`` and
            ``'answer'`` columns.
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors with a leading batch dimension.
        max_length: Fixed token length every example is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return input ids, attention mask and labels."""
        row = self.data.iloc[idx]
        input_text = self.construct_input(row)
        encoded_dict = self.tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop only the batch dimension; a bare squeeze() would also collapse
        # a length-1 sequence.
        input_ids = encoded_dict['input_ids'].squeeze(0)
        attention_mask = encoded_dict['attention_mask'].squeeze(0)

        # Language modeling objective: predict the input itself. Padded
        # positions are set to -100 so the loss skips them; the mask (not the
        # token id) is used because padding may share its id with a real token.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

    def construct_input(self, row):
        """Format one row as ``context``, ``User:`` and ``Assistant:`` lines."""
        context = row['context']
        question = row['question']
        answer = row['answer']
        # Construct the input in a conversational format
        input_text = f"{context}\nUser: {question}\nAssistant: {answer}"
        return input_text


In [None]:
class CheckpointScheduler():
    """Decide when to write model/optimizer checkpoints during training.

    Args:
        model: Object exposing ``state_dict()``.
        optimizer: Object exposing ``state_dict()``.
        save_dir: Directory checkpoints are written to (created on demand).
        monitor: ``'val_loss'`` to track the ``val_loss`` argument of
            :meth:`step`; any other value tracks ``val_acc``.
        mode: ``'min'`` if a lower metric is better, otherwise higher is better.
        save_best_only: When true, a metric only triggers a save if it beats
            the best value seen so far (the first value always does).
        save_freq: Stored for callers; not used by the scheduler itself.
    """

    def __init__(self, model, optimizer, save_dir, monitor='val_loss', mode='min', save_best_only=True, save_freq=1):
        """
        Saves the model during training.
        Do not schedule train and val metric simultaneously.
        """
        self.model = model
        self.optimizer = optimizer
        self.save_dir = save_dir
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.save_freq = save_freq
        self.best_metric = None
        self.best_val_metric = None

    def save_checkpoint(self, epoch, val_loss=None, val_accuracy=None, save_files: str = None):
        """Saves the model and optimizer state.

        Args:
            epoch: Epoch number stored in the checkpoint and used in its name.
            val_loss: Loss value stored alongside the weights.
            val_accuracy: Accuracy value stored alongside the weights.
            save_files: ``'train'`` or ``'val'``; selects the file name prefix.

        Raises:
            ValueError: If ``save_files`` is neither ``'train'`` nor ``'val'``.
        """
        if save_files == "train":
            file_name = f'checkpoint_epoch_{epoch}.pth'
        elif save_files == "val":
            file_name = f'val_checkpoint_epoch_{epoch}.pth'
        else:
            raise ValueError("Invalid save_files value. Must be 'train' or 'val'.")

        state = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
        }
        # torch.save does not create missing directories.
        os.makedirs(self.save_dir, exist_ok=True)
        save_path = os.path.join(self.save_dir, file_name)
        torch.save(state, save_path)
        print(f'Checkpoint saved at {save_path}')

    def _is_improvement(self, current_metric):
        """Return True when ``current_metric`` beats the best value so far."""
        if self.best_metric is None:
            return True
        if self.mode == 'min':
            return current_metric < self.best_metric
        return current_metric > self.best_metric

    def step(self, epoch, val_loss=None, val_acc=None, save_files: str = None):
        """
        Decision maker whether to save the model during training step.

        A checkpoint is written at most once per call, when either
        ``save_files`` is given (explicit request, honoured even without a
        metric) or the monitored metric warrants it.

        save_files: 'train' or 'val'; metric-triggered saves without an
            explicit value use the 'train' file name.
        """
        current_metric = val_loss if self.monitor == 'val_loss' else val_acc

        should_save = bool(save_files)
        if current_metric is not None:
            if not self.save_best_only:
                should_save = True
            elif self._is_improvement(current_metric):
                self.best_metric = current_metric
                should_save = True

        if should_save:
            self.save_checkpoint(epoch, val_loss, val_acc, save_files or "train")

In [None]:
# Data Loading and Preprocessing Function
def load_data(file_path, train_size=None):
    """Read the CSV, order it by ``timestamp`` and split it chronologically.

    The rows are sequential in time and the model should train on the newest
    data, so the validation set receives the oldest rows.

    Args:
        file_path: Path of the CSV file; it must contain a ``timestamp`` column.
        train_size: Fraction of rows used for training. ``None`` means "use
            the notebook-level ``TRAIN_SIZE``".

    Returns:
        ``(df_train, df_val)`` -- the newest rows and the oldest rows.
    """
    if train_size is None:
        train_size = TRAIN_SIZE

    logger.info("Loading dataset...")
    df = pd.read_csv(file_path)
    df = df.sort_values(by='timestamp').reset_index(drop=True)

    # Derive the validation size from the rounded training size:
    # int((1 - 0.9) * 10) truncates 0.9999... to 0 and would leave the
    # validation set empty.
    n_val = len(df) - int(round(train_size * len(df)))
    df_val = df.iloc[:n_val]
    df_train = df.iloc[n_val:]

    return df_train, df_val

# Collate Function for DataLoader
def collate_fn(batch):
    """Stack per-example tensors into batch tensors on the notebook's device.

    Args:
        batch: List of dicts, each holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors of equal shape.

    Returns:
        Dict with the same three keys; each value is the stacked tensor moved
        to the notebook-level ``device``.
    """
    field_names = ('input_ids', 'attention_mask', 'labels')
    return {
        field: torch.stack([example[field] for example in batch]).to(device)
        for field in field_names
    }

# Main Function
def main():
    """Fine-tune GPT-2 on the conversation CSV and save the result.

    Steps: seed the RNGs, load and split the data, build the tokenizer, model
    and data loaders, train for ``EPOCHS`` epochs with a linear warmup
    schedule, write one checkpoint per epoch, and finally save the model and
    tokenizer to ``OUTPUT_DIR`` in the format ``from_pretrained`` loads.

    All settings come from the notebook-level configuration variables.
    """
    set_seed(SEED)

    # Load and preprocess data
    train_data, val_data = load_data(DATA_PATH)

    # Initialize tokenizer and model
    logger.info("Initializing tokenizer and model...")
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a padding token

    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Prepare dataset and dataloader
    dataset = ConstructDataset(train_data, tokenizer, max_length=MAX_LENGTH)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
        pin_memory=False,
    )
    # No trailing comma here: it would wrap the dataset in a one-element tuple.
    val_dataset = ConstructDataset(val_data, tokenizer, max_length=MAX_LENGTH)
    # TODO: val_loader is not consumed yet. A validation pass (model.eval(),
    # mean loss over val_loader, checkpoint_scheduler.step(..., save_files="val"))
    # was sketched for this loop but is not wired in.
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
        pin_memory=False,
    )

    # Set up optimizer and scheduler
    logger.info("Setting up optimizer and scheduler...")
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)
    total_steps = len(dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps
    )

    checkpoint_dir = './checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_scheduler = CheckpointScheduler(
        model=model,
        optimizer=optimizer,
        save_dir=checkpoint_dir,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_freq=1,  # Save every epoch if there is an improvement
    )

    # autocast expects the bare device type ("cuda"/"cpu"), not "cuda:0".
    # Half precision is only enabled on CUDA, mirroring the Trainer cell.
    # NOTE(review): fp16 autocast is used without gradient scaling.
    use_amp = device.type == "cuda"

    # Training Loop
    logger.info("Starting training...")
    model.train()
    for epoch in range(EPOCHS):
        logger.info(f"Epoch {epoch + 1}/{EPOCHS}")
        running_loss = 0.0
        mean_loss = float('nan')  # stays NaN if the dataloader yields nothing
        progress_bar = tqdm(dataloader, desc="Training", leave=False)
        for batch_idx, batch in enumerate(progress_bar):
            optimizer.zero_grad(set_to_none=True)

            with torch.autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['labels'],
                )
                loss = outputs.loss

            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()
            mean_loss = running_loss / (batch_idx + 1)
            progress_bar.set_postfix({'loss': mean_loss})

        logger.info(f"Training Loss: {mean_loss:.4f}")
        # One "train" checkpoint per epoch. Checkpointing per batch would
        # serialize the whole model on nearly every step. The epoch's mean
        # training loss is stored in the checkpoint's 'val_loss' slot.
        checkpoint_scheduler.save_checkpoint(epoch, val_loss=mean_loss, save_files="train")

    # Save the fine-tuned model
    logger.info(f"Saving model to {OUTPUT_DIR}...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # OUTPUT_DIR is a directory, so the weights cannot be torch.save()d onto
    # it; save_pretrained writes the files that from_pretrained reads.
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    logger.info("Model saved successfully.")




In [None]:
# Start GPT-2 fine-tuning when this cell is executed as the main module.
if __name__ == '__main__':
  main()

# Generating answers using PersonaGPT

In [None]:
# Load the model
root_folder = os.path.abspath(os.getcwd())
# NOTE(review): this directory must hold files written by save_pretrained;
# confirm it matches the OUTPUT_DIR used for training.
MODEL_PATH = os.path.join(root_folder, "PersonaGPT")

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH)
# GPT-2 has no padding token; reuse end-of-sequence for padding.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
model = model.to(device)

In [None]:
# Layout of one training example: f"{context}\nUser: {question}\nAssistant: {answer}"
# The prompt below fills in context and question and leaves the answer slot
# empty for the model to complete.
prompt = "Time Gap\nUser: Як ти там?\nAssistant: "

def generate(model, prompt, max_length=200):
    """Complete ``prompt`` with the GPT-2 model and print the result.

    Uses the notebook-level ``tokenizer`` and ``device``. Note that this
    definition replaces the earlier MT5 ``generate`` helper of the same name.

    Args:
        model: Causal LM exposing ``eval`` and ``generate``.
        prompt: Text to continue.
        max_length: Upper bound on prompt plus generated tokens; the prompt is
            truncated to this length as well.

    Returns:
        The decoded text, prompt included (also printed).
    """
    # No padding: a single prompt needs none, and right-padding would put pad
    # tokens between the prompt and the continuation. The batch dimension is
    # kept because the model expects 2-D inputs.
    encoded_dict = tokenizer(
        prompt,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        # generate() produces new token ids; a plain forward pass would only
        # return loss/logits for the prompt itself.
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode and print the generated text
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(generated_text)
    return generated_text

In [None]:
# Run the GPT-2 generate() defined in the previous cell on the sample prompt.
generate(model, prompt)