In [None]:
# pip install python-dotenv datasets

In [None]:
import os
import yaml
import sys
import logging
import warnings
import pandas as pd
import torch
import torch.multiprocessing as mp
#import torch_xla.distributed.xla_multiprocessing as xmp
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, EvalPrediction
from transformers import TrainingArguments, Trainer
from transformers import (
    HfArgumentParser,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
from tqdm.auto import tqdm
from datasets import Dataset
import dataclasses
from dataclasses import dataclass, field
from typing import Dict, List, Optional

In [None]:
# Load the training configuration. Every value below can be overridden under the
# `training_parameters` key of config.yaml; the literal defaults apply otherwise.
with open('config.yaml', 'r', encoding='utf-8') as f:
    # An empty YAML document parses to None; fall back to an empty mapping.
    full_config = yaml.safe_load(f) or {}

# `training_parameters: ` with no children also parses to None.
training_params = full_config.get('training_parameters') or {}

# All relative paths are resolved against the parent of the current working directory.
ROOT_PATH = os.path.dirname(os.getcwd())

_data_path = training_params.get("DATA_PATH")
if not _data_path:
    # Without this check os.path.join(ROOT_PATH, None) fails with an opaque TypeError.
    raise ValueError(
        "config.yaml: 'training_parameters.DATA_PATH' is required "
        "(path to the CSV dataset, relative to the parent of the working directory)."
    )

DATA_PATH                   = os.path.join(ROOT_PATH, _data_path)
# The cached-dataset fallbacks mirror the ones declared in DataTrainingArguments.
TRAIN_PATH                  = os.path.join(ROOT_PATH, training_params.get("TRAIN_PATH") or "Datasets/train_dataset.pt")
VAL_PATH                    = os.path.join(ROOT_PATH, training_params.get("VAL_PATH") or "Datasets/valid_dataset.pt")
OUTPUT_DIR                  = os.path.join(ROOT_PATH, training_params.get("OUTPUT_DIR", "Models/PersonaGPT"))
OVERWRITE_OUTPUT_DIR        = training_params.get("OVERWRITE_OUTPUT_DIR", True)
MODEL_NAME                  = training_params.get("MODEL_VERSION", "google/mt5-base")
TRAIN_SIZE                  = training_params.get("TRAIN_SIZE", 0.9)
MAX_SEQ_LENGTH              = training_params.get("MAX_SEQ_LENGTH", 256)
TARGET_MAX_LENGTH           = training_params.get("TARGET_MAX_LENGTH", 128)
NUM_TRAIN_EPOCHS            = training_params.get("NUM_TRAIN_EPOCHS", 3)
PER_DEVICE_TRAIN_BATCH_SIZE = training_params.get("PER_DEVICE_TRAIN_BATCH_SIZE", 8)
PER_DEVICE_EVAL_BATCH_SIZE  = training_params.get("PER_DEVICE_EVAL_BATCH_SIZE", 8)
LEARNING_RATE               = training_params.get("LEARNING_RATE", 5e-5)
WARMUP_STEPS                = training_params.get("WARMUP_STEPS", 500)
SEED                        = training_params.get("SEED", 42)
FP16                        = training_params.get("FP16", True)
EVALUATION_STRATEGY         = training_params.get("EVALUATION_STRATEGY", "steps")
EVAL_STEPS                  = training_params.get("EVAL_STEPS", 500)
SAVE_STEPS                  = training_params.get("SAVE_STEPS", 1000)
LOGGING_STEPS               = training_params.get("LOGGING_STEPS", 100)
SAVE_TOTAL_LIMIT            = training_params.get("SAVE_TOTAL_LIMIT", 2)
MAX_LENGTH                  = training_params.get("MAX_LENGTH", 128)
NUM_BEAMS                   = training_params.get("NUM_BEAMS", 5)
DATASET_LANGUAGE            = training_params.get("DATASET_LANGUAGE", "en")
# Accepted spellings of DATASET_LANGUAGE that select the Ukrainian prompts.
UK_PRONOUNCES               = ['ukrainian', 'ukraine', 'ua', 'ukr', 'uk']

# Dataset configuration

# MT5 Model

In [None]:
# tokenizer.batch_encode_plus

In [None]:
def load_data():
    """
    Build the tokenized train/validation datasets from the CSV at DATA_PATH and cache them.

    Steps: read the CSV, pick the prompt labels for DATASET_LANGUAGE, split into
    train/validation (TRAIN_SIZE, seeded with SEED), render each row into
    ``input_text`` / ``target_text``, tokenize, keep only the model columns and
    save both splits with ``torch.save`` to TRAIN_PATH and VAL_PATH.

    The CSV is expected to provide the columns ``question``, ``context``,
    ``date`` and ``answer`` (these are the keys read below).

    Returns:
        tuple: ``(train_dataset, valid_dataset)`` as saved to disk.
    """
    model_columns = ['input_ids', 'attention_mask', 'labels']

    # Only the tokenizer is needed to prepare data; the model itself is loaded in main().
    tokenizer = MT5Tokenizer.from_pretrained(MODEL_NAME)

    # Load your CSV data
    df = pd.read_csv(DATA_PATH)

    if UK_PRONOUNCES and DATASET_LANGUAGE.lower() in UK_PRONOUNCES:  # type: ignore
        # Replace the "Time Gap" placeholder with its Ukrainian translation.
        # Uses the lowercase `context` column, the same one prepare_input reads.
        df["context"] = df["context"].replace("Time Gap", "Відсутній контекст")
        q_prompt = "Питання"
        c_prompt = "Контекст"
    else:
        q_prompt = "Question"
        c_prompt = "Context"

    def prepare_input(row):
        """Render one CSV row into the model's source and target strings."""
        question = row['question']
        context = row['context']
        date = row['date']

        row["input_text"] = f"{date} {c_prompt}: {context} {q_prompt}: {question}"
        row["target_text"] = f"{row['answer']}"
        return row

    def tokenize(batch):
        """Tokenize a batch of rows; padded label positions become -100 so the loss ignores them."""
        tokenized_input = tokenizer(
            batch['input_text'], padding='max_length', truncation=True, max_length=MAX_SEQ_LENGTH
        )
        tokenized_target = tokenizer(
            batch['target_text'], padding='max_length', truncation=True, max_length=TARGET_MAX_LENGTH
        )

        pad_id = tokenizer.pad_token_id
        labels = [
            [-100 if token == pad_id else token for token in label]
            for label in tokenized_target['input_ids']
        ]

        return {
            'input_ids': tokenized_input['input_ids'],
            'attention_mask': tokenized_input['attention_mask'],
            'labels': labels,
        }

    # Create Hugging Face Dataset; seed the shuffle so the split is reproducible.
    splits = Dataset.from_pandas(df).train_test_split(train_size=TRAIN_SIZE, seed=SEED)
    train_dataset = splits['train'].map(prepare_input)
    valid_dataset = splits['test'].map(prepare_input, load_from_cache_file=False)

    train_dataset = train_dataset.map(tokenize, batched=True)
    valid_dataset = valid_dataset.map(tokenize, batched=True, load_from_cache_file=False)

    # Drop everything the model does not consume *before* saving, for both splits.
    col_to_remove = [col for col in train_dataset.column_names if col not in model_columns]
    train_dataset = train_dataset.remove_columns(col_to_remove)
    valid_dataset = valid_dataset.remove_columns(col_to_remove)

    # Return PyTorch tensors when the datasets are indexed.
    train_dataset.set_format(type='torch', columns=model_columns)
    valid_dataset.set_format(type='torch', columns=model_columns)

    # Save the datasets directly for training, creating the target folders if needed.
    for path in (TRAIN_PATH, VAL_PATH):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    torch.save(train_dataset, TRAIN_PATH)
    torch.save(valid_dataset, VAL_PATH)

    return train_dataset, valid_dataset



In [None]:
# Build the tokenized train/validation datasets and cache them at TRAIN_PATH / VAL_PATH.
load_data()

# Setting up the training

In [None]:
# Module-level logger. main() configures logging and then creates its own local logger with the same name.
logger = logging.getLogger(__name__)

# Collator class for our forward method
@dataclass
class T2TDataCollator(DataCollatorForSeq2Seq):
    """
    Seq2seq collator that additionally exposes the legacy ``collate_batch`` hook.

    NOTE(review): current ``Trainer`` versions are expected to invoke the
    collator through ``__call__`` (inherited unchanged from
    ``DataCollatorForSeq2Seq``); ``collate_batch`` looks like a leftover from
    an older collator interface — confirm whether it is still needed.
    """

    def collate_batch(self, batch: List) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.

        Targets are read from ``labels`` (the key written by ``load_data``),
        falling back to the legacy ``target_ids`` key when ``labels`` is absent.
        Padding positions in the targets are replaced with -100.

        Args:
            batch: List of samples, each a mapping with ``input_ids``,
                ``attention_mask`` and ``labels`` (or ``target_ids``), all
                already padded to a common length.

        Returns:
            A dictionary of tensors with keys ``input_ids``, ``attention_mask``
            and ``labels``.
        """
        target_key = 'labels' if batch and 'labels' in batch[0] else 'target_ids'

        # torch.as_tensor leaves tensors as they are and also accepts plain lists.
        input_ids = torch.stack([torch.as_tensor(example['input_ids']) for example in batch])
        attention_mask = torch.stack([torch.as_tensor(example['attention_mask']) for example in batch])
        target_ids = torch.stack([torch.as_tensor(example[target_key]) for example in batch])

        # Use the tokenizer's pad id when available; 0 was the previously hard-coded value.
        pad_token_id = getattr(getattr(self, 'tokenizer', None), 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = 0

        labels = target_ids.clone()
        labels[labels == pad_token_id] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }
@dataclass
class ModelArguments:
    """
    Selects the pretrained checkpoint, tokenizer and download cache used for fine-tuning.
    """

    # Required: there is no default, so it must be supplied by the caller.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )
    # Optional override for the tokenizer location.
    tokenizer_name: Optional[str] = field(
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
        default=None,
    )
    # Optional directory for downloaded pretrained files.
    cache_dir: Optional[str] = field(
        metadata=dict(help="Where do you want to store the pretrained models downloaded from s3"),
        default=None,
    )


@dataclass
class DataTrainingArguments:
    """
    Locations of the cached datasets and the sequence-length limits for training and eval.
    """

    # Falls back to a relative path only when TRAIN_PATH is empty/falsy.
    train_file_path: Optional[str] = field(
        metadata={"help": "Path for cached train dataset"},
        default=TRAIN_PATH or 'Datasets/train_dataset.pt',
    )
    # Falls back to a relative path only when VAL_PATH is empty/falsy.
    valid_file_path: Optional[str] = field(
        metadata={"help": "Path for cached valid dataset"},
        default=VAL_PATH or 'Datasets/valid_dataset.pt',
    )
    # NOTE(review): defaults to MAX_LENGTH although the help text describes the
    # source length; MAX_SEQ_LENGTH may have been intended — confirm.
    max_len: Optional[int] = field(
        metadata={"help": "Max input length for the source text"},
        default=MAX_LENGTH,
    )
    target_max_len: Optional[int] = field(
        metadata={"help": "Max input length for the target text"},
        default=TARGET_MAX_LENGTH,
    )



In [None]:
# Set random seeds for reproducibility
def set_seed(seed: int):
    """
    Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with one value.

    Args:
        seed: Integer passed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def main():
    """
    Fine-tune (and optionally evaluate) the MT5 model described by ``args.json``.

    Reads ``args.json`` from the working directory into ModelArguments,
    DataTrainingArguments and TrainingArguments, builds the cached datasets
    when either split file is missing, trains when ``do_train`` is set, and
    evaluates when ``do_eval`` is set on the main process.

    Returns:
        dict: Evaluation metrics; empty when evaluation did not run.

    Raises:
        ValueError: If the output directory already holds files, training is
            requested and ``overwrite_output_dir`` is not set.
    """
    # Parse our parameters
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    # Load the parameters from the json file. Make sure you have it.
    model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath('args.json'))

    # Set up logging: INFO on the main process, WARNING elsewhere.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARNING,
    )
    logger = logging.getLogger(__name__)

    # Refuse to clobber an existing, non-empty output directory unless asked to.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Check for GPU availability (informational only; device placement is left to the Trainer).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")
    set_seed(SEED)

    # Load the model
    tokenizer = MT5Tokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path)
    model = MT5ForConditionalGeneration.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir or None,
    )

    # Load the dataset, rebuilding the cache when EITHER split file is missing.
    logger.info("Loading datasets...")
    if not os.path.exists(TRAIN_PATH) or not os.path.exists(VAL_PATH):
        logger.info("Loading datasets...")
        load_data()

    # NOTE(review): torch.load unpickles arbitrary objects — only load files this
    # notebook produced. Recent PyTorch releases may also need weights_only=False
    # to unpickle a Dataset; confirm against the installed version.
    train_dataset = torch.load(data_args.train_file_path)
    valid_dataset = torch.load(data_args.valid_file_path)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=T2TDataCollator(tokenizer=tokenizer),
    )

    # Training
    if training_args.do_train:
        logger.info("*** Train ***")
        trainer.train()
        trainer.save_model()
        # Save tokenizer for huggingface, once, from the main process.
        # Prefer the current method name and fall back to the legacy one.
        is_main_process = getattr(trainer, "is_world_process_zero", None) or getattr(trainer, "is_world_master")
        if is_main_process():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        # The directory may not exist yet when evaluation runs without training.
        os.makedirs(training_args.output_dir, exist_ok=True)
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results

def _mp_fn(index):
    """
    Per-process entry point for a multi-process launcher; simply runs main().

    Args:
        index: Process index supplied by the launcher; not used here.
    """
    # For xla_spawn (TPUs)
    main()

In [None]:
import json

# Arguments consumed by main() through HfArgumentParser.parse_json_file; keys must
# match fields of ModelArguments, DataTrainingArguments or TrainingArguments.
# NOTE(review): "max_len" is filled from MAX_LENGTH, not MAX_SEQ_LENGTH — confirm which is intended.
args_dict = {
  "model_name_or_path": MODEL_NAME,
  "max_len": MAX_LENGTH,
  "target_max_len": TARGET_MAX_LENGTH,
  "output_dir": OUTPUT_DIR,
  "overwrite_output_dir": OVERWRITE_OUTPUT_DIR,
  "per_device_train_batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
  # Same per-device naming as the train batch size above (replaces the deprecated per_gpu_* key).
  "per_device_eval_batch_size": PER_DEVICE_EVAL_BATCH_SIZE,
  "gradient_accumulation_steps": 4,
  "learning_rate": LEARNING_RATE,
  "tpu_num_cores": 8,
  "num_train_epochs": NUM_TRAIN_EPOCHS,
  "do_train": True
}

# main() reads this file from the current working directory.
with open('args.json', 'w') as f:
  json.dump(args_dict, f)

In [None]:
# Run fine-tuning (and evaluation, when enabled) with the arguments written to args.json.
main()

In [None]:
#xmp.spawn(_mp_fn, args=(), nprocs=8, start_method='fork')

**Metrics to consider:**

- **BLEU**: for text similarity.
- **ROUGE**: for overlap of phrases.
- **Perplexity**: for model confidence.

## Generating using MT5


In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Specify the path to your saved model directory.
# NOTE(review): this path is relative to the current working directory, whereas
# OUTPUT_DIR (where training saves) is resolved against ROOT_PATH — confirm it
# points at the checkpoint you intend to load.
model_path = 'Models/1.1v_PersonaGPT'  # Replace with your actual path if different

# Load the tokenizer and model; generate() below reads `tokenizer` as a global.
tokenizer = MT5Tokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)

In [None]:
def generate(model, question, context, max_length=200, kwargs=None):
    """
    Generate and print an answer for `question`, optionally conditioned on `context`.

    Uses the module-level `tokenizer` and decodes with beam search (NUM_BEAMS).

    Args:
        model: Model exposing `.to(device)` and `.generate(...)`.
        question: Question text placed after the question prompt label.
        context: Context text; omitted from the prompt when null (`pd.notnull`).
        max_length: Maximum length of the generated answer, in tokens.
        kwargs: Optional dict of extra keyword arguments for `model.generate`;
            entries override the defaults set here.

    Returns:
        str: The decoded answer (also printed).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Resolve the prompt labels locally, using the same language rule as load_data,
    # so this function does not depend on names that only exist inside load_data.
    if UK_PRONOUNCES and DATASET_LANGUAGE.lower() in UK_PRONOUNCES:
        q_prompt, c_prompt = "Питання", "Контекст"
    else:
        q_prompt, c_prompt = "Question", "Context"

    # NOTE(review): load_data builds training inputs as
    # "{date} {c_prompt}: {context} {q_prompt}: {question}"; this prompt uses a
    # different order and no date — confirm which format the loaded checkpoint expects.
    if pd.notnull(context):
        input_text = f"{q_prompt}: {question} {c_prompt}: {context}"
    else:
        input_text = f"{q_prompt}: {question}"

    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    generation_options = {
        'max_length': max_length,   # Maximum length of the generated answer
        'num_beams': NUM_BEAMS,     # Beam search for better results
        'early_stopping': True,
    }
    generation_options.update(kwargs or {})

    with torch.no_grad():
        output_ids = model.generate(input_ids=input_ids, **generation_options)

    # Decode the generated IDs to text
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Print the answer
    print("Answer:", answer)
    return answer

# Example query.
# NOTE(review): "Time Gap" appears to be the dataset's placeholder for a missing
# context (load_data replaces it in the Ukrainian setup) — confirm.
question = "що ти там?"
context = "Time Gap"
generate(model, question, context)