In [None]:
pip install python-dotenv datasets evaluate rouge_score

In [None]:
# Standard library
import dataclasses
import glob
import json
import logging
import math
import os
import random
import re
import string
import sys
import warnings
from dataclasses import dataclass, field
from typing import Dict, List, Optional

# Third-party
import evaluate
import numpy as np
import pandas as pd
import torch
import torch.multiprocessing as mp
import yaml
#import torch_xla.distributed.xla_multiprocessing as xmp
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import (
    HfArgumentParser,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EvalPrediction
)
# NOTE: keep this import AFTER the torch.utils.data import above. It rebinds `Dataset`
# to the Hugging Face class, which `Dataset.from_pandas` in load_data relies on.
from datasets import Dataset

# Project-local
from helper_functions import find_repository_folder

In [None]:
# Read the notebook configuration. `yaml.safe_load` returns None for an empty file,
# and a present-but-empty `training_parameters:` key is also None, so both fall back
# to an empty dict instead of failing on `.get`.
with open('config.yaml', 'r') as f:
    full_config = yaml.safe_load(f) or {}

training_params = full_config.get('training_parameters') or {}

ROOT_PATH = find_repository_folder()
# DATA_PATH / TRAIN_PATH / VAL_PATH have no default: if a key is missing, `.get`
# returns None and os.path.join raises TypeError.
DATA_PATH                   = os.path.join(ROOT_PATH, training_params.get("DATA_PATH"))
TRAIN_PATH                  = os.path.join(ROOT_PATH, training_params.get("TRAIN_PATH"))
VAL_PATH                    = os.path.join(ROOT_PATH, training_params.get("VAL_PATH"))
OUTPUT_DIR                  = os.path.join(ROOT_PATH, training_params.get("OUTPUT_DIR", "Models/PersonaGPT"))
SAVE_STRATEGY               = training_params.get("SAVE_STRATEGY", "steps")
OVERWRITE_OUTPUT_DIR        = training_params.get("OVERWRITE_OUTPUT_DIR", True)
MODEL_NAME                  = training_params.get("MODEL_NAME", "google/mt5-base")
TRAIN_SIZE                  = training_params.get("TRAIN_SIZE", 0.9)
MAX_SEQ_LENGTH              = training_params.get("MAX_SEQ_LENGTH", 256)
TARGET_MAX_LENGTH           = training_params.get("TARGET_MAX_LENGTH", 128)
NUM_TRAIN_EPOCHS            = training_params.get("NUM_TRAIN_EPOCHS", 3)
PER_DEVICE_TRAIN_BATCH_SIZE = training_params.get("PER_DEVICE_TRAIN_BATCH_SIZE", 8)
PER_DEVICE_EVAL_BATCH_SIZE  = training_params.get("PER_DEVICE_EVAL_BATCH_SIZE", 8)
LEARNING_RATE               = training_params.get("LEARNING_RATE", 5e-5)
WARMUP_STEPS                = training_params.get("WARMUP_STEPS", 500)
SEED                        = training_params.get("SEED", 42)
FP16                        = training_params.get("FP16", True)
EVALUATION_STRATEGY         = training_params.get("EVALUATION_STRATEGY", "steps")
EVAL_STEPS                  = training_params.get("EVAL_STEPS", 500)
SAVE_STEPS                  = training_params.get("SAVE_STEPS", 1000)
LOGGING_STEPS               = training_params.get("LOGGING_STEPS", 100)
SAVE_TOTAL_LIMIT            = training_params.get("SAVE_TOTAL_LIMIT", 2)
MAX_LENGTH                  = training_params.get("MAX_LENGTH", 128)
NUM_BEAMS                   = training_params.get("NUM_BEAMS", 5)
DATASET_LANGUAGE            = training_params.get("DATASET_LANGUAGE", "en")

In [None]:
# Short-run overrides for Google Colab experiments. TODO: delete later.
# These deliberately replace the values read from config.yaml.
EVAL_STEPS = 30
WARMUP_STEPS = 30
LOGGING_STEPS = 30  # the earlier duplicate assignment (5) was a dead store
SAVE_STEPS = 5
SAVE_TOTAL_LIMIT = 1

In [None]:
# tokenizer.batch_encode_plus

# Preparing Dataset

In [None]:
def change_prompts(language: str, df: pd.DataFrame = None):
    """
    Return the question/context prompt labels for the dataset language.

    language: str: Language code of the dataset. "uk" (case-insensitive) selects the
        Ukrainian prompts; any other code falls back to the English prompts.
    df: pd.DataFrame: Optional. When given, its "context" column is updated in place:
        for "uk", every cell equal to "Time Gap" is replaced with the Ukrainian
        placeholder; for any other language the column is left untouched.

    Returns (q_prompt, c_prompt) when df is None, otherwise (df, q_prompt, c_prompt),
    including when df is empty, so callers can always unpack three values.
    """
    if language.lower() == "uk":  # type: ignore
        q_prompt = "Питання"
        c_prompt = "Контекст"
        context_label = "Відсутній контекст"
    else:
        q_prompt = "Question"
        c_prompt = "Context"
        # No translated placeholder exists for the fallback language, so the
        # "Time Gap" marker is kept as-is (previously this branch left
        # `context_label` unbound and crashed as soon as a df was passed).
        context_label = None

    if df is None:
        return q_prompt, c_prompt

    if context_label is not None and not df.empty:
        df["context"] = [context_label if x == "Time Gap" else x for x in df["context"]]
    return df, q_prompt, c_prompt

In [None]:
def normalize_answer(s):
    """Lower-case the text, trim it and collapse every whitespace run to one space."""
    lowered = s.lower().strip()
    # Splitting on whitespace and re-joining removes repeated/odd whitespace.
    collapsed = " ".join(lowered.split())
    # Final regex pass kept from the original pipeline.
    return re.sub(r"\s+", " ", collapsed)

def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of `df` ready to be turned into model inputs.

    The "question" and "context" columns are cast to pandas' string dtype,
    lower-cased and stripped; "timestamp" is parsed with pd.to_datetime
    (format="mixed"). The input frame itself is not modified.
    """
    def _clean_text(column: pd.Series) -> pd.Series:
        return column.astype('string').str.lower().str.strip()

    dataframe = df.copy()
    for text_column in ('question', 'context'):
        dataframe[text_column] = _clean_text(dataframe[text_column])
    dataframe['timestamp'] = pd.to_datetime(dataframe["timestamp"], format="mixed")
    return dataframe

def load_data():
  """
  Build the tokenized train/validation datasets and cache them to disk.

  Reads the CSV at DATA_PATH, applies the language-specific prompts and the
  prepare_df clean-up, splits by TRAIN_SIZE (seeded with SEED so the split is
  reproducible), tokenizes both splits and writes them to TRAIN_PATH / VAL_PATH
  with torch.save. Returns nothing.
  """
  def preprocess_function(batch):
    # One normalized "<context prompt>: <context> <question prompt>: <question>"
    # string per row. normalize_answer works on a single string, so it is applied
    # per element rather than to the whole list.
    inputs = [normalize_answer(f"{c_prompt}: {context} {q_prompt}: {question}")
              for context, question in zip(batch['context'], batch['question'])]
    targets = batch['answer']

    # NOTE(review): 256 matches the literal used by generate_response; consider
    # driving both from MAX_SEQ_LENGTH / TARGET_MAX_LENGTH together.
    model_inputs = tokenizer(inputs, padding='max_length', truncation=True, max_length=256)
    labels = tokenizer(targets, padding='max_length', truncation=True, max_length=256)

    # NOTE(review): label padding is kept as pad-token ids (not -100), so padded
    # positions presumably contribute to the loss — confirm this is intended.
    model_inputs = {key: model_inputs[key] for key in ['input_ids', 'attention_mask']}
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

  # Load the tokenizer
  tokenizer = MT5Tokenizer.from_pretrained(MODEL_NAME)
  # Load your CSV data
  df = pd.read_csv(DATA_PATH)

  # For ukrainian dataset only, changes the prompts for ukrainian version.
  df, q_prompt, c_prompt = change_prompts(language=DATASET_LANGUAGE, df=df)

  # Create Hugging Face Dataset from the *prepared* frame (the prepared frame was
  # previously computed and then discarded in favour of the raw one).
  prepared_df = prepare_df(df)
  dataset = Dataset.from_pandas(prepared_df)
  dataset = dataset.train_test_split(train_size=TRAIN_SIZE, seed=SEED)
  train_dataset = dataset['train']
  valid_dataset = dataset['test']

  train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
  valid_dataset = valid_dataset.map(preprocess_function, batched=True, remove_columns=valid_dataset.column_names, load_from_cache_file=False)

  # Save the dataset directly for training
  torch.save(train_dataset, TRAIN_PATH)
  torch.save(valid_dataset, VAL_PATH)

In [None]:
#load_data()

# Setting up the training

In [None]:
# Module-level logger for the notebook.
logger = logging.getLogger(__name__)


@dataclass
class ModelArguments:
    """
    Selects the pretrained model, tokenizer and download cache used for fine-tuning.
    """

    # Required: has no default value.
    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models",
        },
    )
    # Optional override; None means "not specified".
    tokenizer_name: Optional[str] = field(
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name",
        },
        default=None,
    )
    # Optional cache directory; None means "not specified".
    cache_dir: Optional[str] = field(
        metadata={
            "help": "Where do you want to store the pretrained models downloaded from s3",
        },
        default=None,
    )

@dataclass
class DataTrainingArguments:
    """
    Locations of the cached datasets and the length limits for training and eval.
    """

    # Defaults come from the notebook constants; the literal path is only used
    # when the constant is falsy.
    train_file_path: Optional[str] = field(
        metadata={"help": "Path for cached train dataset"},
        default=TRAIN_PATH or 'Datasets/train_dataset.pt',
    )
    valid_file_path: Optional[str] = field(
        metadata={"help": "Path for cached valid dataset"},
        default=VAL_PATH or 'Datasets/val_dataset.pt',
    )
    max_len: Optional[int] = field(
        metadata={"help": "Max input length for the source text"},
        default=MAX_LENGTH,
    )
    target_max_len: Optional[int] = field(
        metadata={"help": "Max input length for the target text"},
        default=TARGET_MAX_LENGTH,
    )


# Training

In [None]:
# Seed every random number generator used in this notebook for reproducibility
def set_seed(seed: int):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Load the `evaluate` metrics once at module level; compute_metrics reads `bleu`
# and `rouge` as globals.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
# Not read by compute_metrics, which assigns its own local named `perplexity`.
perplexity = evaluate.load("perplexity")

def compute_metrics(eval_pred: EvalPrediction):
    """
    Compute BLEU, ROUGE-2 and (when losses are available) perplexity for an eval pass.

    eval_pred: EvalPrediction holding the model predictions and the label ids.

    Returns a dict with the keys 'bleu', 'rouge2' and 'perplexity'; 'perplexity'
    is None when `eval_pred` carries no losses.

    Relies on the module-level `tokenizer`, `bleu` and `rouge` objects.
    """
    # Read the fields by attribute: iterating an EvalPrediction can yield a third
    # element (the inputs), which breaks two-name tuple unpacking.
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids

    # Some models return a tuple whose first element holds the predictions.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        # A 3-D array is assumed to be per-token logits; reduce it to token ids.
        predictions = predictions.argmax(axis=-1)
    labels = np.asarray(labels)

    # -100 is used as an "ignore" padding value and is not a valid token id, so
    # map it back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id  # type: ignore
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) # type: ignore
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # type: ignore

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # One reference list per prediction.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    # BLEU & ROUGE
    bleu_metrics = bleu.compute(predictions=decoded_preds, references=decoded_labels, tokenizer=lambda x: x.split())
    rouge_metrics = rouge.compute(predictions=decoded_preds, references=decoded_labels, tokenizer=lambda x: x.split())

    # Perplexity = exp(mean eval loss). The local is named `ppl` so it does not
    # shadow the module-level `perplexity` metric object.
    losses = getattr(eval_pred, 'losses', None)
    ppl = math.exp(losses.mean()) if losses is not None else None

    return {
        'bleu': bleu_metrics['bleu'],
        'rouge2': rouge_metrics['rouge2'],
        'perplexity': ppl
    }

def main():
  """
  Fine-tune the MT5 model and, when evaluation is enabled, evaluate it.

  Model and data arguments are parsed from args.json; the training arguments are
  built from the notebook-level constants. The cached datasets are (re)built with
  load_data() when either file is missing. After training, the model and tokenizer
  are saved to the output directory.

  Returns the dict of evaluation metrics (empty when evaluation does not run).
  Raises ValueError when the output directory is non-empty and overwriting is off.
  """
  # compute_metrics looks up `tokenizer` as a module-level name, so the tokenizer
  # loaded below is published globally instead of staying local to this function.
  global tokenizer

  parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

  # Only the model/data arguments from args.json are used; the training arguments
  # are rebuilt from the notebook constants right below.
  model_args, data_args, _ = parser.parse_json_file(json_file=os.path.abspath('args.json'))

  training_args = Seq2SeqTrainingArguments(
      output_dir=OUTPUT_DIR,
      save_strategy=SAVE_STRATEGY,
      overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
      per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
      per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
      gradient_accumulation_steps=4,
      learning_rate=LEARNING_RATE,
      num_train_epochs=NUM_TRAIN_EPOCHS,
      warmup_steps=WARMUP_STEPS,
      # Mixed precision is only requested when a CUDA device is present.
      fp16=FP16 and torch.cuda.is_available(),
      evaluation_strategy=EVALUATION_STRATEGY,
      eval_steps=EVAL_STEPS,
      save_steps=SAVE_STEPS,
      logging_steps=LOGGING_STEPS,
      save_total_limit=SAVE_TOTAL_LIMIT,
      max_grad_norm=1.0,
      # compute_metrics decodes the predictions as token ids, so evaluation has to
      # run generate() rather than hand back raw logits.
      predict_with_generate=True,
      generation_max_length=TARGET_MAX_LENGTH,
      # include_inputs_for_metrics is intentionally not enabled: compute_metrics
      # never reads the inputs, and enabling it adds them to the EvalPrediction
      # that compute_metrics receives.
      do_train=True
  )

  # Set up logging
  logging.basicConfig(
      format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
      datefmt='%Y-%m-%d %H:%M:%S',
      level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARNING,
  )
  logger = logging.getLogger(__name__)

  # Refuse to clobber an existing, non-empty output directory unless allowed
  if (
      os.path.exists(training_args.output_dir)
      and os.listdir(training_args.output_dir)
      and training_args.do_train
      and not training_args.overwrite_output_dir
  ):
      raise ValueError(
          f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome or change the output directory."
      )
  # Check for GPU availability (informational only; the Trainer places the model)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  logger.info(f"Using device: {device}")
  set_seed(SEED)

  # Load the tokenizer and the model
  tokenizer = MT5Tokenizer.from_pretrained(model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path)
  model = MT5ForConditionalGeneration.from_pretrained(model_args.model_name_or_path,
                                                      cache_dir=model_args.cache_dir if model_args.cache_dir else None)

  # Load the dataset; rebuild the cache when EITHER split is missing
  logger.info("Loading datasets...")
  if not (os.path.exists(TRAIN_PATH) and os.path.exists(VAL_PATH)):
    load_data()

  # NOTE(review): these files are pickled dataset objects written by load_data via
  # torch.save — only load files you created yourself. Newer torch releases may
  # need `weights_only=False` here; confirm against the installed version.
  train_dataset = torch.load(data_args.train_file_path)
  valid_dataset = torch.load(data_args.valid_file_path)
  data_collator = DataCollatorForSeq2Seq(
     tokenizer=tokenizer,
     model=model,
     padding=True)
  trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      compute_metrics=compute_metrics,
      train_dataset=train_dataset,
      eval_dataset=valid_dataset,
      data_collator=data_collator,
      # Custom optimizer; passing None lets the Trainer build the LR scheduler.
      optimizers=(torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE), None)
  )

  # Training
  if training_args.do_train:
    # Check for available checkpoints; pick the most recently modified one
    if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.resume_from_checkpoint is None:
      checkpoint_dirs = sorted(
          glob.glob(os.path.join(training_args.output_dir, "checkpoint-*")),
          key=os.path.getmtime,
          reverse=True
      )
      if checkpoint_dirs:
          # Only announce a resume when a checkpoint was actually found.
          logger.info("*** Starting from a checkpoint ***")
          training_args.resume_from_checkpoint = checkpoint_dirs[0]

    logger.info("*** Train ***")
    trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
    trainer.save_model(training_args.output_dir)
    tokenizer.save_pretrained(training_args.output_dir)

  # Evaluation (main process only)
  results = {}
  if training_args.do_eval and training_args.local_rank in [-1, 0]:
      logger.info("*** Evaluate ***")

      eval_output = trainer.evaluate()

      output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
      with open(output_eval_file, "w") as writer:
          logger.info("***** Eval results *****")
          for key in sorted(eval_output.keys()):
              logger.info("  %s = %s", key, str(eval_output[key]))
              writer.write("%s = %s\n" % (key, str(eval_output[key])))

      results.update(eval_output)

  return results


In [None]:
# Persist the arguments that main() parses from args.json with HfArgumentParser.
# `json` is imported with the other modules at the top of the notebook.
args_dict = {
  "output_dir": OUTPUT_DIR,
  "model_name_or_path": MODEL_NAME,
  "max_len": MAX_LENGTH ,
  "target_max_len": TARGET_MAX_LENGTH,
  "save_strategy": SAVE_STRATEGY,
  "overwrite_output_dir": OVERWRITE_OUTPUT_DIR,
  "per_device_train_batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
  "per_device_eval_batch_size": PER_DEVICE_EVAL_BATCH_SIZE,
  "gradient_accumulation_steps": 4,
  "learning_rate": LEARNING_RATE,
  "num_train_epochs": NUM_TRAIN_EPOCHS,
  "do_train": True,
  "warmup_steps": WARMUP_STEPS,
  "fp16": FP16,
  "evaluation_strategy": EVALUATION_STRATEGY,
  "eval_steps": EVAL_STEPS,
  "save_steps": SAVE_STEPS,
  "logging_steps": LOGGING_STEPS,
  # Taken from the config constant instead of a hard-coded literal.
  "save_total_limit": SAVE_TOTAL_LIMIT
}

with open('args.json', 'w') as f:
  json.dump(args_dict, f)

In [None]:
#main()

In [None]:
#xmp.spawn(_mp_fn, args=(), nprocs=8, start_method='fork')

# Evaluation

In [None]:
def load_model(save_dir):
    """Load the fine-tuned MT5 model and its tokenizer from `save_dir`.

    Returns a (model, tokenizer) tuple.
    """
    loaded_model = MT5ForConditionalGeneration.from_pretrained(save_dir)
    return loaded_model, MT5Tokenizer.from_pretrained(save_dir)

In [None]:
# Prompt labels for the configured dataset language.
q_prompt, c_prompt = change_prompts(language=DATASET_LANGUAGE)

# Load from the configured output directory — the location main() saves the model
# and tokenizer to — instead of a hard-coded path that ignores config.yaml.
save_dir = OUTPUT_DIR
model, tokenizer = load_model(save_dir)

In [None]:
# Load the cached validation split and expose the model inputs as torch tensors.
columns = ['input_ids', 'attention_mask', 'labels']
valid_dataset = torch.load(VAL_PATH)
#valid_dataset = valid_dataset.select(range()) # To limit the amount of samples  
valid_dataset.set_format(type='torch', columns=columns)
# Batches of 32 examples for the generation loop.
dataloader = DataLoader(dataset=valid_dataset, batch_size=32)

In [None]:
# Generate an answer for every validation example, batch by batch.
answers = []
for batch in tqdm(dataloader):
    outs = model.generate(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        max_length=MAX_SEQ_LENGTH,
        num_beams=NUM_BEAMS,
        early_stopping=True,
    )
    # batch_decode decodes each generated sequence in turn.
    outs = tokenizer.batch_decode(outs, skip_special_tokens=True)
    answers.extend(outs)

# Drop any leftover "extra_id_N" fragments and surrounding whitespace.
answers = [re.sub(r"extra_id_\d+", "", out).strip() for out in answers]

## Generating using MT5


In [None]:
def construct_input(question: str, context: str, timestamp = None):
    """
    Constructs the input the same as the training data for outputs accuracy.

    Returns "<c_prompt>: <context> <q_prompt>: <question>", built from the
    module-level prompt labels. `timestamp` is accepted for backward compatibility
    but is not part of the prompt (the default it used to compute was never used).

    NOTE(review): load_data additionally passes its inputs through normalize_answer;
    this function does not — confirm which form the deployed model was trained on.
    """
    return f"{c_prompt}: {context} {q_prompt}: {question}"

In [None]:
def generate_response(question: str, context: str, timestamp = None) -> str:
    """Generate a response based on question, context, timestamp.

    Builds the prompt with construct_input, tokenizes it padded/truncated to 256
    tokens, runs beam-search generation with the module-level model and returns
    the decoded first sequence.
    """
    prompt = construct_input(question=question, context=context, timestamp=timestamp)
    encoded = tokenizer(prompt, padding='max_length', truncation=True, max_length=256)

    # Add the batch dimension expected by generate().
    input_ids = torch.tensor(encoded['input_ids']).unsqueeze(0)
    attention_mask = torch.tensor(encoded['attention_mask']).unsqueeze(0)

    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=MAX_SEQ_LENGTH,
        num_beams=NUM_BEAMS,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
def get_random_response(df: pd.DataFrame, samples=3) -> None:
    """
    Visualization to look at the outputs on dataset examples.

    Picks `samples` random rows (with replacement) from `df`, and for each prints the
    question, its timestamp and the response produced by generate_response.

    df: frame with "question", "context" and "timestamp" columns.
    samples: number of rows to print.

    Returns None; an empty frame prints nothing.
    """
    # random.randint(0, -1) would raise on an empty frame.
    if df.empty:
        return None

    for _ in range(samples):
        idx = random.randint(0, len(df) - 1)
        # Positional lookup: `idx` is a row position, which only coincides with the
        # index label when the frame has a default RangeIndex.
        selected_row = df.iloc[idx]

        question = selected_row["question"]
        context = selected_row["context"]
        timestamp = selected_row["timestamp"]

        print("-" * 50)
        print(f"Question: {question}, timestamp: {timestamp}")
        print(f"Response {idx}:\n{generate_response(question, context, timestamp)}")
        print("-" * 50)
    return None


# Print model responses for 100 random rows of the raw dataset.
# The Ukrainian prompts are selected explicitly here ("uk").
df = pd.read_csv(DATA_PATH)
df, q_prompt, c_prompt = change_prompts(language="uk", df=df)
get_random_response(df=df, samples=100)

In [None]:
# Reload the tokenizer from the same directory the model was loaded from
# (`save_dir`), instead of a path relative to the current working directory.
# MT5Tokenizer is already imported at the top of the notebook.
tokenizer = MT5Tokenizer.from_pretrained(save_dir)

In [None]:
# Free-form example input (not wrapped in the context/question prompt format)
input_text = "нахуя ти вставляєш одну хуйню згенеруй щось нормальне"

# Tokenize into PyTorch tensors, truncating anything beyond 128 tokens
encoded_input = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True)

# Generate and keep the first (only) returned sequence
outs = model.generate(
    input_ids=encoded_input['input_ids'],
    attention_mask=encoded_input['attention_mask'],
    max_length=MAX_SEQ_LENGTH,
)[0]
decoded_output = tokenizer.decode(outs, skip_special_tokens=True)

In [None]:
# Show the generated text as the cell output.
decoded_output