In [None]:
# For Google Colab
#pip install python-dotenv datasets evaluate rouge_score
# pip install --upgrade transformers

# import os
# os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

### Initial code

In [None]:
import os
import yaml
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset
import re
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from datasets import Dataset
from helper_functions import *
from typing import Dict, List

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback,DataCollatorForLanguageModeling

In [None]:
# Load the YAML configuration. Every training hyper-parameter is read from the
# "training_parameters" section; an empty dict is used when that section is absent,
# in which case each .get() below yields None unless a default is given.
with open('config.yaml', 'r') as f:
    full_config = yaml.safe_load(f)

training_params = full_config.get('training_parameters', {})

# find_repository_folder is not defined in this notebook; presumably it comes from the
# star-import of helper_functions and returns the repository root — confirm there.
ROOT_PATH = find_repository_folder()
# DATA_PATH has no default, so os.path.join receives None when the key is missing.
DATA_PATH                   = os.path.join(ROOT_PATH, training_params.get("DATA_PATH"))
OUTPUT_DIR                  = os.path.join(ROOT_PATH, training_params.get("OUTPUT_DIR", "Models/PersonaGPT"))
# NOTE(review): SAVE_STRATEGY, EVALUATION_STRATEGY and SEED are read here but are not
# referenced by any other cell visible in this notebook (strategies and the split seed
# are hard-coded further down).
SAVE_STRATEGY               = training_params.get("SAVE_STRATEGY")
OVERWRITE_OUTPUT_DIR        = training_params.get("OVERWRITE_OUTPUT_DIR")
TRAIN_SIZE                  = training_params.get("TRAIN_SIZE")
TARGET_MAX_LENGTH           = training_params.get("TARGET_MAX_LENGTH")
NUM_TRAIN_EPOCHS            = training_params.get("NUM_TRAIN_EPOCHS")
PER_DEVICE_TRAIN_BATCH_SIZE = training_params.get("PER_DEVICE_TRAIN_BATCH_SIZE")
PER_DEVICE_EVAL_BATCH_SIZE  = training_params.get("PER_DEVICE_EVAL_BATCH_SIZE")
LEARNING_RATE               = training_params.get("LEARNING_RATE")
SEED                        = training_params.get("SEED")
EVALUATION_STRATEGY         = training_params.get("EVALUATION_STRATEGY")
EVAL_STEPS                  = training_params.get("EVAL_STEPS")
SAVE_STEPS                  = training_params.get("SAVE_STEPS")
LOGGING_STEPS               = training_params.get("LOGGING_STEPS")
SAVE_TOTAL_LIMIT            = training_params.get("SAVE_TOTAL_LIMIT")
MAX_LENGTH                  = training_params.get("MAX_LENGTH")
DATASET_LANGUAGE            = training_params.get("DATASET_LANGUAGE")
MODEL_PROMPT                = training_params.get("MODEL_PROMPT")
# Base checkpoint: fine-tuned below and reloaded as the LoRA base for evaluation/generation.
MODEL_NAME = "SherlockAssistant/Mistral-7B-Instruct-Ukrainian"

In [None]:
def change_prompts(language: str = "en", df: pd.DataFrame = None):
    """
    Return the prompt section labels for a dataset language and, optionally,
    localize the "missing context" marker inside a dataframe.

    language: str: Language code of the dataset ("en" or "uk", case-insensitive).
    df: pd.DataFrame: Optional frame with a "context" column. Every cell equal to
        "Time Gap" is replaced by the localized missing-context label. The frame is
        modified in place and also returned.

    Returns:
        (q_prompt, finetune_prompt, c_prompt) when df is None, otherwise
        (df, q_prompt, finetune_prompt, c_prompt). An empty frame is returned
        unchanged but still as part of the 4-tuple, so callers can always unpack
        the same number of values for a given call style.

    Raises:
        ValueError: if language is not a supported language code.
    """
    # One row per language: question label, fine-tune prompt label, context label,
    # and the replacement for the "Time Gap" placeholder.
    prompt_table = {
        "uk": ("Питання", "Підказка", "Контекст", "Відсутній контекст"),
        "en": ("Question", "Prompt", "Context", "Missing context"),
    }

    language_code = language.strip().lower() if isinstance(language, str) else None
    if language_code not in prompt_table:
        # Fail with an explicit message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unsupported dataset language {language!r}; expected one of {sorted(prompt_table)}"
        )

    q_prompt, finetune_prompt, c_prompt, context_label = prompt_table[language_code]

    if df is not None:
        df["context"] = [context_label if x == "Time Gap" else x for x in df["context"]]
        return df, q_prompt, finetune_prompt, c_prompt

    return q_prompt, finetune_prompt, c_prompt

def normalize_answer(s):
    """Lower-case the text, trim it and collapse every whitespace run to one space.

    Punctuation and articles are left untouched.
    """
    pipeline = (
        str.lower,                                   # case-fold
        str.strip,                                   # drop leading/trailing whitespace
        lambda text: ' '.join(text.split()),         # rebuild with single spaces
        lambda text: re.sub(r"\s+", " ", text),      # final whitespace collapse
    )
    normalized = s
    for step in pipeline:
        normalized = step(normalized)
    return normalized

def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of the dataset for building LLM inputs.

    The "question" and "context" columns are cast to pandas' string dtype,
    lower-cased and stripped; "timestamp" is parsed to datetimes with mixed
    formats allowed. The frame passed in is not modified.
    """
    prepared = df.copy()
    text_columns = ('question', 'context')

    # Cast first, then normalize, one column at a time.
    for column in text_columns:
        prepared[column] = prepared[column].astype('string')
    for column in text_columns:
        lowered = prepared[column].str.lower()
        prepared[column] = lowered.str.strip()

    parsed_timestamps = pd.to_datetime(prepared["timestamp"], format="mixed")
    prepared['timestamp'] = parsed_timestamps
    return prepared

In [None]:
# Read the raw dataset and localize its missing-context marker; change_prompts also
# returns the question / fine-tune-prompt / context labels used to build model inputs.
df = pd.read_csv(DATA_PATH)
df, q_prompt, p_prompt, c_prompt = change_prompts(language=DATASET_LANGUAGE, df=df)

# Hold out a validation split. The split seed is the literal 42 here, not the SEED
# value read from config.yaml.
train_df, val_df = train_test_split(df, train_size=TRAIN_SIZE, random_state=42)
print("Training samples:", len(train_df))
print("Validation samples:", len(val_df)) 

# Mistral

### Preparation

In [None]:
# Load the tokenizer and base model for MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
# No dtype or device arguments are passed, so the library defaults apply.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

In [None]:
# Wrap both pandas splits as datasets. `Dataset` is imported twice at the top of the
# notebook (torch.utils.data, then datasets); the later `datasets` import is the one bound here.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

def build_input_output(row):
    """Build the (prompt, answer) text pair for one dataset row.

    row: mapping with "question", "context" and "answer" entries.

    Returns:
        (input_text, output_text): the prompt is made of three labelled sections
        (fine-tune prompt, question, context), each terminated by a newline so that
        it has the same layout as the prompts built for evaluation and generation;
        the output is the ground-truth answer.
    """
    input_text = (
        f"[{p_prompt}]: {MODEL_PROMPT}\n"  # newline keeps the question label on its own line
        f"[{q_prompt}]: {row['question']}\n"
        f"[{c_prompt}]: {row['context']}\n"
    )

    # The label is the ground-truth answer
    output_text = row['answer']

    return input_text, output_text

def preprocess_function(examples):
    """Tokenize a batch of rows into fixed-length causal-LM training examples.

    examples: column-wise batch (as passed by Dataset.map with batched=True) holding
        "context", "question" and "answer" lists.

    For a decoder-only model the answer has to be part of the input sequence, so each
    example is encoded as ``prompt tokens + answer tokens + EOS`` in a single
    ``input_ids`` list. The prompt is truncated to MAX_LENGTH tokens and the answer
    (including EOS) to TARGET_MAX_LENGTH tokens, which guarantees that a long prompt
    can never push the answer out of the sequence. Every example is right-padded to
    the same total length.

    Returns:
        dict with "input_ids", "attention_mask" and "labels", all lists of equal-length
        token lists. Labels repeat the answer tokens and hold -100 on prompt and padding
        positions so only the answer contributes to the loss.

    NOTE(review): a collator that rebuilds labels from input_ids (per the transformers
    docs, DataCollatorForLanguageModeling with mlm=False does) will ignore the prompt
    mask and train on the whole sequence; use a label-preserving collator to train on
    the answer only.
    """
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id
    answer_budget = max(TARGET_MAX_LENGTH - 1, 0)  # one slot is reserved for EOS
    total_length = MAX_LENGTH + answer_budget + 1

    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    rows = zip(examples['context'], examples['question'], examples['answer'])
    for context, question, answer in rows:
        prompt_text, answer_text = build_input_output(
            {'context': context, 'question': question, 'answer': answer}
        )

        prompt_ids = tokenizer(prompt_text, max_length=MAX_LENGTH, truncation=True)["input_ids"]
        # No special tokens here: the answer continues the prompt, it does not start a sequence.
        answer_ids = tokenizer(answer_text, add_special_tokens=False)["input_ids"][:answer_budget]
        answer_ids = answer_ids + [eos_id]

        sequence = prompt_ids + answer_ids
        pad_count = total_length - len(sequence)

        batch["input_ids"].append(sequence + [pad_id] * pad_count)
        batch["attention_mask"].append([1] * len(sequence) + [0] * pad_count)
        batch["labels"].append([-100] * len(prompt_ids) + answer_ids + [-100] * pad_count)

    return batch

# Tokenize both splits in batches and drop every original column so that only the
# fields returned by preprocess_function remain.
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

### Model Initialization

In [None]:
# LoRA adapter settings: rank-8 update matrices, alpha 32, 5% dropout, no bias terms,
# configured for causal language modeling.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters. Re-running this cell wraps `model` again,
# so reload the base model first if the cell has to be executed twice.
model = get_peft_model(model, lora_config)

# Report how many parameters will actually be updated.
all_params = sum(param.numel() for _, param in model.named_parameters())
trainable_params = sum(
    param.numel() for _, param in model.named_parameters() if param.requires_grad
)
print(f"Trainable params: {trainable_params} | All params: {all_params} | Ratio: {trainable_params/all_params:.2%}")

In [None]:
# Training configuration. Batch sizes, learning rate, epochs and step intervals come
# from config.yaml; evaluation and checkpointing are both step-based, which
# load_best_model_at_end relies on to pick the checkpoint with the lowest eval loss.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=8,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,  # None keeps the library default
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=SAVE_TOTAL_LIMIT,
    save_steps=SAVE_STEPS,
    logging_steps=LOGGING_STEPS,
    overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
    bf16=True,  # the missing comma after this argument was a SyntaxError
    report_to="wandb",
)


# Collator for causal language modeling (masked-LM objective disabled).
# NOTE(review): per the transformers docs this collator derives labels from input_ids
# when mlm=False, which would replace any "labels" produced by preprocess_function —
# confirm this is intended.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False # For causal LM tasks, MLM is False
)

# Trainer with early stopping: patience of a single evaluation and an improvement
# threshold of 0.1 on the monitored metric (eval_loss, set in training_args).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=0.1)]
)

### Training

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

# Save the final adapter weights and the tokenizer into OUTPUT_DIR — the same
# directory the Trainer uses as its output_dir for checkpoints.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Model Evaluation & Generation

### Evaluation

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


# TODO: Optimize for auto detection of the folder where model located
# Reload the tokenizer saved next to the adapter, then attach the adapter stored in
# OUTPUT_DIR to a freshly loaded base model.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)

In [None]:
# Sanity check: print the adapter's trainable-parameter summary for the reloaded model.
model.print_trainable_parameters()

In [None]:
import nltk
# Download the Punkt tokenizer data used by nltk.word_tokenize below.
# NOTE(review): newer NLTK releases may look for the "punkt_tab" resource instead —
# confirm for the installed version.
nltk.download('punkt')

In [None]:
from nltk.tokenize import word_tokenize

# Smoke test: tokenize one Ukrainian sentence.
# NOTE(review): confirm that the installed Punkt data ships a "ukrainian" model;
# otherwise word_tokenize is expected to fail with a resource lookup error.
text = "Це приклад тексту українською мовою."
tokens = word_tokenize(text, language="ukrainian")
print(tokens)

In [None]:
import pymorphy2
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from jiwer import wer, cer


# Ukrainian morphological analyzer shared by stem_text.
morph = pymorphy2.MorphAnalyzer(lang='uk')

def stem_text(text: str) -> str:
    """Return `text` with every token replaced by its normal form.

    The text is tokenized with NLTK's word_tokenize; for each token the first
    analysis returned by the pymorphy2 analyzer is used and its `normal_form`
    is kept. Tokens are joined back with single spaces.
    """
    normalized_tokens = []
    for token in word_tokenize(text, language="ukrainian"):
        first_parse = morph.parse(token)[0]
        normalized_tokens.append(first_parse.normal_form)
    return " ".join(normalized_tokens)

# Initialize ROUGE scorer.
# rouge_score's default tokenizer keeps only [a-z0-9] characters, which removes every
# Cyrillic token and drives all ROUGE values to 0 for Ukrainian text. A Unicode-aware
# tokenizer is supplied instead.
class UnicodeWordTokenizer:
    """Minimal rouge_score-compatible tokenizer that keeps non-ASCII words."""

    def tokenize(self, text):
        """Return lower-cased word tokens; apostrophes stay inside words (e.g. п'ять)."""
        return re.findall(r"[\w'’ʼ]+", text.lower())


scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rougeL"],
    use_stemmer=False,
    tokenizer=UnicodeWordTokenizer(),
)

# Compute metrics function
def compute_metrics(y_true: str, y_pred: str):
    """Score a prediction against its reference after normalizing both with stem_text.

    Returns a dict with sentence-level BLEU, ROUGE-1 and ROUGE-L precision,
    word and character error rates, and the Jaccard similarity of the two
    token sets.
    """
    reference = stem_text(y_true)
    hypothesis = stem_text(y_pred)
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()

    # Same evaluation order as the listing in the result dict.
    bleu = sentence_bleu([reference_tokens], hypothesis_tokens)
    rouge = scorer.score(reference, hypothesis)
    wer_score = wer(reference, hypothesis)
    cer_score = cer(reference, hypothesis)

    # Jaccard = |intersection| / |union| over unique tokens.
    reference_vocab = set(reference_tokens)
    hypothesis_vocab = set(hypothesis_tokens)
    shared = reference_vocab.intersection(hypothesis_vocab)
    combined = reference_vocab.union(hypothesis_vocab)
    jaccard = len(shared) / len(combined)

    results = {}
    results["BLEU"] = bleu
    results["ROUGE-1 Precision"] = rouge["rouge1"].precision
    results["ROUGE-L Precision"] = rouge["rougeL"].precision
    results["WER"] = wer_score
    results["CER"] = cer_score
    results["Jaccard"] = jaccard
    return results

# Example usage: score one hand-written reference/prediction pair and print the
# resulting metric dict as a quick sanity check.
y_true = "Це приклад тестового речення"
y_pred = "Це тестове речення для перевірки"
metrics = compute_metrics(y_true, y_pred)

print(metrics)

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from torch.cuda.amp import autocast
from nltk.translate.meteor_score import meteor_score
from jiwer import wer as compute_wer, cer as compute_cer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score
from nltk.translate.bleu_score import sentence_bleu
import pymorphy2


def prepare_evaluation_inputs(row) -> tuple:
    """Build the evaluation prompt and the reference answer for one dataset row.

    row: a mapping/Series with "question", "context" and "answer" entries. A
        one-row DataFrame (e.g. the result of ``df.sample(n=1)``) is also accepted
        and reduced to its single row, so that scalar values — not Series reprs —
        are interpolated into the prompt.

    Returns:
        (inputs, labels): the prompt text and the ground-truth answer.

    Raises:
        ValueError: if a DataFrame with a number of rows other than one is passed.
    """
    if isinstance(row, pd.DataFrame):
        if len(row) != 1:
            raise ValueError(f"Expected exactly one row, got {len(row)}")
        row = row.iloc[0]

    inputs = (
        f"[{p_prompt}]: {MODEL_PROMPT}\n"
        f"[{q_prompt}]: {row['question']}\n"
        f"[{c_prompt}]: {row['context']}\n"
    )

    labels = row["answer"]

    return inputs, labels


# rouge_score's default tokenizer keeps only [a-z0-9] characters (and its optional
# stemmer is the English Porter stemmer), so Cyrillic text would always score 0.
# A Unicode-aware tokenizer is supplied instead.
class UnicodeWordTokenizer:
    """Minimal rouge_score-compatible tokenizer that keeps non-ASCII words."""

    def tokenize(self, text):
        """Return lower-cased word tokens; apostrophes stay inside words (e.g. п'ять)."""
        return re.findall(r"[\w'’ʼ]+", text.lower())


scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rougeL"],
    use_stemmer=False,
    tokenizer=UnicodeWordTokenizer(),
)

# Ukrainian morphological analyzer shared by stem_text.
morph = pymorphy2.MorphAnalyzer(lang='uk')

def stem_text(text: str) -> str:
    """Return `text` with every token replaced by its normal form.

    Tokens come from NLTK's word_tokenize; each one is mapped to the
    `normal_form` of the first analysis returned by the pymorphy2 analyzer,
    and the results are joined with single spaces.
    """
    tokens = word_tokenize(text, language="ukrainian")
    return " ".join(map(lambda token: morph.parse(token)[0].normal_form, tokens))


def compute_metrics(y_true: str, y_pred: str):
    """Score one prediction against its reference answer.

    y_true: reference text.
    y_pred: generated text.

    Returns:
        Tuple ``(bleu, rouge1_precision, rougeL_precision, meteor, wer, cer, jaccard)``.

    Tokens for BLEU, METEOR and Jaccard are obtained by whitespace splitting.
    """
    reference_tokens = y_true.split()
    hypothesis_tokens = y_pred.split()

    # Compute BLEU
    bleu = sentence_bleu([reference_tokens], hypothesis_tokens)

    # Compute ROUGE
    rouge = scorer.score(y_true, y_pred)

    # Compute METEOR. NLTK expects pre-tokenized references and hypothesis; passing
    # raw strings is rejected by current releases.
    meteor = meteor_score([reference_tokens], hypothesis_tokens)

    # Compute Word Error Rate (WER)
    wer = compute_wer(y_true, y_pred)

    # Compute Character Error Rate (CER)
    cer = compute_cer(y_true, y_pred)

    # Compute Jaccard similarity as |intersection| / |union| of the lower-cased token
    # sets. The previous vectorizer-based version was fitted on the reference only, so
    # extra words in the prediction were never counted and the value was in fact recall.
    reference_vocab = {token.lower() for token in reference_tokens}
    hypothesis_vocab = {token.lower() for token in hypothesis_tokens}
    union = reference_vocab | hypothesis_vocab
    jaccard = len(reference_vocab & hypothesis_vocab) / len(union) if union else 0

    return bleu, rouge["rouge1"].precision, rouge["rougeL"].precision, meteor, wer, cer, jaccard


def generate_response(inputs):
    """Generate the model's answer for one prompt and return only the new text.

    inputs: prompt string.

    Returns:
        The decoded continuation, without the prompt. model.generate returns the
        prompt tokens followed by the generated ones for a decoder-only model, so
        the prompt is sliced off before decoding; otherwise every metric would
        compare the reference answer against "prompt + answer".
    """
    encoded = tokenizer(inputs, return_tensors="pt").to(model.device)
    prompt_length = encoded["input_ids"].shape[1]

    # torch.autocast takes the device type explicitly; mixed precision is enabled
    # only when CUDA is actually available.
    with torch.no_grad(), torch.autocast(device_type="cuda", enabled=torch.cuda.is_available()):
        generated = model.generate(
            **encoded,
            max_new_tokens=TARGET_MAX_LENGTH,
            early_stopping=True,
            do_sample=False,
            penalty_alpha=0.6,
            temperature=0.6,
            top_k=10, # Default: 50
            top_p=0.8,
            repetition_penalty=0.9,
            low_memory=True # ? Good
        )[0]

    return tokenizer.decode(generated[prompt_length:], skip_special_tokens=True)


# ----------------------MAIN FUNCTION-------------------------------------------
def evaluate(df: pd.DataFrame, samples: int = 10, verbose: bool = False):
    """Score the model on randomly drawn rows of `df`.

    df: frame with "question", "context" and "answer" columns.
    samples: number of rows to draw (one independent draw per iteration, so a row
        can be picked more than once).
    verbose: when True, print the prediction, the reference and the running mean of
        every metric after each sample.

    Returns:
        dict mapping each metric name to the list of per-sample scores, with
        `samples` entries per metric.
    """
    metric_names = ("bleu", "rouge1", "rougeL", "meteor", "wer", "cer", "jaccard")
    metrics_dict = {name: [] for name in metric_names}

    for i in tqdm(range(samples)):
        # .iloc[0] turns the one-row frame into a Series so that scalar values reach
        # the prompt builder and the metrics.
        random_row = df.sample(n=1).iloc[0]
        inputs, y_true = prepare_evaluation_inputs(random_row)

        y_pred = generate_response(inputs)

        # compute_metrics returns its scores in the same order as metric_names.
        scores = compute_metrics(y_true, y_pred)
        for name, value in zip(metric_names, scores):
            metrics_dict[name].append(value)

        if verbose:
            print("-" * 30)
            print(f"({i}) Predicted: {y_pred}")
            print(f"({i}) True: {y_true}")
            print(f"{i}: {[round(np.mean(metric), 2) for metric in metrics_dict.values()]}")
            print("-" * 30)

    # Return after the loop: returning inside it stopped after the first sample.
    return metrics_dict
# ----------------------MAIN FUNCTION-------------------------------------------

In [None]:
# Reload the dataset and rebuild the same split for evaluation; this repeats the
# loading cell near the top of the notebook, with the same literal seed 42.
df = pd.read_csv(DATA_PATH)
df, q_prompt, p_prompt, c_prompt = change_prompts(language=DATASET_LANGUAGE, df=df)

train_df, val_df = train_test_split(df, train_size=TRAIN_SIZE, random_state=42)
print("Training samples:", len(train_df))
print("Validation samples:", len(val_df)) 

In [None]:
# Score the fine-tuned model on the validation split with evaluate's default settings.
results = evaluate(df=val_df)

### Generation

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


# TODO: Optimize for auto detection of the folder where model located
# Same loading steps as in the evaluation section: tokenizer from OUTPUT_DIR, base
# model from MODEL_NAME, then the adapter stored in OUTPUT_DIR on top of it.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)

In [None]:
# Sanity check: print the adapter's trainable-parameter summary for the reloaded model.
model.print_trainable_parameters()

In [None]:
def build_inputs(question: str, context=None, prompt=None) -> str:
    """Build a generation prompt in the same three-section layout used for evaluation.

    question: the user's question.
    context: optional context text; an empty context section is emitted when omitted.
    prompt: optional instruction text; falls back to MODEL_PROMPT from the
        configuration when omitted.

    Returns:
        The prompt string. Omitted arguments no longer appear as the literal
        text "None" inside the prompt.
    """
    prompt_text = MODEL_PROMPT if prompt is None else prompt
    context_text = "" if context is None else context

    inputs = (
        f"[{p_prompt}]: {prompt_text}\n"
        f"[{q_prompt}]: {question}\n"
        f"[{c_prompt}]: {context_text}\n"
    )

    return inputs

def generate_response(prompt):
    """Run generation for one prompt string and return the decoded first sequence.

    The returned text is decoded from the full sequence produced by
    model.generate, with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = dict(
        max_new_tokens=TARGET_MAX_LENGTH,
        early_stopping=True,
        do_sample=False,
        penalty_alpha=0.6,
        temperature=0.6,
        top_k=10,  # Default: 50
        top_p=0.8,
        repetition_penalty=0.9,
        low_memory=True,  # ? Good
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Demo: wrap a sample question in the prompt layout and print the model's response.
test_prompt = "Доброго дня! У мене є запитання..."

inputs = build_inputs(test_prompt)
# generate_response requires the prompt; calling it with no argument raised a TypeError.
print(generate_response(inputs))

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the original Mistral instruct checkpoint. This rebinds `tokenizer` and `model`,
# replacing the fine-tuned ones loaded in the previous section.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

In [None]:
# Quick baseline: generate a reply from the base checkpoint on the CPU.
inputs = "Hello! How are you?"

device = "cpu"

model.to(device)

# The prompt is passed as a one-element batch.
model_inputs = tokenizer([inputs], return_tensors="pt").to(device)

# Sampling (temperature 0.7, top-k 50) is combined with 3 beams here.
generated_ids = model.generate(**model_inputs,
                              max_new_tokens=TARGET_MAX_LENGTH,
                                do_sample=True,
                                temperature=0.7, 
                                top_k=50, 
                                num_beams=3,
                                use_cache=True)
# Last expression of the cell: the decoded text of the first returned sequence.
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
# Help ?