In [1]:
import os
import time
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt

from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType
from peft import PeftModel

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

import evaluate

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
my_device = "cuda" if torch.cuda.is_available() else "cpu"
print("My Device: {}".format(my_device))

My Device: cuda


In [None]:
# define the evaluation type
is_fine_tune_evaluation = True # If its False, then the script uses the base model only
peft_model_path = 'tuned_models_SmolLM2/bank_qa_base_tune_peft-17B-2025_02_25_23_37_1e-3_warmup50/checkpoint-1440'

In [5]:
# initialize the model
model_name = "HuggingFaceTB/SmolLM2-1.7B" # HuggingFaceTB/SmolLM2-135M, HuggingFaceTB/SmolLM2-360M, HuggingFaceTB/SmolLM2-1.7B, HuggingFaceTB/SmolLM2-1.7B-Instruct
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(my_device)

if is_fine_tune_evaluation:
    # initialize the peft model
    print("PEFT Model is loading")
    peft_model = PeftModel.from_pretrained(model, peft_model_path).to(my_device)
    peft_model.eval();

PEFT Model is loading


In [6]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

In [7]:
def generate_chat_template(query_text, my_tokenizer, my_device, instruct_model=False):
    # create a system message

    if instruct_model:
        messages = [{"role": "user", "content": query_text}]
        input_text = my_tokenizer.apply_chat_template(messages, tokenize=False)
        inputs = my_tokenizer.encode(input_text, return_tensors="pt").to(my_device)
    else:
        inputs = my_tokenizer.encode(query_text, return_tensors="pt").to(my_device)

    return inputs

def generate_output(my_inputs, my_tokenizer, my_model, max_tokens = 50, temp = 0.3, top_p = 0.9, top_k=50, penalty_score=1.2, do_sample = True, instruct_model=False):

    if instruct_model:
        outputs = my_model.generate(my_inputs, max_new_tokens=max_tokens, temperature=temp, top_p=top_p, top_k=top_k, repetition_penalty=penalty_score, do_sample=do_sample)
        output_text = my_tokenizer.decode(outputs[0], skip_special_tokens=True)
        cleaned_output_text = output_text.split("<|im_start|>assistant")[1].split("<|im_end|>")[0].strip()
    else:
         outputs = my_model.generate(my_inputs, max_new_tokens=max_tokens, temperature=temp, top_p=top_p, top_k=top_k, repetition_penalty=penalty_score, do_sample=do_sample,
                                     eos_token_id=my_tokenizer.eos_token_id)
         cleaned_output_text = my_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return cleaned_output_text

In [8]:
def tune_generate_chat_template(query_text, my_tokenizer, my_device, instruct_model=False):
    """
    Formats the query based on the instruction tuning prompt template.
    """

    # Apply the updated instruction-tuned prompt template
    formatted_prompt = f"""
    ### Instruction:
    You are an AI banking assistant. Respond to the customer's request in a clear and professional manner.

    ### Customer Request:
    {query_text}

    ### Response:
    """

    if instruct_model:
        messages = [{"role": "user", "content": formatted_prompt}]
        input_text = my_tokenizer.apply_chat_template(messages, tokenize=False)
        inputs = my_tokenizer.encode(input_text, return_tensors="pt").to(my_device)
    else:
        inputs = my_tokenizer.encode_plus(formatted_prompt, return_tensors="pt").to(my_device)

    return inputs


def tune_generate_output(my_inputs, my_tokenizer, my_model, max_tokens=50, temp=0.3, top_p=0.9, top_k=50, penalty_score=1.2, do_sample=True, instruct_model=False):
    """
    Generates a response from the model based on the input.
    """

    outputs = my_model.generate(
                
        input_ids=my_inputs["input_ids"],  # Pass input_ids
        attention_mask=my_inputs["attention_mask"],  # Pass attention mask
        max_new_tokens=max_tokens,
        temperature=temp,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=penalty_score,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=my_tokenizer.eos_token_id
    )

    # Decode output and clean it up
    output_text = my_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Ensure safe parsing without hardcoded token removal
    cleaned_output_text = output_text.strip()

    return cleaned_output_text

In [9]:
## DATASET ##
huggingface_dataset_name = "bitext/Bitext-retail-banking-llm-chatbot-training-dataset"

ds = load_dataset(huggingface_dataset_name, split='train')

# Split the dataset into train and test
train_test_split = ds.train_test_split(test_size=0.05, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

train_validation_split = train_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = train_validation_split['train']
validation_dataset = train_validation_split['test']

print(train_dataset, validation_dataset, test_dataset)

Dataset({
    features: ['tags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 23053
}) Dataset({
    features: ['tags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 1214
}) Dataset({
    features: ['tags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 1278
})


In [10]:
# Batch Size
batch_size = 128  

# PyTorch Dataset Wrapper (Fixes "list object has no attribute 'to'")
class BankingDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.dataset[idx]["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(self.dataset[idx]["attention_mask"], dtype=torch.long),
            "labels": self.dataset[idx].get("response", "N/A")
        }

# Tokenization with padding
def tokenize_function(examples):
    formatted_prompts = [
        f"""
        ### Instruction:
        You are an AI banking assistant. Respond to the customer's request in a clear and professional manner.

        ### Customer Request:
        {instruction}

        ### Response:
        """
        for instruction in zip(examples["instruction"])
    ]

    tokenized_outputs = tokenizer(
        formatted_prompts,
        padding="longest",  # Dynamic padding
        truncation=True,
        max_length=512, 
        return_tensors="pt"
    )

    return tokenized_outputs

# Tokenize test dataset
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(['tags', 'instruction', 'category', 'intent'])

# Convert to PyTorch Dataset
test_dataset_torch = BankingDataset(tokenized_test_dataset)

# DataLoader (batch işlemi için)
test_dataloader = DataLoader(test_dataset_torch, batch_size=batch_size, shuffle=False)

Map:   0%|          | 0/1278 [00:00<?, ? examples/s]

In [11]:
sample_idx = 100

sample_query = test_dataset['instruction'][sample_idx]
sample_response = test_dataset['response'][sample_idx]

if is_fine_tune_evaluation:
    print("PEFT Model Evaluation")
    sample_inputs = tune_generate_chat_template(sample_query, tokenizer, my_device)
    print("-"*50)
    #print(sample_inputs)

    # peft generation
    sample_output = tune_generate_output(sample_inputs, tokenizer, peft_model, max_tokens=300, temp = 0.7, top_p = 0.6, top_k=50, penalty_score=1.2,
                                        do_sample = True, instruct_model=False)
    sample_output_wout_prompt = sample_output.split("Response:")[1].strip()


    print("="*10)
    print(sample_query)
    print("="*10)
    print(sample_response)
    print("="*10)
    print(sample_output)
    print("="*10)
    print(sample_output_wout_prompt)

else:
    print("Base Model Evaluation")
    print("-"*50)
    """
    sample_inputs = tokenizer.encode("Answer: {}".format(sample_query), return_tensors="pt").to(my_device)
    sample_output = model.generate(sample_inputs, max_new_tokens=200, temperature=0.7, top_p=0.6, top_k=50, repetition_penalty=1.2, do_sample=True)
    sample_output = tokenizer.decode(sample_output[0])
    """

    sample_inputs = tune_generate_chat_template(sample_query, tokenizer, my_device)

    # base model generation
    sample_output = tune_generate_output(sample_inputs, tokenizer, model, max_tokens=200, temp = 0.7, top_p = 0.6, top_k=50, penalty_score=1.2,
                                        do_sample = True, instruct_model=False)
    sample_output_wout_prompt = sample_output.split("Response:")[1].strip()

    print("="*10)
    print(sample_query)
    print("="*10)
    print(sample_output)
    print("="*10)
    print(sample_output_wout_prompt)

PEFT Model Evaluation
--------------------------------------------------
I need assistyance finding an ATM
I'm here to help you find the nearest ATM in your area. To locate one, you have a few options:

1. You can use a reliable mapping application like Google Maps or Apple Maps on your smartphone. Simply open the app and search for "ATMs near me" or "ATMs in [your location]." It will show you a list of nearby ATMs along with their distance and directions.

2. Another option is to visit your bank's website. Most banks have an ATM locator feature on their website where you can enter your location or use your current location to find the nearest ATM belonging to your bank.

3. If you're out and about and don't have access to a smartphone or computer, you can ask locals or nearby businesses for directions to the nearest ATM. They should be able to guide you to the closest one.

Remember to prioritize your safety when searching for an ATM. It's a good practice to use ATMs in well-lit, publ

## Evaluation

In [12]:
# Batched inference
generated_responses = []
human_responses = []

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids = batch["input_ids"].to(my_device)
        attention_mask = batch["attention_mask"].to(my_device)

        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):  # Mixed precision
            if is_fine_tune_evaluation:
                outputs = peft_model.generate(
                input_ids, 
                attention_mask=attention_mask,
                max_new_tokens=200, 
                temperature=0.7, 
                top_p=0.6, 
                top_k=50, 
                repetition_penalty=1.2,
                do_sample=True
                )
            else:
                outputs = model.generate(
                    input_ids, 
                    attention_mask=attention_mask,
                    max_new_tokens=200, 
                    temperature=0.7, 
                    top_p=0.6, 
                    top_k=50, 
                    repetition_penalty=1.2,
                    do_sample=True
                )

        batch_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Response parsing
        batch_cleaned_outputs = [out.split("Response:")[1].strip() if "Response:" in out else out for out in batch_outputs]

        human_responses.extend(batch["labels"])  # Ground truth
        generated_responses.extend(batch_cleaned_outputs)
    
# Create DataFrame
if is_fine_tune_evaluation:
    df = pd.DataFrame(list(zip(human_responses, generated_responses)), 
                  columns=['human_baseline_responses', 'peft_model_responses'])
else:
    df = pd.DataFrame(list(zip(human_responses, generated_responses)), 
                  columns=['human_baseline_responses', 'base_model_responses'])

100%|██████████| 10/10 [48:11<00:00, 289.11s/it]


In [13]:
df.head(2)

Unnamed: 0,human_baseline_responses,peft_model_responses
0,I'm here to help you recover your swallowed ca...,Sure thing! I'm here to assist you with recove...
1,I'm here to assist you with checking your mort...,I'm here to assist you with checking your mort...


In [14]:
if is_fine_tune_evaluation:
    df.to_csv(peft_model_path.replace('/','__')+".csv", index=False, encoding="utf-8")
else:
    df.to_csv(model_name.replace('/','__')+".csv", index=False, encoding="utf-8")

In [None]:
rouge = evaluate.load('rouge')

evaluation_result = rouge.compute(
    predictions=generated_responses,
    references=human_responses,
    use_aggregator=True,
    use_stemmer=True,
)


if is_fine_tune_evaluation:
    print('PEFT MODEL:')
    print(evaluation_result)
else:
    print('ORIGINAL MODEL:')
    print(evaluation_result)

Using the latest cached version of the module from C:\Users\citak\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--rouge\b01e0accf3bd6dd24839b769a5fda24e14995071570870922c71970b3a6ed886 (last modified on Sat Aug 31 02:06:36 2024) since it couldn't be found locally at evaluate-metric--rouge, or remotely on the Hugging Face Hub.


PEFT MODEL:
{'rouge1': 0.4926546255851052, 'rouge2': 0.19099911354814342, 'rougeL': 0.2832873698706914, 'rougeLsum': 0.4243427394896141}
