In [1]:
import os
import time
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt

from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import GenerationConfig, TrainingArguments, Trainer

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


In [4]:
my_device = "cuda" if torch.cuda.is_available() else "cpu"
print("My Device: {}".format(my_device))

My Device: cuda


In [5]:
# initialize the model
model_name = "HuggingFaceTB/SmolLM2-360M" # HuggingFaceTB/SmolLM2-135M, HuggingFaceTB/SmolLM2-360M, HuggingFaceTB/SmolLM2-1.7B, HuggingFaceTB/SmolLM2-1.7B-Instruct
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(my_device)

In [6]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [7]:
def generate_chat_template(query_text, my_tokenizer, my_device, instruct_model=False):
    # create a system message

    if instruct_model:
        messages = [{"role": "user", "content": query_text}]
        input_text = my_tokenizer.apply_chat_template(messages, tokenize=False)
        inputs = my_tokenizer.encode(input_text, return_tensors="pt").to(my_device)
    else:
        inputs = my_tokenizer.encode(query_text, return_tensors="pt").to(my_device)

    return inputs

def generate_output(my_inputs, my_tokenizer, my_model, max_tokens = 50, temp = 0.3, top_p = 0.9, top_k=50, penalty_score=1.2, do_sample = True, instruct_model=False):

    if instruct_model:
        outputs = my_model.generate(my_inputs, max_new_tokens=max_tokens, temperature=temp, top_p=top_p, top_k=top_k, repetition_penalty=penalty_score, do_sample=do_sample)
        output_text = my_tokenizer.decode(outputs[0], skip_special_tokens=True)
        cleaned_output_text = output_text.split("<|im_start|>assistant")[1].split("<|im_end|>")[0].strip()
    else:
         outputs = my_model.generate(my_inputs, max_new_tokens=max_tokens, temperature=temp, top_p=top_p, top_k=top_k, repetition_penalty=penalty_score, do_sample=do_sample,
                                     eos_token_id=my_tokenizer.eos_token_id)
         cleaned_output_text = my_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return cleaned_output_text

In [None]:
def tune_generate_chat_template(query_text, my_tokenizer, my_device, instruct_model=False):
    """
    Formats the query based on the instruction tuning prompt template.
    """

    # Apply the updated instruction-tuned prompt template
    formatted_prompt = f"""
    ### Instruction:
    You are an AI banking assistant. Respond to the customer's request in a clear and professional manner.

    ### Customer Request:
    {query_text}

    ### Response:
    """

    if instruct_model:
        messages = [{"role": "user", "content": formatted_prompt}]
        input_text = my_tokenizer.apply_chat_template(messages, tokenize=False)
        inputs = my_tokenizer.encode(input_text, return_tensors="pt").to(my_device)
    else:
        inputs = my_tokenizer.encode(formatted_prompt, return_tensors="pt").to(my_device)

    return inputs


def tune_generate_output(my_inputs, my_tokenizer, my_model, max_tokens=50, temp=0.3, top_p=0.9, top_k=50, penalty_score=1.2, do_sample=True, instruct_model=False):
    """
    Generates a response from the model based on the input.
    """

    outputs = my_model.generate(
        my_inputs,
        max_new_tokens=max_tokens,
        temperature=temp,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=penalty_score,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=my_tokenizer.eos_token_id
    )

    # Decode output and clean it up
    output_text = my_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Ensure safe parsing without hardcoded token removal
    cleaned_output_text = output_text.strip()

    return cleaned_output_text

In [8]:
sample_query = "Question: What is the capital of Germany?"
sample_inputs = generate_chat_template(sample_query, tokenizer, my_device)
sample_output = generate_output(sample_inputs, tokenizer, model, max_tokens=20, temp = 0.6, top_p = 0.6, top_k=50, penalty_score=1.2, do_sample = True, instruct_model=False)

print(sample_query)
print("="*10)
print(sample_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Question: What is the capital of Germany?
Question: What is the capital of Germany?
Answer: Berlin.

2016-08-31 09:


In [10]:
## DATASET ##
huggingface_dataset_name = "bitext/Bitext-retail-banking-llm-chatbot-training-dataset"

ds = load_dataset(huggingface_dataset_name, split='train')

# Split the dataset into train and test
train_test_split = ds.train_test_split(test_size=0.05, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

train_validation_split = train_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = train_validation_split['train']
validation_dataset = train_validation_split['test']

print(ds)

Dataset({
    features: ['tags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 25545
})


In [11]:
sample_idx = 10

sample_query = "Question: " + train_dataset['instruction'][sample_idx]
sample_response = train_dataset['response'][sample_idx]
sample_inputs = generate_chat_template(sample_query, tokenizer, my_device)
sample_output = generate_output(sample_inputs, tokenizer, model, max_tokens=50, temp = 0.7, top_p = 0.6, top_k=50, penalty_score=1.2, do_sample = True, instruct_model=False)

print("="*10)
print(sample_query)
print("="*10)
print(sample_response)
print("="*10)
print(sample_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Question: help me disputing a withdrawal
I'm sorry to hear that you need assistance with disputing a withdrawal. I'll do my best to help you with that. Here's what you can do:

1. Check your account statement: Take a look at your recent transactions to gather all the necessary information about the withdrawal, such as the date, time, and location.

2. Contact your bank: Reach out to your bank's customer service department through their helpline or visit the nearest branch. Provide them with the details of the withdrawal and explain why you believe there is an issue. They will guide you further in the dispute process and may require additional information or documentation.

3. File a dispute claim: Your bank will assist you in filing a formal dispute claim. They will investigate the transaction and work towards a resolution. Make sure to provide them with any evidence or supporting documents related to the disputed withdrawal.

4. Monitor your account: Until the dispute is resolved, kee

In [12]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 361821120
all model parameters: 361821120
percentage of trainable model parameters: 100.00%


In [13]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm

In [13]:
## LORA ##
lora_config = LoraConfig(
    r=4, # Rank
    lora_alpha=8,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], #"gate_proj", "up_proj", "down_proj"
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

peft_model = get_peft_model(model, lora_config).to(my_device)

print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 1572864
all model parameters: 1712949248
percentage of trainable model parameters: 0.09%


In [14]:
## TRY DIFFERENT PROMPTS ##
sample_idx = 100

sample_query = train_dataset['instruction'][sample_idx]


prompt = f"""
### Instruction:
You are an AI banking assistant. Respond to the customer's request in a clear and professional manner.

### Customer Request:
{sample_query}

### Response:
"""

inputs = tokenizer.encode(prompt, return_tensors="pt").to(my_device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.6, top_p=0.7, top_k=50, repetition_penalty=1.2, do_sample=True, eos_token_id=tokenizer.eos_token_id)
sample_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

sample_response = train_dataset['response'][sample_idx]

print("="*10)
print(sample_query)
print("="*10)
print(sample_response)
print("="*10)
print(sample_output)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


where do I see the card annul fee?
I'll do my best! I'm here to help you with that. To view the annual fee for your card, you can follow these steps:

1. Log in to your online banking or mobile banking app.
2. Navigate to the "Accounts" or "Credit Cards" section.
3. Look for your specific credit card and click on it.
4. You should find the annual fee listed in the card details or account information.

If you're having trouble locating the annual fee or need further assistance, please don't hesitate to let me know.

### Instruction:
You are an AI banking assistant. Respond to the customer's request in a clear and professional manner.

### Customer Request:
where do I see the card annul fee?

### Response:
Thank you for your inquiry! The annual charge on credit cards is typically between $0-$15, but it can vary depending on factors such as interest rates or rewards programs offered by the bank. It would be best if you could check with your


In [16]:
sample_idx = 100

sample_query = train_dataset['instruction'][sample_idx]
sample_response = train_dataset['response'][sample_idx]

sample_inputs = tune_generate_chat_template(sample_query, tokenizer, my_device)
sample_output = tune_generate_output(sample_inputs, tokenizer, model, max_tokens=100, temp = 0.7, top_p = 0.6, top_k=50, penalty_score=1.2, do_sample = True, instruct_model=False)


print("="*10)
print(sample_query)
print("="*10)
print(sample_response)
print("="*10)
print(sample_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


where do I see the card annul fee?
I'll do my best! I'm here to help you with that. To view the annual fee for your card, you can follow these steps:

1. Log in to your online banking or mobile banking app.
2. Navigate to the "Accounts" or "Credit Cards" section.
3. Look for your specific credit card and click on it.
4. You should find the annual fee listed in the card details or account information.

If you're having trouble locating the annual fee or need further assistance, please don't hesitate to let me know.
### Instruction:
    You are an AI banking assistant. Respond to the customer's request in a clear and professional manner.

    ### Customer Request:
    where do I see the card annul fee?

    ### Response:
    10% of your credit limit is charged as card annual fee, if you have not used any balance on this account for last 6 months then there will be no charge at all.
"""


In [17]:
def tokenize_function(examples):
    """
    Tokenizes the dataset using the fine-tuning instruction prompt format.
    Handles batched processing by iterating over each example.
    """

    # Construct the full prompt for each example in the batch
    formatted_prompts = [
        f"""
        ### Instruction:
        You are an AI banking assistant. Respond to the customer's request in a clear and professional manner.

        ### Customer Request:
        {instruction}

        ### Response:
        {response}
        """
        for instruction, response in zip(examples["instruction"], examples["response"])
    ]

    # Tokenize each formatted prompt
    tokenized_outputs = tokenizer(
        formatted_prompts,
        padding="max_length",
        truncation=True,
        max_length=512,  # Adjust as needed
    )

    # Return tokenized inputs
    return {
        "input_ids": tokenized_outputs["input_ids"],
        "attention_mask": tokenized_outputs["attention_mask"],
        "labels": tokenized_outputs["input_ids"],  # Causal LM: labels = input_ids
    }

# Tokenizing and processing all dataset splits
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['tags', 'instruction', 'category', 'intent', 'response'])

tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(['tags', 'instruction', 'category', 'intent', 'response'])

tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True)
tokenized_validation_dataset = tokenized_validation_dataset.remove_columns(['tags', 'instruction', 'category', 'intent', 'response'])


Map: 100%|██████████| 23053/23053 [00:07<00:00, 3150.32 examples/s]
Map: 100%|██████████| 1278/1278 [00:00<00:00, 3167.92 examples/s]
Map: 100%|██████████| 1214/1214 [00:00<00:00, 3327.65 examples/s]


In [18]:
## now merge the splits into a single dataset format
tokenized_datasets = DatasetDict({
    'train': tokenized_train_dataset,
    'validation': tokenized_validation_dataset,
    'test': tokenized_test_dataset
})

print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (23053, 3)
Validation: (1214, 3)
Test: (1278, 3)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23053
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1214
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1278
    })
})


In [19]:
#### PEFT Training ####
output_dir = f'tuned_models_SmolLM2/bank_qa_base_tune_peft-17B-{str(time.strftime("%Y_%m_%d_%H_%M"))}'

## DO IT FALSE, IF THERE ARE ANY VALUABLE MODELS!!!!
if False:
  shutil.rmtree('tuned_models_SmolLM2')

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

step_size = 100 # we have 11k in total
lr = 1e-3
n_epochs = 2
batch_size = 8
gradient_acc = 4
warmup_steps = 50 # previously:0

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=False,
    learning_rate=lr,
    num_train_epochs=n_epochs,
    logging_steps=step_size,  # Logs training loss every x.th steps
    save_steps=step_size,
    warmup_steps=warmup_steps,
    logging_strategy="steps",  # Ensures logs are printed every x.th steps
    evaluation_strategy="steps",  # Runs validation after each epoch
    save_strategy="steps",  # Saves model checkpoints after each epoch
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_acc,
    no_cuda=not torch.cuda.is_available(),
    report_to="none"  # Disable WandB
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets['validation']
)

peft_trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss
100,1.9986,0.357347
200,0.319,0.305864
300,0.2907,0.286572
400,0.277,0.276604
500,0.2683,0.269064
600,0.2629,0.263973
700,0.2558,0.25923
800,0.2492,0.256608
900,0.2489,0.253669
1000,0.2458,0.251624


TrainOutput(global_step=1440, training_loss=0.38027189373970033, metrics={'train_runtime': 2465.9386, 'train_samples_per_second': 18.697, 'train_steps_per_second': 0.584, 'total_flos': 2.2823158784458752e+17, 'train_loss': 0.38027189373970033, 'epoch': 1.9989590562109645})

In [14]:
#### Load PEFT Model and run for a sentence ####
from peft import PeftModel

In [None]:
sample_idx = 0

sample_query = test_dataset['instruction'][sample_idx]
sample_response = test_dataset['response'][sample_idx]

sample_inputs = tune_generate_chat_template(sample_query, tokenizer, my_device)
sample_output = tune_generate_output(sample_inputs, tokenizer, peft_model, max_tokens=200, temp = 0.7, top_p = 0.6, top_k=50, penalty_score=1.2,
                                     do_sample = True, instruct_model=False)


print("="*10)
print(sample_query)
print("="*10)
print(sample_response)
print("="*10)
print(sample_output)

In [None]:
peft_model_path = 'tuned_models_SmolLM2/bank_qa_base_tune_peft-2025_02_25_22_48_1e-3_expanded_warmup50/checkpoint-1440/'
peft_model = PeftModel.from_pretrained(model, peft_model_path).to(my_device)
peft_model.eval();

In [23]:
sample_idx = 0

sample_query = test_dataset['instruction'][sample_idx]
sample_response = test_dataset['response'][sample_idx]

sample_inputs = tune_generate_chat_template(sample_query, tokenizer, my_device)
sample_output = tune_generate_output(sample_inputs, tokenizer, peft_model, max_tokens=200, temp = 0.7, top_p = 0.6, top_k=50, penalty_score=1.2,
                                     do_sample = True, instruct_model=False)


print("="*10)
print(sample_query)
print("="*10)
print(sample_response)
print("="*10)
print(sample_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


I want to recover a swallowed card by an ATM, will you help me?
I'm here to help you recover your swallowed card from the ATM. I understand how frustrating and worrisome it can be when your card gets stuck in the machine. Here's what you can do to retrieve your card:

1. Remain calm and don't panic. This situation is not uncommon, and there are steps you can take to resolve it.
2. Contact your bank or credit card issuer immediately. They will be able to assist you further and guide you through the process of recovering your card.
3. Provide them with all the necessary details such as the location of the ATM, the date and time of the incident, and any other relevant information they may require.
4. Your bank will initiate the retrieval process by contacting the ATM service provider. They will work together to safely retrieve your card and return it to you.
5. Keep in mind that the process may take some time, depending on various factors, such as the availability of technicians and the s