In [None]:
# Install required libraries for fine-tuning a language model
# transformers: Provides pre-trained models, tokenizers, and training utilities
# datasets: Offers easy access to various datasets
# peft: Enables Parameter-Efficient Fine-Tuning (LoRA in this case)
# trl: Provides tools for Reinforcement Learning from Human Feedback (RLHF) and other training methods
# bitsandbytes: Used for 8-bit and 4-bit quantization to reduce memory usage
# accelerate: Helps in easily running training scripts on different hardware setups
!pip install transformers datasets peft trl bitsandbytes accelerate --upgrade

Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.17.0-py3-none-any.whl.metadata (14 kB)
Collecting trl
  Downloading trl-0.20.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia

In [None]:
import os
# Disable Weights & Biases logging to avoid clutter or if not needed
os.environ["WANDB_DISABLED"] = "true"

import torch
# Import necessary classes from the transformers, peft, datasets, and trl libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer

# [1] MODEL NAME: Hugging Face Hub id of the checkpoint that will be fine-tuned.
model_name = "microsoft/phi-2"

# [2] QUANTIZATION CONFIGURATION (saves VRAM):
# The base weights are loaded in 4-bit through bitsandbytes so the model needs
# much less GPU memory, leaving room for activations and the LoRA parameters.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store the weights in 4-bit
    bnb_4bit_use_double_quant=True,         # double quantization for extra memory savings
    bnb_4bit_quant_type="nf4",              # NF4 quantization type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the actual computation
)

# [3] LOAD BASE MODEL & TOKENIZER:
# The checkpoint is loaded with the 4-bit quantization settings defined above.
#   device_map="auto"       - let the library place the layers on the available devices
#   trust_remote_code=True  - allow model code shipped with the checkpoint to run
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# The matching tokenizer. It gets a padding token (the end-of-sequence token is
# reused for that) and pads on the right, so sequences of different lengths can
# be batched during training.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# [4] CONFIGURE LoRA:
# LoRA (Low-Rank Adaptation) trains small low-rank matrices attached to selected
# layers instead of the full weights, so only a small number of parameters is
# updated during fine-tuning.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",   # causal language modeling
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],  # layers that receive adapters
    r=16,                    # rank of the low-rank matrices
    lora_alpha=32,           # scaling factor applied to the LoRA update
    lora_dropout=0.05,       # dropout probability inside the LoRA layers
    bias="none",             # bias handling: "none", "all", or "lora_only"
)

# First prepare the quantized model for k-bit training, then wrap it so the
# LoRA adapters described by `lora_config` are attached.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# [5] LOAD Banking77 DATASET:
# Draw a reproducible random sample of 500 training examples. Simply taking the
# head of the split (`train[:500]`) risks covering only a handful of the 77
# intents if the underlying file is grouped by label (NOTE(review): confirm the
# ordering of the split), which would make the fine-tuning run look far better
# than it is. Shuffling with a fixed seed keeps the subset representative and
# identical across re-runs.
dataset = (
    load_dataset('banking77', split='train')
    .shuffle(seed=42)
    .select(range(500))
)
# Intent names, indexed by the integer stored in each example's `label` field.
labels = dataset.features['label'].names

# Formatting function for prompts:
# Define a function to format each sample into a prompt suitable for instruction-following.
# It includes a system instruction, the user's text, and the expected assistant response (the intent label).
def format_prompt(sample):
    """Render one dataset example as a training prompt.

    The prompt consists of six newline-separated parts: the `<|system|>` tag,
    the fixed classification instruction, the `<|user|>` tag, the example's
    text, the `<|assistant|>` tag, and the intent name looked up in `labels`
    via the example's integer `label`.

    Args:
        sample: Mapping with a 'text' entry and an integer 'label' entry.

    Returns:
        The formatted prompt as a single string.
    """
    instruction = "You are a banking assistant. Classify the user's request into one of the predefined intents."
    user_text = sample['text']
    intent_name = labels[sample['label']]
    parts = [
        "<|system|>",
        instruction,
        "<|user|>",
        str(user_text),
        "<|assistant|>",
        str(intent_name),
    ]
    return "\n".join(parts)

# Preprocess dataset (tokenization):
# Define a function to tokenize each formatted prompt.
# It uses the loaded tokenizer, appends an end-of-sequence marker, and truncates
# to a maximum length. Padding is intentionally NOT done here.
def preprocess(sample):
    """Tokenize one formatted example for supervised fine-tuning.

    Two deliberate choices:

    * The tokenizer's EOS token is appended to the prompt so the model sees an
      explicit "stop here" target right after the intent label.
    * No padding is applied. The pad token of this tokenizer is the EOS token,
      so padding every example to 256 tokens would fill most of each sequence
      with EOS ids; depending on the data collator these positions are either
      scored as real targets (which drags the reported loss towards zero) or
      the genuine end-of-text token is masked out together with the padding.
      Leaving the examples unpadded lets the trainer's collator pad each batch
      only to the length of its longest member.

    Args:
        sample: Mapping with 'text' and 'label' entries, as expected by
            `format_prompt`.

    Returns:
        The tokenizer output for the example (unpadded, truncated to at most
        256 tokens).
    """
    text = format_prompt(sample) + tokenizer.eos_token
    return tokenizer(
        text,
        max_length=256,   # Maximum sequence length
        truncation=True,  # Truncate sequences longer than max_length
    )

# Tokenize every example; the original dataset columns are dropped so that
# only the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    preprocess,
    remove_columns=dataset.column_names,
)

# [6] TRAINING ARGUMENTS (disable W&B reporting):
# Mixed precision follows the hardware: the quantized layers compute in
# bfloat16, so bf16 autocast is used whenever the GPU supports it natively
# (compute capability >= 8.0). Older GPUs keep the previous fp16 behaviour.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir="./banking_chatbot", # Directory to save model checkpoints and logs
    per_device_train_batch_size=2,  # Batch size per device during training
    gradient_accumulation_steps=2,  # Batches whose gradients are accumulated before each optimizer update
    learning_rate=2e-4,             # Learning rate for the optimizer
    num_train_epochs=1,             # Number of training epochs
    logging_steps=10,               # Log training progress every n steps
    bf16=use_bf16,                  # bfloat16 mixed precision where natively supported
    fp16=not use_bf16,              # otherwise fall back to float16 mixed precision
    save_strategy="epoch",          # Save checkpoint at the end of each epoch
    report_to="none"                # Disable reporting to external services like Weights & Biases
)

# [7] TRAINER:
# Initialize the SFTTrainer (Supervised Fine-tuning Trainer).
# - `model` has already been wrapped with the LoRA adapters by `get_peft_model`,
#   so no `peft_config` is passed here; handing over both a PEFT-wrapped model
#   and a config is redundant and is an error-prone combination.
# - `processing_class=tokenizer` makes the trainer use the tokenizer configured
#   in this notebook (pad token = EOS, right padding) instead of loading a fresh
#   one from the Hub, and lets it be saved alongside the adapter.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
)

# Run the fine-tuning loop.
print("Starting fine-tuning...")
trainer.train()

# Persist the result of the run in the output directory, then report where it
# was written.
trainer.save_model("./banking_chatbot")
print("Fine-tuning complete! Model saved at './banking_chatbot'")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Loading base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]



Truncating train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,5.5701
20,0.6611
30,0.3803
40,0.2035
50,0.1447
60,0.136
70,0.1208
80,0.099
90,0.1026
100,0.1166


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fine-tuning complete! Model saved at './banking_chatbot'


In [None]:
# Import necessary libraries and classes
from peft import PeftModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hub id of the base checkpoint the adapter was trained on.
model_name = "microsoft/phi-2"

# [1] QUANTIZATION CONFIGURATION:
# Same 4-bit settings as during training, so the base model is loaded the same
# way for inference.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store the weights in 4-bit
    bnb_4bit_use_double_quant=True,         # double quantization
    bnb_4bit_quant_type="nf4",              # NF4 quantization type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the actual computation
)

# [2] TOKENIZER:
# Tokenizer of the base model, configured like in training: the end-of-sequence
# token doubles as the padding token and padding is added on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# [3] BASE MODEL (quantized):
# Load the pre-trained checkpoint with the 4-bit configuration and let
# device_map="auto" place it on the available devices.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# [4] FINE-TUNED MODEL (LoRA adapter):
# Attach the adapter weights saved in ./banking_chatbot to the quantized base
# model.
# NOTE(review): the adapter is attached to `base_model` itself rather than to a
# copy - confirm before treating `base_model` as an untouched baseline later on.
ft_model = PeftModel.from_pretrained(base_model, "./banking_chatbot")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Define a function to generate a response from the model given user input.
def generate_response(model, user_input):
    """Generate the model's answer to a single user message.

    The prompt is built with exactly the template used for the training
    examples (same system instruction, same tags, same line breaks), so the
    fine-tuned adapter is queried with the format it was trained on.

    Args:
        model: Model exposing `generate` and `device` (base or PEFT-wrapped).
        user_input: The user's message as a string.

    Returns:
        The newly generated text only (prompt excluded), stripped of
        surrounding whitespace.
    """
    instruction = "You are a banking assistant. Classify the user's request into one of the predefined intents."
    prompt = f"<|system|>\n{instruction}\n<|user|>\n{user_input}\n<|assistant|>\n"

    # Put the inputs on the device the model actually lives on instead of
    # assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=80,  # Maximum number of new tokens to generate
        temperature=0.7,    # Controls the randomness of the output (higher = more random)
        top_p=0.9,          # Controls the diversity of the output (nucleus sampling)
        do_sample=True,     # Enable sampling (instead of greedy decoding)
        eos_token_id=tokenizer.eos_token_id,   # Stop generation at the end-of-sequence token
        pad_token_id=tokenizer.eos_token_id    # Use EOS token for padding
    )

    # The returned sequence starts with the prompt tokens. Slice them off and
    # decode only the continuation; this is more reliable than splitting the
    # decoded text on "<|assistant|>", which breaks if the model emits that tag
    # itself.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    # skip_special_tokens=True removes special tokens like EOS.
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# [6] TEST WITH SAMPLE QUERY:
# Define a sample user query to test the models.
user_query = "I lost my credit card, what should I do?"

# Generate and print the response of the base model.
# Loading the adapter with PeftModel.from_pretrained injects the LoRA layers
# into the modules of `base_model` itself, so calling `base_model` directly
# would already include the fine-tuned weights. Temporarily disabling the
# adapter gives a genuine baseline from the same loaded weights.
print("======== BASE MODEL OUTPUT ========")
with ft_model.disable_adapter():
    print(generate_response(ft_model, user_query))

# Generate and print the response of the fine-tuned model (adapter active).
print("\n======== FINE-TUNED MODEL OUTPUT ========")
print(generate_response(ft_model, user_query))

You can still report the loss, but you will need to provide your name

In that case, you can contact your bank directly to report the loss and request a new
