<a href="https://colab.research.google.com/github/Annemarie535257/Financial_ChatBot/blob/main/LLM_Financial_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
# Environment and configuration
import os
import time
import random
import math

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from transformers import (
    T5TokenizerFast,
    TFT5ForConditionalGeneration,
    create_optimizer,
)

import datasets as hf_datasets
import evaluate as hf_evaluate

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Paths
DATA_PATH = os.path.join(os.getcwd(), 'bitext-mortgage-loans-llm-chatbot-training-dataset.csv')
SAVE_ROOT = os.path.join(os.getcwd(), 'saved_models')
os.makedirs(SAVE_ROOT, exist_ok=True)

# Model and training hyperparameters
MODEL_NAME = 'google/flan-t5-small'
MAX_SOURCE_LENGTH = 256
MAX_TARGET_LENGTH = 128
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 3e-4
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01

RUN_ID = time.strftime('HUFI_V1_FLAN_T5_%Y%m%d_%H%M%S')
OUTPUT_DIR = os.path.join(SAVE_ROOT, RUN_ID)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f'Run ID: {RUN_ID}\nSaving to: {OUTPUT_DIR}')


Run ID: HUFI_V1_FLAN_T5_20251013_085942
Saving to: /content/saved_models/HUFI_V1_FLAN_T5_20251013_085942


In [3]:
# Load and inspect dataset
assert os.path.exists(DATA_PATH), f"Dataset not found at {DATA_PATH}"

df = pd.read_csv(DATA_PATH)
print(df.head(2))
print('Columns:', df.columns.tolist())
print('Shape:', df.shape)

# Identify columns
QUESTION_COL = 'instruction' if 'instruction' in df.columns else df.columns[1]
ANSWER_COL = 'response' if 'response' in df.columns else df.columns[-1]

# Clean basic
for col in [QUESTION_COL, ANSWER_COL]:
    df[col] = df[col].astype(str).str.strip()

df = df.dropna(subset=[QUESTION_COL, ANSWER_COL])
df = df.drop_duplicates(subset=[QUESTION_COL, ANSWER_COL])
print('After cleaning:', df.shape)


                                       system_prompt  \
0  You are an expert in customer support for mort...   
1  You are an expert in customer support for mort...   

                                         instruction          intent  \
0  could you help me to add a co-borrower to my m...  add_coborrower   
1  I would like to add a co-borrower ot my auto l...  add_coborrower   

             category    tags  \
0  LOAN_MODIFICATIONS    BILP   
1  LOAN_MODIFICATIONS  BCILPZ   

                                            response  
0  I'm on it! I'm here to assist you with adding ...  
1  Absolutely! I'm here to assist you in adding a...  
Columns: ['system_prompt', 'instruction', 'intent', 'category', 'tags', 'response']
Shape: (36778, 6)
After cleaning: (36778, 6)


In [4]:
# Stratified train/val/test split by intent if available
from sklearn.model_selection import train_test_split

stratify_col = df['intent'] if 'intent' in df.columns else None
train_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED, shuffle=True, stratify=stratify_col)
stratify_col_tv = train_df['intent'] if 'intent' in train_df.columns else None
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=SEED, shuffle=True, stratify=stratify_col_tv)

print('Split sizes -> train:', len(train_df), 'val:', len(val_df), 'test:', len(test_df))


Split sizes -> train: 29790 val: 3310 test: 3678


In [5]:
# Tokenizer and formatting

tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
PREFIX = 'answer the question: '

def format_example(question: str, answer: str):
    return PREFIX + question, answer

for i in range(2):
    s, t = format_example(train_df.iloc[i][QUESTION_COL], train_df.iloc[i][ANSWER_COL])
    print('SRC:', s[:100])
    print('TGT:', t[:100])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

SRC: answer the question: could ya help me applying for an fucking home loan with my partner
TGT: I genuinely apologize for the language used in your instruction and any frustration it may have caus
SRC: answer the question: I'd like to see ur niterest rates can i get some help
TGT: Unquestionably! I'd be more than happy to assist you with finding the information about our interest


In [6]:
# Build Hugging Face datasets

def to_hf_dataset(frame: pd.DataFrame) -> hf_datasets.Dataset:
    sources, targets = [], []
    for _, row in frame.iterrows():
        s, t = format_example(row[QUESTION_COL], row[ANSWER_COL])
        sources.append(s)
        targets.append(t)
    return hf_datasets.Dataset.from_dict({'source': sources, 'target': targets})

raw_train = to_hf_dataset(train_df)
raw_val = to_hf_dataset(val_df)
raw_test = to_hf_dataset(test_df)

print(raw_train[0])


{'source': 'answer the question: could ya help me applying for an fucking home loan with my partner', 'target': "I genuinely apologize for the language used in your instruction and any frustration it may have caused. I can definitely assist you in applying for a home loan with your partner. Let's get started!\n\nApplying for a home loan with a partner can be a great way to share the financial responsibility. Here's what you need to do:\n\n1. Research and compare different lenders to find the one that suits your needs. Consider factors like interest rates, repayment terms, and customer reviews.\n\n2. Gather the necessary documentation such as proof of income, identification documents, bank statements, and employment history for both you and your partner. You may also be required to provide details about the property you're interested in.\n\n3. Contact the chosen lender or apply through their website. You'll need to provide the required information and submit the application along with t

In [7]:
# Tokenize datasets

def tokenize_function(batch):
    model_inputs = tokenizer(
        batch['source'],
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
        padding='max_length',
        return_tensors='np',
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch['target'],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding='max_length',
            return_tensors='np',
        )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_tokenized = raw_train.map(tokenize_function, batched=True, remove_columns=['source','target'])
val_tokenized = raw_val.map(tokenize_function, batched=True, remove_columns=['source','target'])
test_tokenized = raw_test.map(tokenize_function, batched=True, remove_columns=['source','target'])

for ds in [train_tokenized, val_tokenized, test_tokenized]:
    ds.set_format(type='numpy')


Map:   0%|          | 0/29790 [00:00<?, ? examples/s]



Map:   0%|          | 0/3310 [00:00<?, ? examples/s]

Map:   0%|          | 0/3678 [00:00<?, ? examples/s]

In [8]:
# tf.data input pipelines

def to_tf_dataset(tokenized: hf_datasets.Dataset, batch_size: int) -> tf.data.Dataset:
    feats = {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': tokenized['labels'],
    }
    def gen():
        for i in range(len(tokenized)):
            yield {k: feats[k][i] for k in feats}
    sig = {
        'input_ids': tf.TensorSpec(shape=(None,), dtype=tf.int32),
        'attention_mask': tf.TensorSpec(shape=(None,), dtype=tf.int32),
        'labels': tf.TensorSpec(shape=(None,), dtype=tf.int32),
    }
    return tf.data.Dataset.from_generator(gen, output_signature=sig).shuffle(1024, seed=SEED).batch(batch_size).prefetch(tf.data.AUTOTUNE)

train_ds = to_tf_dataset(train_tokenized, BATCH_SIZE)
val_ds = to_tf_dataset(val_tokenized, BATCH_SIZE)


In [9]:
# Model and training loop

from google.colab import drive
drive.mount('/content/drive')

tf.keras.backend.clear_session()
model = TFT5ForConditionalGeneration.from_pretrained(MODEL_NAME, from_pt=True)

num_train_steps = math.ceil(len(train_tokenized) / BATCH_SIZE) * EPOCHS
num_warmup_steps = int(num_train_steps * WARMUP_RATIO)

optimizer, lr_schedule = create_optimizer(
    init_lr=LEARNING_RATE,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
    weight_decay_rate=WEIGHT_DECAY,
)

model.compile(optimizer=optimizer)

# Change OUTPUT_DIR to a path in Google Drive
OUTPUT_DIR = os.path.join('/content/drive/MyDrive', 'saved_models', RUN_ID)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f'Run ID: {RUN_ID}\nSaving to: {OUTPUT_DIR}')

ckpt_cb = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(OUTPUT_DIR, 'ckpt'),
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

es_cb = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[ckpt_cb, es_cb],
)

model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print('Saved to', OUTPUT_DIR)

Mounted at /content/drive


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFT5ForConditionalGeneration: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight']
- This IS expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can al

Run ID: HUFI_V1_FLAN_T5_20251013_085942
Saving to: /content/drive/MyDrive/saved_models/HUFI_V1_FLAN_T5_20251013_085942
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Saved to /content/drive/MyDrive/saved_models/HUFI_V1_FLAN_T5_20251013_085942


In [10]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacreb

In [11]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=5580b18bc373d422fc59de62af814728878914124b6a9c5abbfe1b80138035c9
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [17]:
# Evaluation: BLEU, ROUGE-L, perplexity

bleu = hf_evaluate.load('sacrebleu')
rouge = hf_evaluate.load('rouge')

def generate_answers(questions, max_new_tokens=64):
    inputs = tokenizer(['answer the question: ' + q for q in questions], return_tensors='tf', padding=True, truncation=True, max_length=MAX_SOURCE_LENGTH)
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

sample_size = min(256, len(val_df))
sample_questions = val_df[QUESTION_COL].tolist()[:sample_size]
sample_refs = [[a] for a in val_df[ANSWER_COL].tolist()[:sample_size]]

preds = generate_answers(sample_questions)

bleu_res = bleu.compute(predictions=preds, references=sample_refs)
rouge_res = rouge.compute(predictions=preds, references=[r[0] for r in sample_refs])

val_loss = model.evaluate(val_ds, return_dict=True)['loss']
perplexity = math.exp(val_loss) if val_loss < 20 else float('inf')

print('BLEU:', bleu_res)
print('ROUGE-L:', rouge_res.get('rougeL'))
print('Val loss:', val_loss, 'Perplexity:', perplexity)


BLEU: {'score': 3.5827939633894372, 'counts': [10437, 6566, 4680, 3458], 'totals': [14192, 13936, 13680, 13424], 'precisions': [73.5414317925592, 47.11538461538461, 34.21052631578947, 25.759833134684147], 'bp': 0.08570821060272206, 'sys_len': 14192, 'ref_len': 49059}
ROUGE-L: 0.2841221430578904
Val loss: 0.5689411759376526 Perplexity: 1.7663957585907928


In [18]:
# Qualitative test: predictions vs references on a small sample

NUM_EXAMPLES = 10
MAX_NEW_TOKENS = 64

sample_questions = val_df[QUESTION_COL].tolist()[:NUM_EXAMPLES]
sample_refs = val_df[ANSWER_COL].tolist()[:NUM_EXAMPLES]

def generate_answers_list(questions, max_new_tokens=64):
    inputs = tokenizer(
        ['answer the question: ' + q for q in questions],
        return_tensors='tf',
        padding=True,
        truncation=True,
        max_length=MAX_SOURCE_LENGTH,
    )
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

preds = generate_answers_list(sample_questions, max_new_tokens=MAX_NEW_TOKENS)

for i, (q, ref, pred) in enumerate(zip(sample_questions, sample_refs, preds), 1):
    print(f'--- Example {i} ---')
    print('Q:', q)
    print('REF:', ref[:400])
    print('PRED:', pred[:400])
    print()

print(f'Shown {NUM_EXAMPLES} qualitative examples.')


--- Example 1 ---
Q: need to update my fucking prfeerred account number can ya help me
REF: It's our responsibility, and we're sorry for the frustration you may have experienced while trying to update your preferred account number. I'm here to assist you with that. Rest assured, we'll get it sorted out. To update your preferred account number, please follow these steps:

1. Log in to your account on our website/app.
2. Navigate to the "Account Settings" or "Profile" section.
3. Look for 
PRED: I genuinely apologize for any frustration you may have experienced while trying to update your preferred account number. I'm here to assist you with that. Let's get it sorted out. To update your preferred account number, please follow these steps: 1. Log in to your account on our website or mobile app.

--- Example 2 ---
Q: how to pay off my fucking loans earlier
REF: I genuinely apologize for the frustration you may be feeling. I understand that paying off your loans earlier is an important goal