Import the Libraries

In [3]:
import os
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import time


Load the Question-Answer pair json and convert to Prompt-Completion pair json

In [4]:

ROOT_DIR = "../"   # adjust if needed (depending on where the notebook lives)

# Load the 50 sampled Q/A pairs
qa_path = os.path.join(ROOT_DIR, "data", "qa_pairs_100.json")
with open(qa_path, "r") as f:
    qa_pairs = json.load(f)

# Convert to GPT prompt/completion format
gpt_ft_pairs = []
for item in qa_pairs:
    gpt_ft_pairs.append({
        "prompt":  item["question"],
        "completion": item["answer"]
    })

print(f"Converted {len(gpt_ft_pairs)} pairs.")

# Save to a new file for fine-tuning
ft_path = os.path.join(ROOT_DIR, "data", "qa_pairs_ft_100.json")
with open(ft_path, "w") as f:
    json.dump(gpt_ft_pairs, f, indent=2)

print(f"Saved GPT-style fine-tuning dataset to: {ft_path}")


Converted 100 pairs.
Saved GPT-style fine-tuning dataset to: ../data\qa_pairs_ft_100.json


Load the model (distilgpt2) and get the baseline results

In [5]:

# Load GPT-style Q/A data (from qa_pairs_ft_5000.json)
ft_path = os.path.join(ROOT_DIR, "data", "qa_pairs_ft_100.json")
with open(ft_path, "r") as f:
    gpt_pairs = json.load(f)

# Take first 10 questions as baseline test
baseline_questions = gpt_pairs[:10]

# Load pre-trained distilgpt2
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Evaluate
baseline_results = []
for item in baseline_questions:
    input_prompt = item["prompt"]
    gt_answer = item["completion"]
    
    start = time.time()
    inputs = tokenizer.encode(input_prompt, return_tensors="pt")
    outputs = model.generate(inputs, max_new_tokens=50)
    elapsed = time.time() - start
    
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    baseline_results.append({
        "question": input_prompt,
        "generated_answer": answer,
        "ground_truth": gt_answer,
        "response_time_sec": elapsed
    })

baseline_results


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask

[{'question': "What was BERKSHIRE HATHAWAY INC's total liabilities in 20221231?",
  'generated_answer': "What was BERKSHIRE HATHAWAY INC's total liabilities in 20221231?\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
  'ground_truth': 'BERKSHIRE HATHAWAY INC reported total liabilities of 466784000000.0 in 20221231.',
  'response_time_sec': 2.448549747467041},
 {'question': "What was FIDELITY NATIONAL INFORMATION SERVICES, INC.'s cash and cash equivalents in 20241231?",
  'generated_answer': "What was FIDELITY NATIONAL INFORMATION SERVICES, INC.'s cash and cash equivalents in 20241231?\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
  'ground_truth': 'FIDELITY NATIONAL INFORMATION SERVICES, INC. reported cash and cash equivalents of 834000000.0 in 20241231.',
  'response_time_sec': 2.61784029006958},
 {'question': "What was GLOBAL GAS CORP's net income in 20231231?",
  'generated

Fine-tune the model over the Prompt-Completion 100 pairs json dataset

In [None]:
# Manual fine-tuning of distilgpt2 (CPU friendly)

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import os

# Load the prompt/completion pairs
ft_path = os.path.join(ROOT_DIR, "data", "qa_pairs_ft_100.json")
with open(ft_path, "r") as f:
    data = json.load(f)

# Load model + tokenizer
model_name = "distilgpt2"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForCausalLM.from_pretrained(model_name)

# ---- PAD TOKEN FIX ----
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Hyperparameters
num_epochs = 4
learning_rate = 5e-5

# Log hyperparameters and setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("🧠 Fine-tuning configuration:")
print(f"• Model: {model_name}")
print(f"• Learning rate: {learning_rate}")
print(f"• Batch size: 1 (manual, per sample)")
print(f"• Number of epochs: {num_epochs}")
print(f"• Compute device: {device}")

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for entry in data:
        prompt = entry["prompt"]
        completion = entry["completion"]
        text = prompt + "\n" + completion

        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256, padding="max_length")
        input_ids = inputs["input_ids"]

        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(data)
    print(f"Epoch {epoch+1}/{num_epochs}  |  avg loss = {avg_loss:.4f}")



Save the model

In [8]:
# Save fine-tuned model
save_path = "../models/fine_tuned_model"
os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("✅ Fine-tuning complete.")
print(f"Model saved to: {save_path}")

✅ Fine-tuning complete.
Model saved to: ../models/fine_tuned_model


Test the model 

In [9]:
# Test the fine-tuned model on a few questions
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, json, time

# Load fine-tuned model
ft_model_path = "../models/fine_tuned_model"
model_ft = AutoModelForCausalLM.from_pretrained(ft_model_path)
tokenizer_ft = AutoTokenizer.from_pretrained(ft_model_path)

# Load the same 10 test questions we used before
with open(os.path.join(ROOT_DIR, "data", "qa_pairs_ft_100.json"), "r") as f:
    qa_full = json.load(f)

test_examples = qa_full[:10]

model_ft.eval()
for item in test_examples:
    prompt = item["prompt"]
    gt_answer = item["completion"]
    
    start = time.time()
    # Encode the prompt and attention mask
    inputs = tokenizer_ft(prompt, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generate only the answer (don't repeat prompt)
    outputs = model_ft.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=50,
        do_sample=False,
        pad_token_id=tokenizer_ft.eos_token_id
    )

    # Remove the prompt part from the output
    generated_text = tokenizer_ft.decode(outputs[0], skip_special_tokens=True)
    answer = generated_text.replace(prompt, "").strip()

    elapsed = time.time() - start

    print("Q:", prompt)
    print("Generated:", answer)
    print("Ground truth:", gt_answer)
    print(f"Time: {elapsed:.2f}s")
    print("-" * 60)



Q: What was BERKSHIRE HATHAWAY INC's total liabilities in 20221231?
Generated: BERKSHIRE HATHAWAY INC reported total liabilities of -0.0 in 20221231.
Ground truth: BERKSHIRE HATHAWAY INC reported total liabilities of 466784000000.0 in 20221231.
Time: 1.13s
------------------------------------------------------------
Q: What was FIDELITY NATIONAL INFORMATION SERVICES, INC.'s cash and cash equivalents in 20241231?
Generated: FIDELITY NATIONAL INFORMATION SERVICES, INC. reported cash and cash equivalents of -0.0 in 20241231.
Ground truth: FIDELITY NATIONAL INFORMATION SERVICES, INC. reported cash and cash equivalents of 834000000.0 in 20241231.
Time: 1.02s
------------------------------------------------------------
Q: What was GLOBAL GAS CORP's net income in 20231231?
Generated: GLOBAL GAS CORP reported net income of -0.0 in 20231231.
Ground truth: GLOBAL GAS CORP reported net income of -300176.0 in 20231231.
Time: 0.99s
------------------------------------------------------------
Q: Wha

Apply Continual Learning/ Domain Adaptation technique for better results

In [None]:
# =============== Continual Learning / Domain Adaptation (Step 3.5) ===============

import pandas as pd

# Load the already fine-tuned model
ft_model_path = "../models/fine_tuned_model"
model = AutoModelForCausalLM.from_pretrained(ft_model_path)
tokenizer = AutoTokenizer.from_pretrained(ft_model_path)

# Load financial_sentences.csv to use as domain-adaptation text
csv_path = os.path.join(ROOT_DIR, "data", "processed", "financial_sentences.csv")
df = pd.read_csv(csv_path)

# Optional: sample ~200 sentences (small continual-learning batch)
domain_sentences = df['sentence'].drop_duplicates().sample(200, random_state=42).tolist()

# Convert to prompt/completion pairs (self-supervised style)
domain_pairs = [{"prompt": s, "completion": s} for s in domain_sentences]

# Second training pass (few epochs)
num_epochs = 2
learning_rate = 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for entry in domain_pairs:
        text = entry["prompt"] + "\n" + entry["completion"]
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256, padding="max_length")
        input_ids = inputs["input_ids"]

        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(domain_pairs)
    print(f"[Continual Phase] Epoch {epoch+1}/{num_epochs} | avg loss = {avg_loss:.4f}")




Save the updated model

In [12]:
# Save updated model
cont_path = "../models/fine_tuned_model"  # overwrite previous version
model.save_pretrained(cont_path)
tokenizer.save_pretrained(cont_path)

print("✅ Continual learning stage complete (model updated).")

✅ Continual learning stage complete (model updated).


Implement Guardrail

In [13]:
# ------------------------ Guardrail (Input Filtering) ------------------------

# We define a basic list of FINANCIAL keywords
FINANCIAL_KEYWORDS = [
    "revenue", "net income", "liabilities", "assets", "cash", "equivalents",
    "earnings", "profit", "income", "expenses"
]

def is_financial_question(question: str) -> bool:
    """ Returns True if the question contains any financial keyword. """
    q_lower = question.lower()
    for kw in FINANCIAL_KEYWORDS:
        if kw in q_lower:
            return True
    return False

# Example usage in your inference function
def generate_answer_with_guardrail(question: str, model, tokenizer):
    # Guardrail check
    if not is_financial_question(question):
        return "[Input Guardrail] Sorry, I can only answer financial-statement-related questions."
    
    # Normal generation
    inputs = tokenizer.encode(question, return_tensors="pt")
    outputs = model.generate(inputs, max_new_tokens=50)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [14]:
print( generate_answer_with_guardrail("What is the capital of France?", model_ft, tokenizer_ft) )

print( generate_answer_with_guardrail("What was Amazon's revenue in 2023?", model_ft, tokenizer_ft) )


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[Input Guardrail] Sorry, I can only answer financial-statement-related questions.
What was Amazon's revenue in 2023?
Amazon reported revenue of -0.0 in 2023.


Confidence score function

In [20]:
def generate_with_confidence(prompt, model, tokenizer, max_new_tokens=50):
    model.eval()

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    
    # Generate output with scores
    outputs = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        return_dict_in_generate=True,
        output_scores=True,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    # Extract generated tokens (excluding the prompt)
    generated_tokens = outputs.sequences[0][input_ids.shape[-1]:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # Get token-level probabilities
    scores = outputs.scores  # List of logits per token
    token_confidences = []
    
    for i, (score, token_id) in enumerate(zip(scores, generated_tokens)):
        try:
            # Get the probability distribution for this token position
            probs = F.softmax(score, dim=-1)
            # Get the probability of the generated token
            token_prob = probs[0, token_id].item()
            token_confidences.append(token_prob)
        except IndexError:
            # If there's an index error, skip this token
            token_confidences.append(0.0)
            continue

    avg_confidence = sum(token_confidences) / len(token_confidences) if token_confidences else 0.0

    return {
        "generated_text": generated_text.strip(),
        "token_confidences": token_confidences,
        "avg_confidence": avg_confidence
    }

Test the fine-tuned model

In [21]:
import os
# Test the fine-tuned model on a few questions
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, json, time

# Load fine-tuned model
ft_model_path = "../models/fine_tuned_model"
model_ft = AutoModelForCausalLM.from_pretrained(ft_model_path)
tokenizer_ft = AutoTokenizer.from_pretrained(ft_model_path)

# Load the same 10 test questions we used before
with open(os.path.join(ROOT_DIR, "data", "qa_pairs_ft_100.json"), "r") as f:
    qa_full = json.load(f)

test_examples = qa_full[:10]

model_ft.eval()
for item in test_examples:
    prompt = item["prompt"]
    gt_answer = item["completion"]

    start = time.time()
    result = generate_with_confidence(prompt, model_ft, tokenizer_ft)
    elapsed = time.time() - start

    print("Q:", prompt)
    print("Generated:", result["generated_text"])
    print("Ground Truth:", gt_answer)
    print(f"Confidence: {result['avg_confidence']:.3f}")
    print(f"Time: {elapsed:.2f}s")
    print("-" * 60)


Q: What was BERKSHIRE HATHAWAY INC's total liabilities in 20221231?
Generated: BERKSHIRE HATHAWAY INC reported total liabilities of 101000000.0 in 20221231.
Ground Truth: BERKSHIRE HATHAWAY INC reported total liabilities of 466784000000.0 in 20221231.
Confidence: 0.896
Time: 1.04s
------------------------------------------------------------
Q: What was FIDELITY NATIONAL INFORMATION SERVICES, INC.'s cash and cash equivalents in 20241231?
Generated: FIDELITY NATIONAL INFORMATION SERVICES, INC. reported cash and cash equivalents of 20241231.
Ground Truth: FIDELITY NATIONAL INFORMATION SERVICES, INC. reported cash and cash equivalents of 834000000.0 in 20241231.
Confidence: 0.978
Time: 1.08s
------------------------------------------------------------
Q: What was GLOBAL GAS CORP's net income in 20231231?
Generated: GLOBAL GAS CORP reported net income of -0.0 in 20231231.
Ground Truth: GLOBAL GAS CORP reported net income of -300176.0 in 20231231.
Confidence: 0.910
Time: 1.08s
--------------