In [1]:
!pip install -U transformers
!pip install rouge-score nltk

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
Successfully installed transformers-4.52.4
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=75b18a0cefa96e7758076ba165d8e0b319dd90f203b2535b17e88bacef4e4775
  Stored in directory: /root/.cache/pip/w

Model page: https://huggingface.co/google/flan-t5-base

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/google/flan-t5-base)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [2]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFT5ForConditionalGeneration
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

2025-06-15 18:43:08.000160: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750012988.212636      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750012988.267024      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
df = pd.read_parquet("hf://datasets/eagle0504/warren-buffett-letters-qna-r1-enhanced-1998-2024/data/train-00000-of-00001.parquet")

In [4]:
df.head()

Unnamed: 0,question,answer,reasoning
0,"**""How does Warren Buffett's principle of 'pra...",A good answer would be: \n\n*Warren Buffett e...,The reasoning is as follows: \n\n1. **Context...
1,Here are a few strong questions worth asking b...,A good answer would be: \n\nWarren Buffett ac...,"Warren Buffett emphasizes transparency, accoun..."
2,"**""How does Warren Buffett's principle of 'pra...",Here’s a concise answer derived from the parag...,The reasoning is as follows: \n\n1. **Context...
3,"**""How does Warren Buffett's principle of 'pra...",A good answer would highlight Warren Buffett's...,The reasoning is as follows: \n\n1. **Context...
4,"**""How does Warren Buffett's principle of 'pra...",A good answer would highlight Warren Buffett's...,The reasoning is as follows: \n\n1. **Context...


In [5]:
def preprocess_for_chatbot(text):
    # Return empty string if the text is empty
    if pd.isna(text):
        return ""

    text = str(text)

    # Replaces escaped charecters for readability
    text = text.replace("\\n", " ").replace("\\t", " ").replace("\\'", "'").replace('\\"', '"')

    # Use only the first question
    questions = re.findall(r'\*\*"([^"]*?)"\*\*', text)
    if questions:
        return questions[0].strip()

    # Handle Markdown from the text
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # **text** → text
    text = re.sub(r'\*(.*?)\*', r'\1', text)      # *text* → text

    # Removes annotation in text .i.e (explanation)
    text = re.sub(r'\*\([^)]*\)\*', '', text)

    # Makes sure the spacing is uniform
    text = re.sub(r'\s+', ' ', text).strip()

    # Handle quotes
    if text.startswith('"') and text.endswith('"') and text.count('"') == 2:
        text = text[1:-1].strip()

    return text

In [6]:
def preprocess_chatbot_dataset(df):
    # Apply minimal cleaning
    for col in ['question', 'reasoning', 'answer']:
        if col in df.columns:
            df[col] = df[col].apply(preprocess_for_chatbot)

    # Remove empty entries
    df = df.dropna(subset=['question', 'answer'])
    df = df[df['question'].str.len() > 5]
    df = df[df['answer'].str.len() > 10]

    print(f"Dataset ready: {len(df)} samples")

    # Show sample question and answer
    if len(df) > 0:
        print(f"\nSample preserved question:")
        print(f"'{df['question'].iloc[0]}'")
        print(f"\nSample preserved answer:")
        print(f"'{df['answer'].iloc[0]}'")
        print(f"\nSample preserved reasoning:")
        print(f"'{df['reasoning'].iloc[0]}'")

    return df


In [7]:
clean_df = preprocess_chatbot_dataset(df)

Dataset ready: 10657 samples

Sample preserved question:
'How does Warren Buffett's principle of 'praise by name, criticize by category' reflect his broader philosophy on leadership and accountability?'

Sample preserved answer:
'A good answer would be: Warren Buffett emphasizes the importance of transparency, accountability, and prompt corrective action when mistakes occur in business. He acknowledges that errors in capital allocation and personnel decisions are inevitable, but the real failure is in delaying fixes—what Charlie Munger called "thumb-sucking." Unlike many corporations that avoid admitting mistakes, Buffett believes in openly discussing both successes and failures to maintain trust with shareholders.'

Sample preserved reasoning:
'The reasoning is as follows: 1. Context from the Paragraph: Warren Buffett openly discusses Berkshire Hathaway's mistakes in capital allocation, personnel decisions, and delayed corrections, contrasting this transparency with other companies th

In [8]:
def load_tokenizer(model_name="google/flan-t5-base"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return tokenizer


In [9]:
def load_model(model_name="google/flan-t5-base"):
    model = TFT5ForConditionalGeneration.from_pretrained(model_name)
    return model

In [10]:
def format_qa_pair(question, answer):
    input_text = f"Answer this financial question based on Warren Buffett's principles: {question}"

    target_text = answer
    return input_text, target_text

In [11]:
from sklearn.model_selection import train_test_split

def split_train_val(df, val_size=0.2):
    train_df, val_df = train_test_split(df, test_size=val_size, random_state=42)

    return train_df, val_df

In [12]:
import tensorflow as tf

def tokenize_data(df, tokenizer, batch_size=3):
    inputs = []
    labels = []
    attention_masks = [] # Add attention mask

    for index, row in df.iterrows():
        # Tokenize question and get attention mask
        question_tokens = tokenizer(row['question'], max_length=256, padding='max_length', truncation=True, return_tensors='tf')
        inputs.append(question_tokens['input_ids'][0])
        attention_masks.append(question_tokens['attention_mask'][0]) # Append attention mask

        # Tokenize answer
        answer_tokens = tokenizer(row['answer'], max_length=256, padding='max_length', truncation=True, return_tensors='tf')
        labels.append(answer_tokens['input_ids'][0])


    # Convert lists to tensors
    inputs = tf.stack(inputs)
    attention_masks = tf.stack(attention_masks) # Stack attention masks
    labels = tf.stack(labels)

    # Return as a dictionary
    return tf.data.Dataset.from_tensor_slices({
        'input_ids': inputs,
        'attention_mask': attention_masks,
        'labels': labels
    }).batch(batch_size)

In [13]:
tokenizer = load_tokenizer()

train_df, val_df = split_train_val(df)

print(f"Train samples: {len(train_df)}")
print(f"Val samples: {len(val_df)}")

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Train samples: 8525
Val samples: 2132


In [14]:
def train_model(train_dict, val_dict, learning_rate=5e-5, callbacks=[], epochs=25):
    model = load_model()

    # Compile with optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer)

    # Train with proper dataset format
    model.fit(
        train_dict,
        validation_data=val_dict,
        epochs=epochs
    )

    return model

In [15]:
train_dict = tokenize_data(train_df, tokenizer)
val_dict = tokenize_data(val_df, tokenizer)

I0000 00:00:1750013018.841791      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [16]:
def get_simple_callbacks():    
    # TUNABLE CONSTANTS - Change these easily
    PATIENCE = 1              
    LR_REDUCTION_FACTOR = 0.5 
    LR_PATIENCE = 2           
    MIN_LR = 1e-7            
    MIN_DELTA = 0.01         
    RESTORE_BEST = True
    
    callbacks = [
        # Stop training if no improvement
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=PATIENCE,
            restore_best_weights=RESTORE_BEST,
            min_delta=MIN_DELTA,
            verbose=1
        ),
        
        # Reduce learning rate when stuck
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=LR_REDUCTION_FACTOR,
            patience=LR_PATIENCE,
            min_lr=MIN_LR,
            verbose=1
        ),
        
        # Save best model
        tf.keras.callbacks.ModelCheckpoint(
            filepath='./best_model',
            monitor='val_loss',
            save_best_only=True,
            verbose=1
        )
    ]
    
    return callbacks

In [17]:
callbacks = get_simple_callbacks()

model = train_model(train_dict, val_dict, callbacks=callbacks)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Epoch 1/25


I0000 00:00:1750013148.763815     128 service.cc:148] XLA service 0x7f87d0036060 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1750013148.764534     128 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1750013148.841986     128 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1750013149.025963     128 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/25
Epoch 3/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [18]:
question = "What philosophy do you use when deciding if a company is valuable?"

In [19]:
# Make a prediction
def predict_answer(question, model, tokenizer):
    # Format the input similar to training
    input_text = f"Answer this financial question based on Warren Buffett's principles: {question}"

    # Tokenize the input
    input_tokens = tokenizer(input_text, return_tensors="tf", max_length=256, padding='max_length', truncation=True)

    # Generate the answer
    generated_tokens = model.generate(
        input_tokens["input_ids"],
        attention_mask=input_tokens["attention_mask"],
        max_length=256,
        num_beams=4,
        early_stopping=True
    )

    # Decode the generated tokens back to text
    predicted_answer = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    return predicted_answer

predicted_answer = predict_answer(question, model, tokenizer)
print(f"\nQuestion: {question}")
print(f"Predicted Answer: {predicted_answer}")


Question: What philosophy do you use when deciding if a company is valuable?
Predicted Answer: Here’s a strong answer derived from the paragraph and question: "Warren Buffett values businesses with long-term competitive advantages in stable industries, even if they have low organic growth, because they generate high returns on invested capital and free cash flow that can be reinvested elsewhere. He emphasizes transparency in reporting so shareholders can make informed decisions, and he highlights Berkshire Hathaway’s transparency about acquisitions and investments, which aligns with his philosophy of avoiding misleading metrics like EBITDA."


In [20]:
def calculate_bleu(reference, candidate):
    """BLEU score for text similarity"""
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return sentence_bleu([reference_tokens], candidate_tokens)

In [21]:
def calculate_rouge(reference, candidate):
    """ROUGE score for text quality"""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return {
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeL': scores['rougeL'].fmeasure
    }

In [22]:
import math

def calculate_perplexity(model, tokenizer, text, max_length=256):
    try:
        inputs = tokenizer(text, return_tensors='tf', max_length=max_length, truncation=True)
        input_ids = inputs['input_ids']
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        perplexity = math.exp(loss.numpy())
        return perplexity
    except:
        return float('inf')

In [23]:
def evaluate_model(model, tokenizer, test_df, sample_number=10):
    bleu_scores = []
    rouge_scores = []
    perplexity_scores = []

    test_df = test_df.sample(n=sample_number, random_state=42)

    for _, row in test_df.iterrows():
        # Generate prediction
        input_text = f"Answer this financial question based on Warren Buffett's principles: {row['question']}"
        inputs = tokenizer.encode(input_text, return_tensors='tf')
        outputs = model.generate(inputs, max_length=200)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Calculate metrics
        bleu = calculate_bleu(row['answer'], prediction)
        rouge = calculate_rouge(row['answer'], prediction)
        perplexity = calculate_perplexity(model, tokenizer, row['answer'])

        bleu_scores.append(bleu)
        rouge_scores.append(rouge)
        perplexity_scores.append(perplexity)

    # Filter out infinite perplexities
    valid_perplexities = [p for p in perplexity_scores if p != float('inf')]

    # Average scores
    avg_bleu = np.mean(bleu_scores)
    avg_rouge1 = np.mean([r['rouge1'] for r in rouge_scores])
    avg_rouge2 = np.mean([r['rouge2'] for r in rouge_scores])
    avg_rougeL = np.mean([r['rougeL'] for r in rouge_scores])
    avg_perplexity = np.mean(valid_perplexities) if valid_perplexities else float('inf')

    print(f"BLEU Score: {avg_bleu:.4f}")
    print(f"ROUGE-1: {avg_rouge1:.4f}")
    print(f"ROUGE-2: {avg_rouge2:.4f}")
    print(f"ROUGE-L: {avg_rougeL:.4f}")
    print(f"Perplexity: {avg_perplexity:.2f}")

    return {
        'bleu': avg_bleu,
        'rouge1': avg_rouge1,
        'rouge2': avg_rouge2,
        'rougeL': avg_rougeL,
        'perplexity': avg_perplexity
    }

In [24]:
metrics = evaluate_model(model, tokenizer, val_df)

  perplexity = math.exp(loss.numpy())
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Score: 0.1808
ROUGE-1: 0.4916
ROUGE-2: 0.2965
ROUGE-L: 0.3876
Perplexity: 1.36


In [25]:
from datetime import datetime

def save_model(model, tokenizer, name="my_model"):
    """Simple save function with timestamp"""
    timestamp = datetime.now().strftime("%m%d_%H%M")
    full_name = f"{name}_{timestamp}"
    model.save_pretrained(f"./models/{full_name}")
    tokenizer.save_pretrained(f"./models/{full_name}")
    print(f"Saved: {full_name}")

save_model(model, tokenizer, name="finance_chatbot")

Saved: finance_chatbot_0616_0137
