In [None]:
# Install/upgrade transformers plus the packages used for the ROUGE and BLEU metrics below.
# NOTE(review): -U pulls the newest transformers release; confirm it still provides the
# TensorFlow class TFT5ForConditionalGeneration imported in the next cell.
!pip install -U transformers
!pip install rouge-score nltk

In [None]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFT5ForConditionalGeneration
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

## Dataset

The Warren Buffett Letters Q&A Enhanced Dataset (1998-2024) is a curated collection of question-answer pairs derived from Warren Buffett's annual shareholder letters to Berkshire Hathaway. The dataset spans 26 years of Buffett's investment wisdom and contains structured Q&A pairs extracted from real letter excerpts for conversational learning applications.

**Column Descriptions:**
- **question**: Financial queries covering investment philosophy, business analysis, market observations, and strategic decisions based on Buffett's letter content
- **answer**: Expert-level responses reflecting Buffett's investment principles, analytical approach, and decision-making framework
- **reasoning**: Detailed explanations of the underlying logic and investment principles behind each answer, providing context from Buffett's methodology

The dataset was chosen for its authentic representation of Warren Buffett's investment philosophy and its structured format that enables effective fine-tuning of sequence-to-sequence models for financial advisory applications in the value investing domain.

In [None]:
# Read the dataset's single train parquet file straight from the Hugging Face Hub (hf:// path).
df = pd.read_parquet("hf://datasets/eagle0504/warren-buffett-letters-qna-r1-enhanced-1998-2024/data/train-00000-of-00001.parquet")

In [None]:
# Preview the first rows to check the loaded columns.
df.head()

## Model

FLAN-T5 is an enhanced version of Google's T5 (Text-to-Text Transfer Transformer) architecture. It extends the original T5 by fine-tuning on over 1,000 additional tasks across multiple languages, creating a more robust foundation model for natural language processing applications.

**Key advantages of FLAN-T5:**
- **Multilingual capabilities**: Trained on diverse language datasets for broader applicability  
- **Sequence-to-sequence excellence**: Proven effectiveness for conditional text generation tasks
- **Large-scale pre-training**: Extensive training corpus ensures strong foundational knowledge

FLAN-T5 was selected as the base model due to its demonstrated performance on sequence-to-sequence tasks and its ability to follow instructions effectively, making it well-suited for question-answering applications.

**Base Model**: [google/flan-t5-base](https://huggingface.co/google/flan-t5-base)

In [None]:
def load_tokenizer(model_name="google/flan-t5-base"):
    """
    Fetch the tokenizer that belongs to a HuggingFace checkpoint.

    Args:
        model_name: HuggingFace model identifier (default: "google/flan-t5-base")

    Returns:
        The tokenizer returned by AutoTokenizer.from_pretrained(model_name)
    """
    # Let AutoTokenizer pick the tokenizer class that matches the checkpoint
    return AutoTokenizer.from_pretrained(model_name)


In [None]:
def load_model(model_name="google/flan-t5-base"):
    """
    Fetch pre-trained T5 weights as a TensorFlow conditional-generation model.

    Args:
        model_name: HuggingFace model identifier (default: "google/flan-t5-base")

    Returns:
        TFT5ForConditionalGeneration: Model instance created by from_pretrained(model_name)
    """
    # Instantiate the sequence-to-sequence model from the named checkpoint
    return TFT5ForConditionalGeneration.from_pretrained(model_name)

## Preprocessing

The dataset undergoes comprehensive preprocessing to ensure high-quality training data for the sequence-to-sequence model. The preprocessing pipeline consists of dataset combination, text cleaning, and quality filtering.

**Dataset Enhancement:**
The original financial Q&A data is combined with 75+ out-of-domain questions paired with standardized rejection responses, teaching the model to recognize and politely decline non-financial queries.

**Text Cleaning:**
Removes markdown formatting, escaped characters, and annotations while preserving content. Normalizes whitespace and, when a field contains a bold, quoted question (`**"..."**`), keeps only the first such question, so that the model receives clean, uniform input.

**Quality Filtering:**
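Eliminates entries with missing data and enforces minimum length requirements (questions longer than 5 characters, answers longer than 10 characters). The same filters are applied to every row, so the out-of-domain rejection examples are retained only if they also meet these thresholds.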

This preprocessing approach creates clean input that enables effective model training while teaching proper domain boundaries and preserving Buffett's investment insights.

In [None]:
def preprocess_for_chatbot(text):
    """
    Preprocesses text for chatbot input by cleaning formatting and extracting questions.

    Processing order:
      1. Missing values become an empty string.
      2. Literal escape sequences (backslash-n, backslash-t, escaped quotes) are unescaped.
      3. If the text contains a bold, quoted question (**"..."**), the first one is
         returned immediately and nothing else is kept.
      4. Italic parenthesised annotations (*(...)*) are removed.
      5. Remaining bold/italic markers are stripped, whitespace is collapsed, and a
         single pair of quotes wrapping the whole text is removed.

    Args:
        text: Raw text input (may contain markdown, escape characters, annotations)

    Returns:
        str: Cleaned text ready for chatbot processing
    """
    # Handle missing or null values
    if pd.isna(text):
        return ""

    text = str(text)

    # Replace escaped characters for readability
    text = text.replace("\\n", " ").replace("\\t", " ").replace("\\'", "'").replace('\\"', '"')

    # Extract first question from bold markdown format
    questions = re.findall(r'\*\*"([^"]*?)"\*\*', text)
    if questions:
        return questions[0].strip()

    # Remove italic annotations such as *(see the 1998 letter)*.
    # This must run BEFORE the generic emphasis stripping below: once the
    # surrounding asterisks are gone the annotation can no longer be recognised.
    # The lookarounds keep bold text like **(note)** out of this rule.
    text = re.sub(r'(?<!\*)\*\([^)]*\)\*(?!\*)', '', text)

    # Remove markdown formatting
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # **text** → text
    text = re.sub(r'\*(.*?)\*', r'\1', text)      # *text* → text

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove surrounding quotes if they wrap the entire text
    if text.startswith('"') and text.endswith('"') and text.count('"') == 2:
        text = text[1:-1].strip()

    return text

In [None]:
def preprocess_chatbot_dataset(df):
    """
    Preprocesses the entire chatbot dataset by cleaning text and filtering entries.

    The input DataFrame is not modified; all work happens on a copy.

    Args:
        df: DataFrame containing 'question' and 'answer' columns and, optionally,
            a 'reasoning' column

    Returns:
        DataFrame: Cleaned copy in which every question is longer than 5 characters
        and every answer is longer than 10 characters
    """
    # Work on a copy so the caller's DataFrame is left untouched
    df = df.copy()

    # Apply text cleaning to all relevant columns that are present
    for col in ('question', 'reasoning', 'answer'):
        if col in df.columns:
            df[col] = df[col].apply(preprocess_for_chatbot)

    # Remove entries with missing critical data
    df = df.dropna(subset=['question', 'answer'])

    # Filter out entries that are too short to be meaningful
    # (missing values were cleaned to "" above, so they are dropped here)
    df = df[df['question'].str.len() > 5]
    df = df[df['answer'].str.len() > 10]

    print(f"Dataset ready: {len(df)} samples")

    # Display sample data for verification
    if len(df) > 0:
        print("\nSample preserved question:")
        print(f"'{df['question'].iloc[0]}'")
        print("\nSample preserved answer:")
        print(f"'{df['answer'].iloc[0]}'")
        # 'reasoning' is optional, so only show it when the column exists
        if 'reasoning' in df.columns:
            print("\nSample preserved reasoning:")
            print(f"'{df['reasoning'].iloc[0]}'")

    return df

In [None]:
from sklearn.model_selection import train_test_split

def split_train_val(df, val_size=0.1):
    """
    Partition a DataFrame into a training part and a validation part.

    Args:
        df: Input DataFrame to split
        val_size: Fraction of rows placed in the validation set (default: 0.1)

    Returns:
        tuple: (train_df, val_df) - Training and validation DataFrames
    """
    # random_state is fixed so the same rows land in each split on every run
    parts = train_test_split(df, test_size=val_size, random_state=42)
    train_part, val_part = parts

    return train_part, val_part

In [None]:
def tokenize_data(df, tokenizer, batch_size=10):
    """
    Tokenizes question-answer pairs and returns a batched tf.data.Dataset.

    Each question is wrapped in the same instruction prompt that predict_answer
    uses at inference time. Questions and answers are padded/truncated to 256
    tokens. Padding positions in the labels are replaced with -100 so that they
    are ignored by the model's loss instead of being trained on.

    Args:
        df: DataFrame with 'question' and 'answer' columns
        tokenizer: Pre-trained tokenizer (e.g., from transformers library)
        batch_size: Number of samples per batch (default: 10)

    Returns:
        tf.data.Dataset: Batched dataset with input_ids, attention_mask, and labels

    Raises:
        ValueError: If df contains no rows
    """
    if len(df) == 0:
        raise ValueError("tokenize_data received an empty DataFrame")

    # Add context prompt during training (same as inference)
    prompts = [
        f"Answer this financial question based on Warren Buffett's principles: {question}"
        for question in df['question']
    ]
    answers = list(df['answer'])

    # Tokenize whole columns in one call instead of row by row
    tokenizer_kwargs = dict(
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    encoded_prompts = tokenizer(prompts, **tokenizer_kwargs)
    encoded_answers = tokenizer(answers, **tokenizer_kwargs)

    # Mask label padding with -100 so padded positions do not contribute to the loss
    labels = encoded_answers['input_ids']
    labels = tf.where(
        labels == tokenizer.pad_token_id,
        tf.cast(-100, labels.dtype),
        labels
    )

    # Create and return batched dataset
    return tf.data.Dataset.from_tensor_slices({
        'input_ids': encoded_prompts['input_ids'],
        'attention_mask': encoded_prompts['attention_mask'],
        'labels': labels
    }).batch(batch_size)

In [None]:
# Load the out-of-domain data from the Kaggle input directory
# (assumes the CSV uses the same question/answer column names as df — TODO confirm)
out_of_domain_df = pd.read_csv('/kaggle/input/out-of-domain-dataset/out_of_domain_training_data.csv')

# Combine both datasets into one frame with a fresh 0..n-1 index
enhanced_df = pd.concat([df, out_of_domain_df], ignore_index=True)

# Shuffle the combined dataset (fixed seed for reproducibility) and renumber the rows
enhanced_df = enhanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Clean the combined dataset and drop rows that fail the length filters;
# from here on, df refers to the cleaned frame rather than the raw download.
df = preprocess_chatbot_dataset(enhanced_df)

In [None]:
# Hold out part of the cleaned data for validation (split_train_val's default val_size is 0.1)
train_df, val_df = split_train_val(df)

# Report the size of each split
print(f"Train samples: {len(train_df)}")
print(f"Val samples: {len(val_df)}")

In [None]:
def train_model(train_dict, val_dict, learning_rate=5.8e-5, callbacks=None, epochs=40):
    """
    Fine-tunes FLAN-T5 model on question-answer pairs and returns history.

    Args:
        train_dict: Training dataset (tf.data.Dataset with input_ids, attention_mask, labels)
        val_dict: Validation dataset (tf.data.Dataset with same structure)
        learning_rate: AdamW optimizer learning rate (default: 5.8e-5)
        callbacks: List of Keras callbacks for training monitoring
            (default: None, meaning no callbacks)
        epochs: Maximum number of training epochs (default: 40)

    Returns:
        tuple: (model, history) - Fine-tuned model and training history
    """
    # Avoid a shared mutable default: build a fresh list when none is given
    if callbacks is None:
        callbacks = []

    # Load pre-trained FLAN-T5 base model
    model = load_model()

    # Configure AdamW optimizer with the specified learning rate and weight decay
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate=learning_rate,
        weight_decay=0.01
    )
    # No loss is passed to compile(); presumably the model's built-in loss on the
    # 'labels' entry of each batch is used — confirm against the transformers docs.
    model.compile(optimizer=optimizer)

    # Fine-tune model with callbacks for training monitoring
    history = model.fit(
        train_dict,
        validation_data=val_dict,
        epochs=epochs,
        callbacks=callbacks
    )

    return model, history

In [None]:
import matplotlib.pyplot as plt


def plot_training_history(history):
    """
    Draw the per-epoch training loss (and validation loss, when recorded)
    and print the values from the last epoch.

    Args:
        history: Keras training history object
    """
    plt.figure(figsize=(10, 6))

    recorded = history.history

    # Training curve is always drawn
    plt.plot(recorded['loss'], label='Training Loss', color='blue', linewidth=2)

    # Validation curve only exists when validation data was used
    has_validation = 'val_loss' in recorded
    if has_validation:
        plt.plot(recorded['val_loss'], label='Validation Loss', color='red', linewidth=2)

    plt.title('Training History', fontsize=16, fontweight='bold')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # Summarise the last epoch
    print(f"Final Training Loss: {recorded['loss'][-1]:.4f}")
    if has_validation:
        print(f"Final Validation Loss: {recorded['val_loss'][-1]:.4f}")

In [None]:
def get_simple_callbacks():
    """
    Creates standard Keras callbacks for training optimization and monitoring.

    All callbacks monitor 'val_loss' in 'min' mode.

    Returns:
        list: Configured callbacks, in order: primary early stopping, learning rate
        reduction on plateau, best-model checkpointing, and a stricter backup
        early stopping
    """
    # Training hyperparameters, deliberately aggressive so a plateau is caught early
    PATIENCE = 2               # Epochs without improvement before the primary early stop
    LR_REDUCTION_FACTOR = 0.2  # New LR = old LR * factor when a plateau is detected
    LR_PATIENCE = 1            # Epochs without improvement before the LR is reduced
    MIN_LR = 1e-8              # Lower bound for the reduced learning rate
    MIN_DELTA = 0.0005         # Smallest val_loss change that counts as an improvement
    RESTORE_BEST = True        # Roll back to the best weights when stopping

    callbacks = [
        # Prevent overfitting by stopping training when validation loss plateaus
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=PATIENCE,
            restore_best_weights=RESTORE_BEST,
            min_delta=MIN_DELTA,
            verbose=1,
            mode='min'
        ),

        # Adaptive learning rate reduction when training stagnates.
        # The starting learning rate comes from the optimizer; this callback
        # has no initial-LR parameter of its own.
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=LR_REDUCTION_FACTOR,
            patience=LR_PATIENCE,
            min_lr=MIN_LR,
            min_delta=MIN_DELTA,
            cooldown=2,
            verbose=1,
            mode='min'
        ),

        # Save best performing model during training.
        # NOTE(review): this saves the full model to a .keras file — confirm that
        # works for the Hugging Face model class being trained.
        tf.keras.callbacks.ModelCheckpoint(
            filepath='./best_model.keras',
            monitor='val_loss',
            save_best_only=True,
            save_weights_only=False,
            verbose=1,
            mode='min'
        ),

        # Backup early stopping with a tighter threshold.
        # NOTE(review): with patience=1 this stopper will usually fire before the
        # patience=2 stopper above — confirm that both are intended.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=1,
            restore_best_weights=True,
            min_delta=0.0001,
            verbose=1,
            mode='min'
        )
    ]

    return callbacks

In [None]:
# Load the tokenizer for the default checkpoint (google/flan-t5-base)
tokenizer = load_tokenizer()

# Turn both splits into batched tf.data.Dataset objects (default batch size of tokenize_data)
train_dict = tokenize_data(train_df, tokenizer)
val_dict = tokenize_data(val_df, tokenizer)

In [None]:
# Build the early-stopping / LR-schedule / checkpoint callbacks
callbacks = get_simple_callbacks()

# Fine-tune with the default learning rate and epoch limit of train_model
model, history = train_model(train_dict, val_dict, callbacks=callbacks)

In [None]:
# Plot the loss curves recorded during training and print the final values
plot_training_history(history)

In [None]:
# Sample question used below to try out the fine-tuned model
question = "How often do you review your portfolio"

In [None]:
def predict_answer(question, model, tokenizer):
    """
    Produce an answer for a single question with the fine-tuned FLAN-T5 model.

    Args:
        question: Input question text
        model: Fine-tuned TFT5ForConditionalGeneration model
        tokenizer: Corresponding T5 tokenizer

    Returns:
        str: Decoded text of the first generated sequence, special tokens removed
    """
    # Wrap the question in the same instruction prompt used by tokenize_data
    prompt = f"Answer this financial question based on Warren Buffett's principles: {question}"

    # Encode to fixed-length (256) TensorFlow tensors
    encoded = tokenizer(
        prompt,
        return_tensors="tf",
        max_length=256,
        padding='max_length',
        truncation=True
    )

    # Beam search with 4 beams, output capped at 256 tokens
    output_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=256,
        num_beams=4,
        early_stopping=True
    )

    # Only the first returned sequence is turned back into text
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example usage: answer the sample question defined above and show the result
predicted_answer = predict_answer(question, model, tokenizer)
print(f"\nQuestion: {question}")
print(f"Predicted Answer: {predicted_answer}")

In [None]:
def calculate_bleu(reference, candidate):
    """
    Score a generated answer against one reference answer with sentence-level BLEU.

    Both texts are tokenized by whitespace splitting only, and sentence_bleu is
    called with its default settings (no smoothing function is passed).

    Args:
        reference: Ground truth answer text
        candidate: Model-generated answer text

    Returns:
        float: BLEU score between 0 and 1 (higher = better similarity)
    """
    # sentence_bleu expects a list of references, each a list of tokens
    references = [reference.split()]
    hypothesis = candidate.split()

    return sentence_bleu(references, hypothesis)

In [None]:
def calculate_rouge(reference, candidate):
    """
    Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures for a generated answer.

    Args:
        reference: Ground truth answer text
        candidate: Model-generated answer text

    Returns:
        dict: F-measure per metric, keyed 'rouge1', 'rouge2', 'rougeL'
              (0-1 scale, higher = better)
    """
    # rouge1 = unigram overlap, rouge2 = bigram overlap, rougeL = longest common subsequence
    metric_names = ['rouge1', 'rouge2', 'rougeL']

    # Stemming is enabled so inflected word forms can match
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    all_scores = scorer.score(reference, candidate)

    # Keep only the F-measure of each metric
    return {name: all_scores[name].fmeasure for name in metric_names}

In [None]:
import math

def calculate_perplexity(model, tokenizer, text, max_length=256):
    """
    Calculates perplexity to measure how well the model predicts the given text.

    The text is tokenized once and passed to the model as both the input and the
    labels; perplexity is exp() of the mean of the returned loss.

    Args:
        model: Fine-tuned language model
        tokenizer: Corresponding tokenizer
        text: Input text to evaluate
        max_length: Maximum sequence length for tokenization (default: 256)

    Returns:
        float: Perplexity score (lower = better, inf if calculation fails)
    """
    try:
        # Tokenize input text with truncation for consistent length
        inputs = tokenizer(text, return_tensors='tf', max_length=max_length, truncation=True)
        input_ids = inputs['input_ids']

        # Calculate loss using input as both input and target (language modeling)
        outputs = model(input_ids, labels=input_ids)

        # The loss may come back as a scalar, a one-element array, or one value per
        # token; averaging handles all three and yields a plain Python float.
        mean_loss = float(np.mean(outputs.loss.numpy()))

        # Convert cross-entropy loss to perplexity
        return math.exp(mean_loss)

    except Exception:
        # Best-effort metric: report infinity if calculation fails (e.g., empty
        # text, model errors, overflow). KeyboardInterrupt/SystemExit still propagate.
        return float('inf')

In [None]:
def evaluate_model(model, tokenizer, test_df, sample_number=10):
    """
    Evaluates model performance using multiple metrics on test data.

    A fixed-seed random sample of rows is drawn; for each row an answer is
    generated with predict_answer and compared to the reference answer with BLEU
    and ROUGE. Perplexity is computed on the reference answer by
    calculate_perplexity. Averages are printed and returned.

    Args:
        model: Fine-tuned model to evaluate
        tokenizer: Corresponding tokenizer
        test_df: Test dataset with 'question' and 'answer' columns
        sample_number: Number of samples to evaluate (default: 10); capped at the
            number of rows available in test_df

    Returns:
        dict: Average scores for BLEU, ROUGE variants, and perplexity
    """
    bleu_scores = []
    rouge_scores = []
    perplexity_scores = []

    # Sample subset of test data for evaluation with fixed random seed.
    # Never ask for more rows than exist; sampling without replacement would raise.
    sample_size = min(sample_number, len(test_df))
    test_df = test_df.sample(n=sample_size, random_state=42)

    for _, row in test_df.iterrows():
        # Generate with the same prompt/decoding settings used everywhere else
        prediction = predict_answer(row['question'], model, tokenizer)

        # Calculate and store evaluation metrics
        bleu_scores.append(calculate_bleu(row['answer'], prediction))
        rouge_scores.append(calculate_rouge(row['answer'], prediction))
        perplexity_scores.append(calculate_perplexity(model, tokenizer, row['answer']))

    # Drop failed perplexity calculations (inf) and any NaN values before averaging
    valid_perplexities = [p for p in perplexity_scores if np.isfinite(p)]

    # Calculate average scores across all samples
    avg_bleu = np.mean(bleu_scores)
    avg_rouge1 = np.mean([r['rouge1'] for r in rouge_scores])
    avg_rouge2 = np.mean([r['rouge2'] for r in rouge_scores])
    avg_rougeL = np.mean([r['rougeL'] for r in rouge_scores])
    avg_perplexity = np.mean(valid_perplexities) if valid_perplexities else float('inf')

    # Display evaluation results
    print(f"BLEU Score: {avg_bleu:.4f}")
    print(f"ROUGE-1: {avg_rouge1:.4f}")
    print(f"ROUGE-2: {avg_rouge2:.4f}")
    print(f"ROUGE-L: {avg_rougeL:.4f}")
    print(f"Perplexity: {avg_perplexity:.2f}")

    return {
        'bleu': avg_bleu,
        'rouge1': avg_rouge1,
        'rouge2': avg_rouge2,
        'rougeL': avg_rougeL,
        'perplexity': avg_perplexity
    }

In [None]:
# Evaluate on a random sample of the validation split (evaluate_model's default sample size)
metrics = evaluate_model(model, tokenizer, val_df)

In [None]:
from datetime import datetime

def save_model(model, tokenizer, name="my_model"):
    """
    Write the model and its tokenizer into one timestamped directory under ./models/.

    Args:
        model: Fine-tuned TFT5ForConditionalGeneration model
        tokenizer: Corresponding T5 tokenizer
        name: Base name for the saved model (default: "my_model")

    Returns:
        None: The directory name (name plus month/day_hour/minute stamp) is printed
    """
    # Month-day_hour-minute stamp distinguishes successive saves
    stamp = datetime.now().strftime("%m%d_%H%M")
    full_name = f"{name}_{stamp}"
    target_dir = f"./models/{full_name}"

    # Model first, then tokenizer, both into the same directory
    for artifact in (model, tokenizer):
        artifact.save_pretrained(target_dir)

    print(f"Saved: {full_name}")

# Example usage: persist the fine-tuned model and tokenizer under ./models/finance_chatbot_<timestamp>
save_model(model, tokenizer, name="finance_chatbot")