In [1]:
# First cell: Setup and installations
!pip install unsloth
!pip install transformers
!pip install trl
!pip install rouge
!pip install scikit-learn

Collecting unsloth
  Downloading unsloth-2025.1.6-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.7/53.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.1.4 (from unsloth)
  Downloading unsloth_zoo-2025.1.5-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.13-py3-none-any.whl.metadata (9.4 kB)
Collecting transformers!=4.47.0,>=4.46.1 (from unsloth)
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [2]:
# Second cell: Imports and configurations
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from rouge import Rouge
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import numpy as np
import os

# Set up paths for Kaggle
OUTPUT_DIR = "/kaggle/working/finqa_finetuned"
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Third cell: Data preparation functions
finqa_prompt = """Below contains texts before table (pre-text), text after the table (post-text) and the table itself with a question that you must answer by providing only the mathematical or logical operations needed to solve the .
### Pre-text:
{}
### Table:
{}
### Post-text:
{}
### Question:
{}
### Response:
{}"""

def format_example(example):
    """Format a single example from FinQA dataset with simplified format"""
    pre_text = ' '.join(example['pre_text']) if example['pre_text'] else ""
    post_text = ' '.join(example['post_text']) if example['post_text'] else ""
    table = "\n".join(" | ".join(str(cell) for cell in row) for row in example['table']) if example['table'] else ""
    
    # Format output with operation and result on one line
    output = f"{example['program_re']} = {example['answer']}"
    
    # Format using the simplified template
    text = finqa_prompt.format(
        pre_text,
        table,
        post_text,
        example['question'],
        output
    ) + tokenizer.eos_token
    
    # Return as a dictionary with 'text' key
    return {"text": text}

def prepare_dataset():
    """Prepare FinQA dataset for fine-tuning"""
    from datasets import Dataset
    
    # Load dataset
    dataset = load_dataset("ibm/finqa", split="train", trust_remote_code=True)
    
    # Filter for valid examples
    train_data = [
        ex for ex in dataset
        if ex['program_re'] and ex['answer'] is not None  # Changed from expanded_program_re
    ]
    
    # Format examples
    formatted_examples = [format_example(ex) for ex in train_data]
    
    # Convert to Dataset object
    return Dataset.from_list(formatted_examples)

# Fourth cell: Model setup and training
# Load model and tokenizer
model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                   "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Fifth cell: Training
# Prepare data
train_data = prepare_dataset()

# Prepare training arguments
train_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    max_steps=100,
    learning_rate=2e-4,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir=OUTPUT_DIR,
    report_to="none",
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=train_args,
)

# Print initial memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Train
print("\nStarting training...")
trainer_stats = trainer.train()

# Print final stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
print(f"\nTraining completed in {trainer_stats.metrics['train_runtime']} seconds")
print(f"Peak memory usage: {used_memory} GB")
print(f"Memory used for training: {used_memory_for_lora} GB")

# Sixth cell: Save the model
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


FastLanguageModel.for_inference(model)


# Seventh cell: Evaluation functions
def format_test_input(example):
    """Format a test example for inference"""
    pre_text = ' '.join(example['pre_text']) if example['pre_text'] else ""
    post_text = ' '.join(example['post_text']) if example['post_text'] else ""
    table = "\n".join(" | ".join(str(cell) for cell in row) for row in example['table']) if example['table'] else ""
    
    return finqa_prompt.format(
        pre_text,
        table,
        post_text,
        example['question'],
        ""  # Empty response for generation
    )

def extract_operations_result(response):
    """Extract operations and result from model response"""
    # Get only the Response section
    if "### Response:" in response:
        response = response.split("### Response:")[-1].strip()
    
    # Look for the equation format: operations = result
    if "=" in response:
        parts = response.split("=", 1)
        operations = parts[0].strip()
        result = parts[1].strip() if len(parts) > 1 else None
        return operations, result
    
    return None, None

def evaluate_sample(model, tokenizer, example):
    """Evaluate model on a single example"""
    # Format input
    input_text = format_test_input(example)
    inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
    
    # Generate response
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.2,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract generated and expected parts
    generated_ops, generated_result = extract_operations_result(response)
    expected_ops = example['program_re']  # Changed from expanded_program_re
    expected_result = str(example['answer'])
    
    # Calculate metrics
    rouge = Rouge()
    try:
        rouge_scores = rouge.get_scores(
            generated_ops or "",
            expected_ops
        )[0]
    except:
        rouge_scores = {
            'rouge-1': {'f': 0.0},
            'rouge-2': {'f': 0.0},
            'rouge-l': {'f': 0.0}
        }
    
    # Calculate Cosine similarity
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([
            generated_ops or "",
            expected_ops
        ])
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    except:
        cosine_sim = 0.0
    
    return {
        'question': example['question'],
        'expected_equation': f"{expected_ops} = {expected_result}",
        'generated_equation': f"{generated_ops} = {generated_result}" if generated_ops else None,
        'rouge1_f': rouge_scores['rouge-1']['f'],
        'rouge2_f': rouge_scores['rouge-2']['f'],
        'rougeL_f': rouge_scores['rouge-l']['f'],
        'cosine_similarity': cosine_sim,
        'raw_response': response
    }

# Eighth cell: Run evaluation
def evaluate_model(num_samples=50):
    """Evaluate model on test set samples"""
    print("Loading test dataset...")
    dataset = load_dataset("ibm/finqa", split="test", trust_remote_code=True)
    
    # Filter valid examples
    valid_examples = [
        ex for ex in dataset 
        if ex['program_re'] and ex['answer'] is not None
    ]
    
    # Select random samples
    test_samples = random.sample(valid_examples, min(num_samples, len(valid_examples)))
    
    # Evaluate samples
    results = []
    total_samples = len(test_samples)
    print(f"\nEvaluating {total_samples} samples...")
    
    for idx, sample in enumerate(test_samples, 1):
        print(f"\nSample {idx}/{total_samples}")
        result = evaluate_sample(model, tokenizer, sample)
        results.append(result)
        
        # Print individual result
        print(f"Question: {result['question']}")
        print(f"Expected: {result['expected_equation']}")
        print(f"Generated: {result['generated_equation']}")
        print(f"ROUGE-1 F1: {result['rouge1_f']:.3f}")
        print(f"ROUGE-2 F1: {result['rouge2_f']:.3f}")
        print(f"ROUGE-L F1: {result['rougeL_f']:.3f}")
        print(f"Cosine Similarity: {result['cosine_similarity']:.3f}")
    
    # Calculate average metrics
    avg_metrics = {
        'rouge1_f': np.mean([r['rouge1_f'] for r in results]),
        'rouge2_f': np.mean([r['rouge2_f'] for r in results]),
        'rougeL_f': np.mean([r['rougeL_f'] for r in results]),
        'cosine_similarity': np.mean([r['cosine_similarity'] for r in results])
    }
    
    print("\nOverall Results:")
    print("=" * 50)
    print(f"Average ROUGE-1 F1: {avg_metrics['rouge1_f']:.3f}")
    print(f"Average ROUGE-2 F1: {avg_metrics['rouge2_f']:.3f}")
    print(f"Average ROUGE-L F1: {avg_metrics['rougeL_f']:.3f}")
    print(f"Average Cosine Similarity: {avg_metrics['cosine_similarity']:.3f}")
    
    return results, avg_metrics

# Ninth cell: Execute evaluation
print("Starting model evaluation...")
results, metrics = evaluate_model(num_samples=50)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

==((====))==  Unsloth 2025.1.6: Fast Qwen2 patching. Transformers: 4.48.1.
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 6.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Unsloth 2025.1.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


README.md:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

finqa.py:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/6251 [00:00<?, ? examples/s]

GPU = Tesla P100-PCIE-16GB. Max memory = 15.888 GB.
5.373 GB of memory reserved.

Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,251 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 40,370,176


Step,Training Loss
10,1.653
20,1.4101
30,1.267
40,1.2345
50,1.1911
60,1.2112
70,1.2184
80,1.2222
90,1.1864
100,1.1822



Training completed in 6535.2531 seconds
Peak memory usage: 10.15 GB
Memory used for training: 4.777 GB
Starting model evaluation...
Loading test dataset...

Evaluating 50 samples...

Sample 1/50
Question: what is the total in millions of expected cash outflow to satisfy contractual obligations and commitments as of december 31 , 2007?
Expected: table_sum(total, none) = 28809
Generated: add(619, add(1782, add(10527, add(8026, add(3828, add(3465, const_562))))) = 23827
ROUGE-1 F1: 0.000
ROUGE-2 F1: 0.000
ROUGE-L F1: 0.000
Cosine Similarity: 0.000

Sample 2/50
Question: what is the net change in aon 2019s unpaid restructuring liabilities during 2007?
Expected: add(38, -110), add(#0, 1) = -71
Generated: subtract(63, const_134) = 31
ROUGE-1 F1: 0.000
ROUGE-2 F1: 0.000
ROUGE-L F1: 0.000
Cosine Similarity: 0.000

Sample 3/50
Question: what is the percent change of the amount of collateral held for indemnified securities between 2006 and 2007?
Expected: subtract(572.93, 527.37), divide(#0, 52