In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "microsoft/Phi-3-mini-4k-instruct"  # 3.8B model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")  # Auto maps to GPU if available

print("Phi-3 (3.8B) model loaded successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Phi-3 (3.8B) model loaded successfully!


In [2]:
import json
import pandas as pd

# Load the dataset
def load_finqa_dataset(path):
    with open(path, "r") as f:
        data = json.load(f)
    return data

# Load training, validation, and test sets
train_data = load_finqa_dataset("/cs/student/projects2/aisd/2024/giliev/FinQA/dataset/train.json")
dev_data = load_finqa_dataset("/cs/student/projects2/aisd/2024/giliev/FinQA/dataset/dev.json")
test_data = load_finqa_dataset("/cs/student/projects2/aisd/2024/giliev/FinQA/dataset/test.json")

# Convert to DataFrame for easier handling
df_test = pd.DataFrame(test_data)
print(df_test.head())


                                            pre_text  \
0  [entergy corporation and subsidiaries manageme...   
1  [item 1b ., unresolved staff comments not appl...   
2  [undesignated hedges was $ 41.2 million and $ ...   
3  [chairman and a director of the board of fis a...   
4  [performance graph the table below compares th...   

                                           post_text               filename  \
0  [the retail electric price variance is primari...   ETR/2016/page_23.pdf   
1  [1 leases on portions of the land used for the...  INTC/2015/page_41.pdf   
2  [the amounts earned and owed under the swap ag...   ADI/2011/page_61.pdf   
3  [we recorded a preliminary allocation of the p...   FIS/2010/page_70.pdf   
4  [$ 50.00 $ 100.00 $ 150.00 $ 200.00 $ 250.00 $...   MAS/2017/page_27.pdf   

                                           table_ori  \
0  [[, Amount (In Millions)], [2014 net revenue, ...   
1  [[(Square Feet in Millions), UnitedStates, Oth...   
2  [[Statement of In

In [3]:
import json

# Print an example entry from test_data
print(json.dumps(test_data[0], indent=4))


{
    "pre_text": [
        "entergy corporation and subsidiaries management 2019s financial discussion and analysis a result of the entergy louisiana and entergy gulf states louisiana business combination , results of operations for 2015 also include two items that occurred in october 2015 : 1 ) a deferred tax asset and resulting net increase in tax basis of approximately $ 334 million and 2 ) a regulatory liability of $ 107 million ( $ 66 million net-of-tax ) as a result of customer credits to be realized by electric customers of entergy louisiana , consistent with the terms of the stipulated settlement in the business combination proceeding .",
        "see note 2 to the financial statements for further discussion of the business combination and customer credits .",
        "results of operations for 2015 also include the sale in december 2015 of the 583 mw rhode island state energy center for a realized gain of $ 154 million ( $ 100 million net-of-tax ) on the sale and the $ 77 mil

In [4]:
def preprocess_example(example):
    """Prepare structured input for Phi-3"""
    # Extract question and relevant information from table
    question = example["qa"].get("question", "No question available.")
    table = example.get("table", [])
    table_str = "\n".join([" | ".join(row) for row in table])
    
    # Extract text context
    pre_text = " ".join(example.get("pre_text", []))
    post_text = " ".join(example.get("post_text", []))
    
    # Get the gold indices for relevant information
    gold_inds = example["qa"].get("gold_inds", {})
    relevant_info = "\n".join(gold_inds.values())
    
    # Get the mathematical program if available
    program = example["qa"].get("program", "")
    
    # Structured format with explicit instruction
    input_text = (
        "You are a financial AI assistant. Answer the following question using only numbers or percentages.\n"
        "Rules:\n"
        "1. Extract numerical values from the context and table\n"
        "2. Perform any necessary calculations\n"
        "3. Return ONLY the final number or percentage\n"
        "4. Do not include any explanatory text\n\n"
        f"Relevant Information:\n{relevant_info}\n\n"
        f"Table Data:\n{table_str}\n\n"
        f"Question: {question}\n"
        f"Mathematical Operation: {program}\n"
        "Answer (number/percentage only): "
    )

    return tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=2048,  # Increased context length
        return_tensors="pt"
    )

In [13]:
import re

def extract_answer(response_text):
    """Extract the most relevant numerical value."""
    matches = re.findall(r"[-+]?\d*\.?\d+%?", response_text.strip())
    
    if matches:
        return matches[0]  # Return the first match (most likely correct)
    return "Not Found"


In [5]:
def generate_answer(example):
    """Generate an answer using the Phi-3 model"""
    inputs = preprocess_example(example)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=10,  # Slightly increased to allow for decimal numbers
            do_sample=False,    # Deterministic output
            temperature=0.1,    # Very slight randomness to avoid repetition
            top_p=0.95,        # Nuclear sampling
            num_beams=3,       # Beam search for better answer selection
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return clean_answer(generated_text)

In [15]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load the model with 4-bit quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
def clean_answer(text):
    """Clean and extract numerical answer"""
    # Remove everything before the last colon if present
    if ':' in text:
        text = text.split(':')[-1]
    
    # First try to find percentage values
    percent_match = re.search(r'[-+]?\d*\.?\d+\s*%', text)
    if percent_match:
        return percent_match.group(0)
    
    # Then try to find decimal numbers
    decimal_match = re.search(r'[-+]?\d*\.?\d+', text)
    if decimal_match:
        return decimal_match.group(0)
    
    # If no numerical value found, return original cleaned text
    return text.strip()

In [2]:
import json
import re
import torch
import pandas as pd
import numpy as np
from datasets import load_metric
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Model initialization
def initialize_model():
    model_name = "microsoft/Phi-3-mini-4k-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Initialize with 4-bit quantization for better memory efficiency
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        quantization_config=quantization_config
    )
    return model, tokenizer

# Data loading
def load_finqa_dataset(path):
    with open(path, "r") as f:
        return json.load(f)

def preprocess_example(example, tokenizer):
    """Prepare structured input for Phi-3"""
    question = example["qa"].get("question", "No question available.")
    table = example.get("table", [])
    table_str = "\n".join([" | ".join(row) for row in table])
    
    pre_text = " ".join(example.get("pre_text", []))
    post_text = " ".join(example.get("post_text", []))
    
    gold_inds = example["qa"].get("gold_inds", {})
    relevant_info = "\n".join(gold_inds.values())
    program = example["qa"].get("program", "")
    
    input_text = (
        "You are a financial calculator. Follow these steps:\n"
        "1. Read the question carefully\n"
        "2. Look at the relevant information and table data\n"
        "3. Follow the mathematical operation exactly\n"
        "4. Return ONLY the final numerical answer with no text\n\n"
        f"Relevant Information:\n{relevant_info}\n\n"
        f"Table Data:\n{table_str}\n\n"
        f"Question: {question}\n"
        f"Mathematical Operation: {program}\n"
        "Final Answer (number only): "
    )

    return tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=2048,
        return_tensors="pt"
    )

def clean_answer(text):
    """Extract and format numerical answer"""
    if ':' in text:
        text = text.split(':')[-1]
    
    # Handle percentages first
    percent_match = re.search(r'[-+]?\d*\.?\d+\s*%?', text)
    if percent_match:
        number = float(percent_match.group(0).replace('%', '').strip())
        # If the number is small (likely decimal), convert to percentage
        if number < 1:
            number *= 100
        # Round to one decimal place and add % symbol
        return f"{round(number, 1)}%"
    
    # Handle regular numbers
    decimal_match = re.search(r'[-+]?\d*\.?\d+', text)
    if decimal_match:
        number = float(decimal_match.group(0))
        # If it's close to an integer, round it
        if abs(round(number) - number) < 0.01:
            return str(round(number))
        # Otherwise, round to one decimal place
        return str(round(number, 1))
    
    # Handle yes/no answers
    text = text.lower().strip()
    if 'yes' in text or 'true' in text:
        return 'yes'
    if 'no' in text or 'false' in text:
        return 'no'
    
    return text.strip()

def generate_answer(example, model, tokenizer):
    """Generate answer using Phi-3"""
    inputs = preprocess_example(example, tokenizer)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
            num_beams=5,
            temperature=0.1,
            top_p=0.95,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return clean_answer(generated_text)

def evaluate_model(test_data, model, tokenizer, num_samples=10):
    """Evaluate model performance"""
    metric = load_metric("squad")
    predictions = []
    references = []
    
    torch.cuda.empty_cache()
    
    for i, example in enumerate(test_data[:num_samples]):
        try:
            pred_text = generate_answer(example, model, tokenizer)
            true_text = example["qa"]["answer"]
            
            predictions.append({"id": str(i), "prediction_text": pred_text})
            references.append({"id": str(i), "answers": {"text": [true_text], "answer_start": [0]}})
            
            em = 1 if pred_text.strip() == true_text.strip() else 0
            f1 = metric.compute(
                predictions=[{"id": str(i), "prediction_text": pred_text}],
                references=[{"id": str(i), "answers": {"text": [true_text], "answer_start": [0]}}]
            )["f1"]
            
            print(f"\n🔹 Example {i+1}")
            print(f"❓ Question: {example['qa']['question']}")
            print(f"✅ Ground Truth: {true_text}")
            print(f"🤖 Prediction: {pred_text}")
            print(f"📊 Metrics - Exact Match: {em}, F1: {f1:.2f}")
            
        except Exception as e:
            print(f"Error processing example {i}: {str(e)}")
            continue
    
    results = metric.compute(predictions=predictions, references=references)
    print("\n📊 Overall Results:", results)
    return results

def main():
    # Initialize model and tokenizer
    model, tokenizer = initialize_model()
    print("Model initialized successfully!")
    
    # Load datasets
    test_data = load_finqa_dataset("/cs/student/projects2/aisd/2024/giliev/FinQA/dataset/test.json")
    print("Dataset loaded successfully!")
    
    # Evaluate model
    results = evaluate_model(test_data, model, tokenizer, num_samples=10)
    
    # Save results
    with open("evaluation_results.json", "w") as f:
        json.dump(results, f, indent=4)

if __name__ == "__main__":
    main()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model initialized successfully!
Dataset loaded successfully!


  metric = load_metric("squad")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.



🔹 Example 1
❓ Question: what is the net change in net revenue during 2015 for entergy corporation?
✅ Ground Truth: 94
🤖 Prediction: 94.0%
📊 Metrics - Exact Match: 0, F1: 0.00





🔹 Example 2
❓ Question: what percentage of total facilities as measured in square feet are leased?
✅ Ground Truth: 14%
🤖 Prediction: 14.2%
📊 Metrics - Exact Match: 0, F1: 0.00

🔹 Example 3
❓ Question: what is the percentage change in cash flow hedges in 2011 compare to the 2010?
✅ Ground Truth: 9.9%
🤖 Prediction: 9.1%
📊 Metrics - Exact Match: 0, F1: 0.00

🔹 Example 4
❓ Question: what portion of total purchase price is related to stock awards?
✅ Ground Truth: 2.9%
🤖 Prediction: 2.9%
📊 Metrics - Exact Match: 1, F1: 100.00

🔹 Example 5
❓ Question: what was the difference in percentage cumulative total shareholder return on masco common stock versus the s&p 500 index for the five year period ended 2017?
✅ Ground Truth: 
🤖 Prediction: 0.0%
📊 Metrics - Exact Match: 0, F1: 0.00

🔹 Example 6
❓ Question: what was the percentage change in total rental expense under operating leases from july 2 , 2005 to july 1 , 2006?
✅ Ground Truth: 7%
🤖 Prediction: 6.7%
📊 Metrics - Exact Match: 0, F1: 0.00

🔹