In [1]:
# Fetch NLTK tokenizer data. Nothing in the visible cells uses nltk directly,
# so this is presumably required by code outside this view — TODO confirm.
import nltk
# Download the 'punkt' package into the local nltk_data directory.
nltk.download('punkt')
# Download 'punkt_tab' as well. As the final expression of the cell, the value
# returned by this call is what the notebook displays as the cell result.
nltk.download('punkt_tab')  # Original author's note: this download was missing from an earlier test

[nltk_data] Downloading package punkt to /Users/puchku-
[nltk_data]     home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/puchku-
[nltk_data]     home/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import json
import requests
import os

# ===============================================
# CONFIGURATION - Update paths and API URL below
# ===============================================

BASE_URL = "http://localhost:5000"  # FinanceBench API address

# Root of the local FinanceBench checkout. Every other path is derived from it,
# so relocating the dataset means editing this single line.
FINANCEBENCH_ROOT = '/Users/puchku-home/Downloads/Hackathon Project 2/financebench-main/financebench-main'
PDFS_DIR = os.path.join(FINANCEBENCH_ROOT, 'pdfs')
DATA_DIR = os.path.join(FINANCEBENCH_ROOT, 'data')
# Join DATA_DIR with the bare file name. Passing an absolute path as the second
# argument makes os.path.join discard DATA_DIR entirely, which would silently
# decouple this file from the directory configured above.
JSONL_FILE = os.path.join(DATA_DIR, 'financebench_open_source.jsonl')

MAX_QUESTIONS = 10 # limit for testing (set None for all questions)

# ===============================================
# 1. Initialize the FinanceBench system
# ===============================================

# POST the configured PDF and data directories to the API's /initialize
# endpoint and stop the run if the server is unreachable or rejects the request.
print("🚀 Initializing FinanceBench system...")
try:
    init_resp = requests.post(f"{BASE_URL}/initialize", json={
        "pdfs_dir": PDFS_DIR,
        "data_dir": DATA_DIR
    })
except requests.RequestException as init_error:
    # Connection refused, DNS failure, etc.: report it through the same
    # failure path as a non-200 response instead of a raw traceback.
    print("❌ Initialization failed:", init_error)
    raise SystemExit(1) from init_error

if init_resp.status_code != 200:
    print("❌ Initialization failed:", init_resp.text)
    # Raise SystemExit rather than calling exit(): exit() is an interactive
    # helper, and inside a notebook kernel it asks the kernel itself to shut
    # down instead of merely halting this cell.
    raise SystemExit(1)
else:
    print("✅ Initialization successful:", init_resp.json().get("message"))

# ===============================================
# 2. Load questions from JSONL file
# ===============================================

# Read one JSON object per line and keep only the fields the evaluation uses.
# Fields absent from a record are stored as None (dict.get default).
questions = []
print(f"📂 Loading questions from {JSONL_FILE}...")
with open(JSONL_FILE, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would raise on them.
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as parse_error:
            # Re-raise with the file and line number so a corrupt record is
            # easy to locate; JSONDecodeError is itself a ValueError.
            raise ValueError(
                f"{JSONL_FILE}:{line_no}: invalid JSON ({parse_error})"
            ) from parse_error
        questions.append({
            "company": obj.get("company"),
            "question": obj.get("question"),
            "gold_answer": obj.get("answer"),
            "doc_name": obj.get("doc_name")
        })

# A falsy MAX_QUESTIONS (None or 0) means "use every question".
if MAX_QUESTIONS:
    questions = questions[:MAX_QUESTIONS]

print(f"✅ Loaded {len(questions)} questions for testing.")

# ===============================================
# 3. Send questions to /ask and evaluate
# ===============================================

# Each question is POSTed to /ask and scored by case-insensitive substring
# containment in either direction. Only questions that receive a 200 response
# and parse successfully count towards `total`; failed requests are reported
# but excluded from the accuracy denominator.
correct = 0
total = 0

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# forever on an unresponsive server and the whole run hangs.
ASK_TIMEOUT = (10, 300)

print("\n🔍 Running QA tests...\n")
for i, q in enumerate(questions, start=1):
    payload = {
        "question": q["question"],
        "company": q["company"],
        "top_k": 5
    }

    try:
        resp = requests.post(f"{BASE_URL}/ask", json=payload, timeout=ASK_TIMEOUT)
        if resp.status_code != 200:
            print(f"[{i}] ❌ Request failed:", resp.text)
            continue

        result = resp.json()
        # Treat a missing or null answer / gold value as an empty string so
        # .strip() cannot fail on None.
        predicted = str(result.get('answer') or '').strip()
        gold = str(q["gold_answer"] or '').strip()
        # Coerce confidence up front so the ':.3f' format below cannot raise
        # after the question has already been counted.
        try:
            confidence = float(result.get('confidence') or 0)
        except (TypeError, ValueError):
            confidence = 0.0

        # Simple string matching for evaluation. Both sides must be non-empty:
        # the empty string is a substring of every string, so an empty
        # prediction (or empty gold answer) would otherwise always be "correct".
        gold_norm = gold.lower()
        predicted_norm = predicted.lower()
        is_correct = bool(gold_norm) and bool(predicted_norm) and (
            gold_norm in predicted_norm or predicted_norm in gold_norm
        )
        if is_correct:
            correct += 1

        total += 1

        print(f"[{i}] Q: {q['question']}")
        print(f"    📄 Company: {q['company']} | Doc: {q['doc_name']}")
        print(f"    🤖 Predicted: {predicted} (conf={confidence:.3f})")
        print(f"    🎯 Gold: {gold}")
        print(f"    ✅ Correct: {is_correct}")
        print("-" * 90)

    except Exception as e:
        # Deliberately best-effort: one bad question (timeout, non-JSON body,
        # unexpected payload shape) must not abort the rest of the run.
        print(f"[{i}] ⚠️ Error: {e}")

# ===============================================
# 4. Final Accuracy
# ===============================================

# Report the tally gathered by the QA loop. The empty-run case is handled
# first so the percentage is never computed with a zero denominator.
if total <= 0:
    print("⚠️ No questions processed.")
else:
    accuracy = (correct / total) * 100
    # Emit the four summary lines in a single write; the text is unchanged.
    print("\n".join((
        f"\n📊 Finished {total} questions",
        f"✅ Correct: {correct}",
        f"❌ Incorrect: {total - correct}",
        f"🎯 Accuracy: {accuracy:.2f}%",
    )))
