# In [2]:
import pandas as pd
import PyPDF2
import fitz  # PyMuPDF
import os
import json
from typing import List, Dict, Tuple
import logging
import re
from pathlib import Path

class FinanceBenchProcessor:
    """Process FinanceBench PDFs and questions.

    The constructor reads two JSON-lines files from ``data_dir`` and joins
    them on ``doc_name``; PDFs are looked up in ``pdfs_dir`` by document name.

    Attributes:
        pdfs_dir: Directory searched for ``<doc_name>*.pdf`` files.
        data_dir: Directory containing the two ``.jsonl`` files.
        logger: Logger named after this module.
        questions_df: Rows of ``financebench_open_source.jsonl``.
        meta_df: Rows of ``financebench_document_information.jsonl``.
        full_df: Inner join of ``questions_df`` and ``meta_df`` on
            ``doc_name``. Question columns keep their names; metadata
            columns that collide with them get a ``_meta`` suffix.
    """

    # Columns copied into every question record, in output order.
    _QUESTION_FIELDS = (
        'financebench_id',
        'question',
        'answer',
        'question_type',
        'question_reasoning',
        'company',
        'evidence',
        'justification',
    )

    def __init__(self, pdfs_dir: str, data_dir: str):
        """Load both dataset tables and build the joined frame.

        Args:
            pdfs_dir: Directory that holds the source PDF files.
            data_dir: Directory that holds the FinanceBench ``.jsonl`` files.

        Raises:
            Whatever ``pandas.read_json`` raises when a file is missing or
            malformed; nothing is caught here.
        """
        self.pdfs_dir = Path(pdfs_dir)
        self.data_dir = Path(data_dir)
        self.logger = logging.getLogger(__name__)

        # Load the dataset
        self.questions_df = pd.read_json(
            self.data_dir / "financebench_open_source.jsonl",
            lines=True,
        )
        self.meta_df = pd.read_json(
            self.data_dir / "financebench_document_information.jsonl",
            lines=True,
        )

        # Merge questions with metadata. With the default suffixes any column
        # present in both tables is renamed to `<col>_x` / `<col>_y`, which
        # would make `row['company']` in get_questions_for_document raise
        # KeyError. Keeping the question-side name intact avoids that.
        # NOTE(review): both files appear to carry a `company` column --
        # confirm against the dataset.
        self.full_df = pd.merge(
            self.questions_df,
            self.meta_df,
            on="doc_name",
            suffixes=("", "_meta"),
        )

    def extract_pdf_text(self, pdf_path: str) -> str:
        """Extract text from a PDF using PyMuPDF.

        Each page's text is preceded by a ``--- Page N ---`` marker line
        (1-based page numbers).

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The concatenated text of all pages, or ``""`` if the file could
            not be opened or read (the error is logged, not raised).
        """
        try:
            # The context manager closes the document even when a page
            # fails to load or extract part-way through.
            with fitz.open(pdf_path) as doc:
                parts = []
                for page_num, page in enumerate(doc, start=1):
                    parts.append(f"\n--- Page {page_num} ---\n")
                    parts.append(page.get_text())
            return "".join(parts)

        except Exception as e:
            # Deliberately best-effort: one unreadable PDF must not abort a
            # whole batch load, so report it and return an empty result.
            self.logger.error("Error extracting PDF %s: %s", pdf_path, e)
            return ""

    def _find_pdf(self, doc_name: str) -> "Path | None":
        """Return the PDF for *doc_name* in ``pdfs_dir``, or ``None``.

        An exact ``<doc_name>.pdf`` wins; otherwise the alphabetically first
        ``<doc_name>*.pdf`` is used so the choice does not depend on
        directory listing order.
        """
        exact = self.pdfs_dir / f"{doc_name}.pdf"
        if exact.is_file():
            return exact
        candidates = sorted(self.pdfs_dir.glob(f"{doc_name}*.pdf"))
        return candidates[0] if candidates else None

    def load_all_documents(self) -> Dict[str, Dict]:
        """Load all PDF documents and their metadata.

        Iterates over ``meta_df``; documents whose PDF is missing or yields
        no text are skipped (a missing PDF is logged as a warning).

        Returns:
            Mapping of ``doc_name`` to a dict with keys ``doc_name``,
            ``company``, ``doc_type``, ``doc_period``, ``gics_sector``,
            ``text`` and ``pdf_path``.
        """
        documents = {}

        for _, row in self.meta_df.iterrows():
            doc_name = row['doc_name']

            # Find corresponding PDF file
            pdf_path = self._find_pdf(doc_name)
            if pdf_path is None:
                self.logger.warning("PDF not found for %s", doc_name)
                continue

            self.logger.info("Processing %s", pdf_path)

            # Extract text; an empty string means extraction failed or the
            # PDF had no pages, and such documents are left out.
            text = self.extract_pdf_text(str(pdf_path))
            if not text:
                continue

            documents[doc_name] = {
                'doc_name': doc_name,
                'company': row['company'],
                'doc_type': row['doc_type'],
                'doc_period': row['doc_period'],
                'gics_sector': row['gics_sector'],
                'text': text,
                'pdf_path': str(pdf_path),
            }

        self.logger.info("Loaded %d documents", len(documents))
        return documents

    @staticmethod
    def _rows_to_records(frame, fields) -> List[Dict]:
        """Return one dict per row of *frame*, holding only *fields*."""
        return [
            {field: row[field] for field in fields}
            for _, row in frame.iterrows()
        ]

    def get_questions_for_document(self, doc_name: str) -> List[Dict]:
        """Get all questions for a specific document.

        Only questions present in ``full_df`` (i.e. whose document also has
        a metadata row) are returned.

        Args:
            doc_name: Document identifier to filter on.

        Returns:
            One dict per matching question; empty list when none match.
        """
        doc_questions = self.full_df[self.full_df['doc_name'] == doc_name]
        return self._rows_to_records(doc_questions, self._QUESTION_FIELDS)

    def get_all_questions(self) -> List[Dict]:
        """Get all questions from the dataset.

        Returns:
            One dict per row of ``questions_df``; same keys as
            ``get_questions_for_document`` plus ``doc_name``.
        """
        # Insert 'doc_name' right after 'company' to keep the original
        # key order of the returned records.
        split_at = self._QUESTION_FIELDS.index('company') + 1
        fields = (
            self._QUESTION_FIELDS[:split_at]
            + ('doc_name',)
            + self._QUESTION_FIELDS[split_at:]
        )
        return self._rows_to_records(self.questions_df, fields)

# Notebook-cell status message: printed once, at module level, after the
# class definition above has been executed.
print("✅ PDF Processor ready for Streamlit integration")

# Output: ✅ PDF Processor ready for Streamlit integration
