In [1]:
import pandas as pd
import numpy as np
import re
from typing import List, Dict, Tuple
import nltk
from nltk.tokenize import sent_tokenize
import logging

# Make sure the Punkt sentence-tokenizer data used by sent_tokenize is present.
# Older NLTK releases load the pickled 'punkt' models, while newer releases
# load the 'punkt_tab' package instead, so both are checked and each one is
# downloaded only when it is missing locally.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

class FinancialDataProcessor:
    """Clean, chunk, and extract headline metrics from financial-document text.

    Attributes:
        max_chunk_size: Soft upper bound, in whitespace-separated words, for
            the chunks produced by ``intelligent_chunking``.
        logger: Logger obtained from ``logging.getLogger(__name__)``.
    """

    # Scale factor for every recognised magnitude suffix (compared lower-cased).
    _UNIT_MULTIPLIERS = {
        'million': 1e6, 'millions': 1e6, 'mn': 1e6, 'm': 1e6,
        'billion': 1e9, 'billions': 1e9, 'bn': 1e9, 'b': 1e9,
    }

    # Optional separator / currency sign between a keyword and its number.
    _SEPARATOR = r'\s*[:\-]?\s*\$?'
    # A real number: digits with optional thousands commas and decimals, or a
    # bare ".5".  Must contain a digit, so stray "," / "." never match and a
    # sentence-ending period is not swallowed into the capture.
    _NUMBER = r'([0-9][0-9,]*(?:\.[0-9]+)?|\.[0-9]+)'
    # Optional magnitude suffix.  The trailing \b stops "m"/"b" from matching
    # the first letter of an unrelated following word ("500 more").
    _UNIT = r'(?:\s*(millions?|billions?|mn|bn|m|b)\b)?'

    # Leading \b keeps keywords from matching inside other words ("steps").
    _REVENUE_PATTERNS = (
        re.compile(r'\brevenues?' + _SEPARATOR + _NUMBER + _UNIT),
        re.compile(r'\bnet\s+sales' + _SEPARATOR + _NUMBER + _UNIT),
    )
    _PROFIT_PATTERNS = (
        re.compile(r'\bnet\s+incomes?' + _SEPARATOR + _NUMBER + _UNIT),
        re.compile(r'\bprofits?' + _SEPARATOR + _NUMBER + _UNIT),
    )
    _EPS_PATTERNS = (
        re.compile(r'\bearnings\s+per\s+shares?' + _SEPARATOR + _NUMBER),
        re.compile(r'\beps' + _SEPARATOR + _NUMBER),
    )

    def __init__(self, max_chunk_size: int = 512):
        """Store the chunk-size limit (in words) and set up a logger."""
        self.max_chunk_size = max_chunk_size
        self.logger = logging.getLogger(__name__)

    def clean_financial_text(self, text: str) -> str:
        """Strip markup and noise from a financial document.

        Args:
            text: Raw document text. Non-string input is tolerated.

        Returns:
            The cleaned text, or ``""`` when ``text`` is not a string.
        """
        if not isinstance(text, str):
            return ""

        # Remove HTML tags and named entities
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'&[a-zA-Z]+;', '', text)

        # Normalize financial numbers (preserve format)
        text = re.sub(r'\$\s+', '$', text)  # "$ 10" -> "$10"
        text = re.sub(r'(\d)\s+%', r'\1%', text)  # "5 %" -> "5%"

        # Collapse runs of spaces/tabs inside lines
        text = re.sub(r'[ \t]+', ' ', text)

        # Drop all-caps section-header lines.  The character set excludes
        # "\n" so the match can never span several lines (which used to delete
        # runs of short capitalised lines such as "A\nB\nC").
        text = re.sub(r'^(?:[A-Z]|[^\S\n]){3,}$', '', text, flags=re.MULTILINE)

        # Collapse blank-line runs and trim last, so the gaps left behind by
        # removed headers are cleaned up as well.
        text = re.sub(r'\n\s*\n', '\n\n', text)
        return text.strip()

    def intelligent_chunking(self, text: str, overlap: int = 50) -> List[Dict]:
        """Split text into sentence-aligned chunks with word overlap.

        The text is cleaned, split into sentences with ``sent_tokenize``, and
        sentences are packed greedily until adding the next one would exceed
        ``max_chunk_size`` words.  Each new chunk starts with the last
        ``overlap`` words of the previous one.  A single sentence longer than
        the limit is kept whole, so chunks may exceed ``max_chunk_size``.

        Args:
            text: Raw document text.
            overlap: Number of trailing words carried into the next chunk;
                zero or a negative value disables overlap.

        Returns:
            A list of dicts with keys ``'text'``, ``'length'`` (word count)
            and ``'chunk_id'`` (0-based position).
        """
        cleaned_text = self.clean_financial_text(text)
        sentences = sent_tokenize(cleaned_text)

        chunks: List[Dict] = []
        parts: List[str] = []  # overlap text + sentences of the open chunk
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence.split())

            # Close the open chunk if this sentence would push it over the limit
            if parts and current_length + sentence_length > self.max_chunk_size:
                chunk_text = " ".join(parts).strip()
                chunks.append({
                    'text': chunk_text,
                    'length': current_length,
                    'chunk_id': len(chunks)
                })

                # Seed the next chunk with the tail of the one just closed
                overlap_text = self._get_overlap_text(chunk_text, overlap)
                parts = [overlap_text] if overlap_text else []
                current_length = len(overlap_text.split())

            parts.append(sentence)
            current_length += sentence_length

        # Add final chunk
        final_text = " ".join(parts).strip()
        if final_text:
            chunks.append({
                'text': final_text,
                'length': current_length,
                'chunk_id': len(chunks)
            })

        return chunks

    def _get_overlap_text(self, text: str, overlap_words: int) -> str:
        """Return the last ``overlap_words`` words of ``text``.

        Returns ``""`` when ``overlap_words`` is zero or negative (a plain
        ``words[-0:]`` slice would return every word), and ``text`` unchanged
        when it has no more than ``overlap_words`` words.
        """
        if overlap_words <= 0:
            return ""
        words = text.split()
        if len(words) <= overlap_words:
            return text
        return " ".join(words[-overlap_words:])

    def extract_financial_metrics(self, text: str) -> Dict:
        """Extract revenue, profit and EPS figures using regex patterns.

        Matching is case-insensitive (the text is lower-cased first).  For
        each metric the first pattern that matches anywhere in the text wins.

        Args:
            text: Document text. Non-string input yields an empty result.

        Returns:
            A dict containing any of the keys ``'revenue'``, ``'profit'``
            (scaled by a million/billion suffix when present) and ``'eps'``;
            metrics that are not found are omitted.
        """
        metrics: Dict = {}
        if not isinstance(text, str):
            return metrics

        text_lower = text.lower()

        scaled_metrics = (
            ('revenue', self._REVENUE_PATTERNS),
            ('profit', self._PROFIT_PATTERNS),
        )
        for name, patterns in scaled_metrics:
            match = self._first_match(patterns, text_lower)
            if match:
                metrics[name] = self._normalize_financial_number(
                    match.group(1), match.group(2)
                )

        match = self._first_match(self._EPS_PATTERNS, text_lower)
        if match:
            # EPS is a per-share amount, so no magnitude suffix is applied
            metrics['eps'] = self._normalize_financial_number(match.group(1), None)

        return metrics

    @staticmethod
    def _first_match(patterns, text):
        """Return the match of the first pattern that hits ``text``, else None."""
        for pattern in patterns:
            match = pattern.search(text)
            if match:
                return match
        return None

    def _normalize_financial_number(self, number_str: str, unit: str) -> float:
        """Convert a captured number and optional magnitude suffix to a float.

        Args:
            number_str: Numeric text, possibly with thousands commas.
            unit: Magnitude suffix such as ``'million'``, ``'bn'`` or ``'b'``
                (case-insensitive); ``None``, empty, or unrecognised values
                leave the number unscaled.

        Returns:
            The scaled value, or ``0.0`` when ``number_str`` is not a valid
            number.
        """
        cleaned_number = number_str.replace(',', '').strip()

        # Reject strings with no digit at all (e.g. "", ".", "nan")
        if not any(c.isdigit() for c in cleaned_number):
            return 0.0

        try:
            number = float(cleaned_number)
        except ValueError:
            return 0.0  # e.g. "1.2.3"

        multiplier = self._UNIT_MULTIPLIERS.get(unit.lower(), 1.0) if unit else 1.0
        return number * multiplier



In [4]:
import json
import os

# NEW CELL IN data_processor.ipynb
# Default locations used when prepare_streamlit_data() is called with no arguments.
DEFAULT_DOCUMENT_INFO_PATH = '/Users/puchku-home/Downloads/Hackathon Project 2/financebench-main/financebench-main/data/financebench_document_information.jsonl'
DEFAULT_STREAMLIT_DATA_PATH = 'streamlit_companies_data.json'


def prepare_streamlit_data(
    input_path: str = DEFAULT_DOCUMENT_INFO_PATH,
    output_path: str = DEFAULT_STREAMLIT_DATA_PATH,
) -> dict:
    """Group JSONL document records by company and cache them as JSON.

    Each non-blank line of ``input_path`` is parsed as one JSON object and
    appended to the list for its ``'company'`` value (``'Unknown'`` when the
    key is absent).  The grouped mapping is written to ``output_path`` and
    returned.

    Args:
        input_path: JSONL file with one document record per line.
        output_path: Destination JSON file for the grouped data.

    Returns:
        A dict mapping company to its list of records, or ``{}`` if reading,
        parsing, or writing fails (the error is printed, not raised).
    """
    companies_data = {}

    # Best-effort by design: any failure is reported and yields an empty
    # result so the calling notebook cell keeps running.
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank/trailing lines are not records; json.loads would
                    # raise on them and discard everything already loaded.
                    continue
                doc = json.loads(line)
                company = doc.get('company', 'Unknown')
                companies_data.setdefault(company, []).append(doc)

        print(f"✅ Loaded data for {len(companies_data)} companies")

        # Save processed data for quick Streamlit access
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(companies_data, f, indent=2)

        return companies_data

    except Exception as e:
        print(f"❌ Error processing data: {str(e)}")
        return {}

# Run data preparation when this cell executes; streamlit_data is the grouped
# company mapping, or an empty dict if prepare_streamlit_data hit an error.
streamlit_data = prepare_streamlit_data()


✅ Loaded data for 40 companies
