In [None]:
# Install full-stack dependencies
!pip install -q edgartools sentence-transformers torch pandas flask sec-cik-mapper google-genai asyncpg python-dotenv langchain langchain-community



####..

##### Chunking+Util+Routing

In [None]:
from edgar import *
import re
import json
from flask import Flask, request, jsonify
import torch
import pandas as pd
import os
import jsonpickle
from dataclasses import dataclass
from typing import List, Tuple, Callable, Dict, Optional
from dateutil.parser import parse as dtparse
from datetime import date, datetime
from functools import lru_cache

@dataclass
class FormChunk:
    """A contiguous slice of a filing (or attachment) plus its metadata.

    ``filing_date`` and ``parent_filing_date`` have no default and must be
    supplied by the caller.  ``char_count`` and ``chunk_id`` are filled in by
    ``__post_init__`` when they are left at their empty defaults.
    """
    # NOTE(review): visible callers pass a string or None for both dates.
    filing_date: Optional[str]
    parent_filing_date: Optional[str]
    form_type: str
    section_type: str
    section_number: str
    section_title: str
    content: str
    start_pos: int
    end_pos: int
    cik: str = ""
    ticker: str = ""
    fiscal_year: Optional[int] = None
    fiscal_quarter: Optional[int] = None
    chunk_id: str = ""
    content_type: str = ""
    char_count: int = 0
    filing_url: str = ""
    document_url: str = ""
    parent_form_type: Optional[str] = None
    attachment_number: str = ""
    attachment_description: str = ""
    is_attachment: bool = False
    attachment_type: str = ""

    def __post_init__(self):
        # Derive the fields the caller did not provide explicitly.
        if not self.char_count:
            self.char_count = len(self.content)
        if not self.chunk_id:
            self._generate_chunk_id()

    @staticmethod
    def _stable_bucket(value: str) -> int:
        """Map *value* to 0..9999 using CRC-32.

        The builtin ``hash()`` of a string is salted per interpreter process,
        so it cannot be used for identifiers that must be reproducible.
        """
        import zlib  # local import keeps this block self-contained

        return zlib.crc32(value.encode("utf-8", "backslashreplace")) % 10000

    def _generate_chunk_id(self):
        """Build ``chunk_id`` from the identifier, form, year, section and content."""
        identifier = self.cik if self.cik else self.ticker
        base = f"{identifier}_{self.form_type}_{self.fiscal_year or 'UNK'}"
        content_bucket = self._stable_bucket(self.content)
        if self.is_attachment:
            # Use the attachment number when known, else a digest of the URL.
            attachment_id = self.attachment_number or self._stable_bucket(self.document_url)
            self.chunk_id = f"{base}_ATT{attachment_id}_{content_bucket:04d}"
        else:
            self.chunk_id = f"{base}_{self.section_number}_{content_bucket:04d}"


def _pre(text: str) -> str:
    return text.replace("\r\n", "\n")

_pat_10k = re.compile(
    r'(?i)(?:^|\n)\s*'
    r'(?:(PART\s+[IVX]+)\s*[\.\-]?\s*([^\n\r]*?)(?:\n|$))'
    r'|(?:^|\n)\s*(?:ITEM\s+(\d+[A-Z]?)\s*[\.\-]?\s*([^\n\r]+))',
    re.MULTILINE)

def _parse_10k(m: re.Match) -> Tuple[str, str, str]:
    if m.group(1):
        return "PART", m.group(1), (m.group(2) or "").strip()
    return "ITEM", m.group(3), (m.group(4) or "").strip()

_pat_10q = re.compile(
    r'(?i)(?:^|\n)\s*'
    r'(?:(PART\s+[IVX]+)\s*[\.\-]?\s*([^\n\r]*?)(?:\n|$))'
    r'|(?:^|\n)\s*(?:ITEM\s+(\d+(?:\-[A-Z])?)\s*[\.\-]?\s*([^\n\r]+))',
    re.MULTILINE)

def _parse_10q(m):
    if m.group(1):
        return "PART", m.group(1), (m.group(2) or "").strip()
    return "ITEM", m.group(3), (m.group(4) or "").strip()

_pat_8k = re.compile(
    r'(?i)(?:^|\n)\s*'
    r'(?:ITEM\s+(\d+(?:\.\d+)?[A-Z]?)\s*[\.\-]?\s*([^\n\r]+))'
    r'|(?:^|\n)\s*(SIGNATURE[S]?)\s*$'
    , re.MULTILINE)

def _parse_8k(m):
    if m.group(1):
        return "ITEM", m.group(1), (m.group(2) or "").strip()
    return "SIGNATURE", "", m.group(3).strip()

_pat_345 = re.compile(
    r'(?i)(?:^|\n)\s*'
    r'(?:(TABLE\s+[IVX]+)\s*[\.\-]?\s*([^\n\r]*?)(?:\n|$))'
    r'|(?:^|\n)\s*(SIGNATURE[S]?)\s*$'
    r'|(?:^|\n)\s*(EXPLANATION\s+OF\s+RESPONSES?)\s*$',
    re.MULTILINE)

def _parse_345(m):
    if m.group(1):
        return "TABLE", m.group(1), (m.group(2) or "").strip()
    if m.group(3):
        return "SIGNATURE", "", m.group(3).strip()
    return "EXPLANATION", "", m.group(4).strip()

_pat_def14a = re.compile(
    r'(?i)(?:^|\n)\s*'
    r'(?:(PROPOSAL\s+(?:NO\.\s*)?\d+)\s*[\.\-]?\s*([^\n\r]+))'
    r'|(?:^|\n)\s*(?:(ITEM\s+\d+)\s*[\.\-]?\s*([^\n\r]+))'
    r'|(?:^|\n)\s*(EXECUTIVE\s+COMPENSATION|CORPORATE\s+GOVERNANCE|'
    r'DIRECTOR\s+NOMINEES?|SECURITY\s+OWNERSHIP)\s*$',
    re.MULTILINE)

def _parse_def14a(m):
    if m.group(1):
        return "PROPOSAL", m.group(1), (m.group(2) or "").strip()
    if m.group(3):
        return "ITEM", m.group(3), (m.group(4) or "").strip()
    return "GOVERNANCE", "", m.group(5).strip()


# Normalised form type -> (heading pattern, match parser).
_REGISTRY: Dict[str, Tuple[re.Pattern, Callable[[re.Match], Tuple[str, str, str]]]] = {
    "10-K": (_pat_10k, _parse_10k),
    "10-Q": (_pat_10q, _parse_10q),
    "8-K": (_pat_8k, _parse_8k),
    # Forms 3, 4 and 5 share one pattern/parser pair.
    **dict.fromkeys(("3", "4", "5"), (_pat_345, _parse_345)),
    "DEF14A": (_pat_def14a, _parse_def14a),
}

_pat_exhibit = re.compile(
    r'(?i)(?:^|\n)\s*'
    r'(?:EXHIBIT\s+(\d+(?:\.\d+)?[A-Z]?)\s*[\.\-]?\s*([^\n\r]*?)(?:\n|$))'
    r'|(?:^|\n)\s*(SIGNATURE[S]?)\s*$',
    re.MULTILINE)

def _parse_exhibit(m):
    if m.group(1):
        return "EXHIBIT", m.group(1), (m.group(2) or "").strip()
    return "SIGNATURE", "", m.group(3).strip()

_pat_attachment = re.compile(
    r'(?i)(?:^|\n)\s*'
    r'(?:(SCHEDULE\s+[A-Z0-9]+)\s*[\.\-]?\s*([^\n\r]*?)(?:\n|$))'
    r'|(?:^|\n)\s*(?:(FORM\s+[A-Z0-9\-]+)\s*[\.\-]?\s*([^\n\r]*?)(?:\n|$))'
    r'|(?:^|\n)\s*(SIGNATURE[S]?)\s*$',
    re.MULTILINE)

def _parse_attachment(m):
    if m.group(1):
        return "SCHEDULE", m.group(1), (m.group(2) or "").strip()
    if m.group(3):
        return "FORM", m.group(3), (m.group(4) or "").strip()
    return "SIGNATURE", "", m.group(5).strip()

# Register the two attachment-oriented rule sets alongside the form rules.
_REGISTRY.update({
    "EXHIBIT": (_pat_exhibit, _parse_exhibit),
    "ATTACHMENT": (_pat_attachment, _parse_attachment),
})


def chunk_form(text: str, form_type: str, **metadata) -> List[FormChunk]:
    """
    Split a filing's text into section chunks using the form's heading pattern.

    Args:
        text: Document text (CRLF line endings are normalised first).
        form_type: SEC form type; upper-cased and stripped of spaces before
            the registry lookup.
        **metadata: Optional values copied onto every chunk: cik, ticker,
            filing_date, fiscal_year, fiscal_quarter, filing_url,
            document_url, is_attachment, attachment_type, parent_form_type,
            parent_filing_date. Other keys are ignored.

    Returns:
        Chunks in document order: an optional "HEADER" chunk for text before
        the first heading, then one chunk per heading. If no heading matches,
        a single "DOCUMENT" chunk covering the whole text.

    Raises:
        ValueError: if the normalised form type is not in the registry.
    """
    form_type = form_type.upper().replace(" ", "")
    if form_type not in _REGISTRY:
        raise ValueError(f"Unsupported form: {form_type}")

    pattern, parser = _REGISTRY[form_type]
    text = _pre(text)
    matches = list(pattern.finditer(text))

    # Keyword arguments that are identical for every chunk of this document.
    shared = dict(
        form_type=form_type,
        cik=metadata.get('cik', ''),
        ticker=metadata.get('ticker', ''),
        filing_date=metadata.get('filing_date', ''),
        fiscal_year=metadata.get('fiscal_year'),
        fiscal_quarter=metadata.get('fiscal_quarter'),
        filing_url=metadata.get('filing_url', ''),
        document_url=metadata.get('document_url', ''),
        is_attachment=metadata.get('is_attachment', False),
        attachment_type=metadata.get('attachment_type', ''),
        parent_form_type=metadata.get('parent_form_type'),
        parent_filing_date=metadata.get('parent_filing_date'),
    )

    # No recognisable heading: return the whole text as one chunk.
    if not matches:
        return [
            FormChunk(
                section_type="DOCUMENT",
                section_number="",
                section_title="Entire Document",
                content=text,
                start_pos=0,
                end_pos=len(text),
                **shared,
            )
        ]

    # boundaries[i] is where section i starts; the final entry closes the last one.
    boundaries = [found.start() for found in matches]
    boundaries.append(len(text))

    chunks: List[FormChunk] = []

    first_heading = boundaries[0]
    if first_heading > 0:
        chunks.append(
            FormChunk(
                section_type="HEADER",
                section_number="",
                section_title="Document Header",
                content=text[:first_heading],
                start_pos=0,
                end_pos=first_heading,
                **shared,
            )
        )

    for index, found in enumerate(matches):
        section_type, section_number, section_title = parser(found)
        begin, finish = boundaries[index], boundaries[index + 1]

        piece = FormChunk(
            section_type=section_type,
            section_number=section_number,
            section_title=section_title,
            content=text[begin:finish],
            start_pos=begin,
            end_pos=finish,
            content_type=_infer_content_type(section_type, section_number, section_title),
            **shared,
        )

        # Set after construction, so the generated chunk_id is not affected.
        if shared["is_attachment"]:
            piece.attachment_number = section_number
            piece.attachment_description = section_title

        chunks.append(piece)

    return chunks

def _infer_content_type(section_type: str, section_number: str, section_title: str) -> str:
    """Infer content type from section information."""
    title_lower = section_title.lower()

    # 10-K/10-Q specific mappings
    if section_type == "ITEM":
        if "1a" in section_number.lower() or "risk" in title_lower:
            return "risk_factors"
        elif section_number == "1" or "business" in title_lower:
            return "business_overview"
        elif "2" in section_number or "properties" in title_lower:
            return "properties"
        elif "7" in section_number or "md&a" in title_lower:
            return "mda"
    elif section_type == "EXHIBIT":
        return "exhibit"
    elif section_type == "SCHEDULE":
        return "schedule"
    elif section_type == "FORM":
        return "form"
    elif section_type == "SIGNATURE":
        return "signature"

    return "general"

def _determine_attachment_type(attachment) -> str:
    """Determine the type of attachment based on document properties."""
    doc_name = getattr(attachment, 'document', '').upper()
    description = getattr(attachment, 'description', '').upper()

    # Check document name patterns first (most reliable)
    if doc_name.startswith('EX-'):
        return "exhibit"
    elif 'SCHEDULE' in doc_name:
        return "schedule"
    elif any(form in doc_name for form in ['10-', '8-K', 'DEF', 'FORM']):
        return "form"

    # Fallback to description if document name isn't clear
    if 'EXHIBIT' in description:
        return "exhibit"
    elif 'SCHEDULE' in description:
        return "schedule"

    return "attachment"

def process_filing_attachments(filing: object, base_metadata: Dict) -> List[FormChunk]:
    """
    Build one whole-document chunk for every useful attachment of *filing*.

    Args:
        filing: Object exposing an ``attachments`` iterable; without that
            attribute an empty list is returned.
        base_metadata: Metadata of the parent filing. Its ``form_type``
            becomes each chunk's ``parent_form_type``. The dict is NOT
            modified, so the caller can keep using its ``form_type`` entry.

    Returns:
        One ``FormChunk`` per attachment that has at least 50 characters of
        stripped text. An attachment whose processing raises is reported on
        stdout and skipped.
    """
    all_chunks: List[FormChunk] = []

    if not hasattr(filing, 'attachments'):
        return all_chunks

    # Work on a copy: popping 'form_type' from the caller's dict would break
    # any later use of that key by the caller.
    parent_form_type = base_metadata.get('form_type')
    shared_metadata = {key: value for key, value in base_metadata.items() if key != 'form_type'}

    for attachment in filing.attachments:
        # Skip attachments with no declared purpose unless they are .htm files.
        if (getattr(attachment, 'purpose', None) is None
                and getattr(attachment, 'extension', None) != ".htm"):
            continue

        try:
            attachment_text = attachment.text()
            if not attachment_text or len(attachment_text.strip()) < 50:
                continue

            attachment_type = _determine_attachment_type(attachment)

            attachment_metadata = dict(shared_metadata)
            attachment_metadata.update({
                'is_attachment': True,
                'attachment_type': attachment_type,
                'parent_form_type': parent_form_type,
                'parent_filing_date': shared_metadata.get('filing_date'),
                'document_url': getattr(attachment, 'url', ''),
            })

            all_chunks.append(FormChunk(
                form_type=attachment_type.upper(),
                section_type="DOCUMENT",
                section_number="",
                # A description attribute may exist but be None/empty.
                section_title=getattr(attachment, 'description', None) or 'Attachment',
                content=attachment_text,
                start_pos=0,
                end_pos=len(attachment_text),
                **attachment_metadata
            ))

        except Exception as e:
            # Best effort: one bad attachment must not abort the whole filing.
            print(f"Error processing attachment {getattr(attachment, 'document', '?')}: {e}")

    return all_chunks

##### Util

In [None]:
import re
import json
from flask import Flask, request, jsonify
import torch
import pandas as pd
import os
import jsonpickle
from dataclasses import dataclass
from typing import List, Tuple, Callable, Dict, Optional
from dateutil.parser import parse as dtparse
from datetime import date, datetime
from functools import lru_cache


# ─────────────────────────────────────────────────────────────────────────────
#  helpers_meta.py  →  robust, production-grade metadata extraction
# ─────────────────────────────────────────────────────────────────────────────

from sec_cik_mapper import StockMapper          # pip install sec-cik-mapper

_mapper = StockMapper()


# ---------- 1.  CIK ⇄ Ticker look-ups  (cached) -----------------------------
@lru_cache(maxsize=4_096)
def cik_to_ticker(cik: "int | str") -> Optional[str]:
    """Return a ticker for *cik*, or ``None`` when the lookup fails.

    *cik* may be an int or a string; it is converted to the 10-digit
    zero-padded string used to index ``_mapper.cik_to_tickers``. When a CIK
    maps to several tickers the alphabetically first one is returned, so the
    result does not depend on set iteration order.
    """
    try:
        # str() first: ints have no zfill(), so an int CIK used to raise here.
        padded_cik = str(cik).strip().zfill(10)
        tickers = _mapper.cik_to_tickers[padded_cik]
        return min(str(item) for item in tickers)
    except Exception:
        # Deliberate best effort: an unknown CIK or empty ticker set means None.
        return None

@lru_cache(maxsize=4_096)
def ticker_to_cik(ticker: str) -> Optional[int]:
    """Look *ticker* up in ``_mapper.ticker_to_cik``; ``None`` on any failure.

    NOTE(review): the value is returned exactly as the mapper stores it; the
    ``int`` in the annotation is not enforced here -- confirm with callers.
    """
    try:
        found = _mapper.ticker_to_cik[ticker]
    except Exception:
        found = None
    return found

_RE_FY = re.compile(
    r'for\s+(?:the\s+)?fiscal\s+year\s+ended\s+([\w\s,]{5,40})',
    re.I | re.S)
_RE_Q  = re.compile(
    r'for\s+(?:the\s+)?quarter\s+ended\s+([\w\s,]{5,40})',
    re.I | re.S)

def _extract_xbrl_fiscal_info(filing) -> Tuple[Optional[int], Optional[int]]:
    """
    Extract fiscal year and quarter directly from XBRL using proper edgartools API.
    Returns (fiscal_year, fiscal_quarter) or (None, None) if extraction fails.
    """
    try:
        # Method 1: Use query().by_concept() - more reliable
        doc_period_df = filing.xbrl().facts.query().by_concept("DocumentPeriodEndDate").to_dataframe()
        if not doc_period_df.empty:
            end_date_str = doc_period_df["value"].iloc[0]
            end_date = dtparse(end_date_str).date()
            fiscal_year = end_date.year

            # Try to get DocumentFiscalPeriodFocus directly from XBRL
            try:
                period_focus_df = filing.xbrl().facts.query().by_concept("DocumentFiscalPeriodFocus").to_dataframe()
                if not period_focus_df.empty:
                    period_focus = period_focus_df["value"].iloc[0].strip().upper()

                    # Parse fiscal period focus
                    if period_focus == "FY":
                        fiscal_quarter = None
                    elif period_focus.startswith("Q"):
                        fiscal_quarter = int(period_focus[1])  # Q1 -> 1, Q2 -> 2, etc.
                    else:
                        # Fallback to calendar quarter calculation
                        fiscal_quarter = ((end_date.month - 1) // 3 + 1) if filing.form.upper() == "10-Q" else None
                else:
                    # No DocumentFiscalPeriodFocus found, use form type logic
                    fiscal_quarter = ((end_date.month - 1) // 3 + 1) if filing.form.upper() == "10-Q" else None

            except Exception:
                # Fallback to calendar quarter calculation
                fiscal_quarter = ((end_date.month - 1) // 3 + 1) if filing.form.upper() == "10-Q" else None

            return fiscal_year, fiscal_quarter

    except Exception:
        pass

    try:
        # Method 2: Use search_facts() as fallback
        doc_period_results = filing.xbrl().facts.search_facts("DocumentPeriodEndDate")
        if doc_period_results and len(doc_period_results) > 0:
            end_date_str = doc_period_results[0].value
            end_date = dtparse(end_date_str).date()
            fiscal_year = end_date.year

            # Try DocumentFiscalPeriodFocus with search_facts
            try:
                period_focus_results = filing.xbrl().facts.search_facts("DocumentFiscalPeriodFocus")
                if period_focus_results and len(period_focus_results) > 0:
                    period_focus = period_focus_results[0].value.strip().upper()

                    if period_focus == "FY":
                        fiscal_quarter = None
                    elif period_focus.startswith("Q"):
                        fiscal_quarter = int(period_focus[1])
                    else:
                        fiscal_quarter = ((end_date.month - 1) // 3 + 1) if filing.form.upper() == "10-Q" else None
                else:
                    fiscal_quarter = ((end_date.month - 1) // 3 + 1) if filing.form.upper() == "10-Q" else None

            except Exception:
                fiscal_quarter = ((end_date.month - 1) // 3 + 1) if filing.form.upper() == "10-Q" else None

            return fiscal_year, fiscal_quarter

    except Exception:
        pass

    return None, None

def _extract_fiscal_info(text: str, filing=None) -> Tuple[Optional[int], Optional[int]]:
    """
    Robust fiscal year / quarter extraction.
    1. XBRL DocumentPeriodEndDate + DocumentFiscalPeriodFocus (authoritative)
    2. Cover-page regex fallback for older filings
    Returns (fiscal_year, fiscal_quarter); quarter is None for annual filings.
    """

    # ── 1 ─ try XBRL extraction first (most reliable) ─────────────────────
    if filing is not None:
        fiscal_year, fiscal_quarter = _extract_xbrl_fiscal_info(filing)
        if fiscal_year is not None:
            return fiscal_year, fiscal_quarter

    # ── 2 ─ regex fallback on plain-text cover page ───────────────────────
    header = " ".join(text[:8_000].split())     # normalize whitespace

    m = re.search(r'for\s+(?:the\s+)?fiscal\s+year\s+ended\s+([\w\s,]{5,40})',
                  header, re.I)
    if m:
        try:
            fy = dtparse(m.group(1)).year
            return fy, None
        except Exception:
            pass

    m = re.search(r'for\s+(?:the\s+)?quarter\s+ended\s+([\w\s,]{5,40})',
                  header, re.I)
    if m:
        try:
            end_dt = dtparse(m.group(1))
            fy = end_dt.year
            fq = (end_dt.month - 1) // 3 + 1
            return fy, fq
        except Exception:
            pass

    # ── 3 ─ give up gracefully ─────────────────────────────────────────────
    return None, None


def _build_document_url(filing) -> str:
    """Construct direct document URL with robust error handling."""
    try:
        cik = str(filing.cik)
        accession = str(filing.accession_no).replace('-', '')
        primary_doc = getattr(filing, 'primary_document', '')
        is_xbrl = getattr(filing, 'is_inline_xbrl', False)

        if not primary_doc:
            return getattr(filing, 'url', '')

        base_path = f"Archives/edgar/data/{cik}/{accession}/{primary_doc}"
        prefix = "ix?doc=/" if is_xbrl else ""

        return f"https://www.sec.gov/{prefix}{base_path}"

    except Exception:
        return getattr(filing, 'url', '')


# ---------- 3.  Single entry-point ------------------------------------------
def build_base_metadata(filing, main_text: str) -> Dict:
    """
    Construct the metadata dict shared by all chunks of one filing.

    Args:
        filing: Filing object; ``cik``, ``form`` and ``filing_date`` are read
            directly, ``url`` via ``getattr`` with an empty-string default.
        main_text: Plain text of the main document, used for the cover-page
            fallback when XBRL yields no fiscal period.

    Returns:
        Dict with keys cik, ticker, form_type, filing_date, fiscal_year,
        fiscal_quarter, filing_url, document_url.
    """
    # Pass the filing so the XBRL lookup runs before the cover-page regex.
    # Without it the XBRL branch of _extract_fiscal_info is never reached.
    fiscal_year, fiscal_quarter = _extract_fiscal_info(main_text, filing=filing)
    cik = str(filing.cik)
    return {
        "cik":            cik,
        "ticker":         cik_to_ticker(cik) or '',
        "form_type":      filing.form,                     # e.g. "10-K"
        "filing_date":    str(filing.filing_date),         # YYYY-MM-DD
        "fiscal_year":    fiscal_year,
        "fiscal_quarter": fiscal_quarter,
        "filing_url":     getattr(filing, 'url', ''),      # Directory listing
        "document_url":   _build_document_url(filing),     # Direct document
    }



###############################################################################
# 2. SEC Rate Limiter (Fixes SEC API Violation)
###############################################################################

class SECRateLimiter:
    """Enforces SEC's 10 requests/second limit with a one-second sliding window.

    ``requests`` holds the ``time.monotonic()`` timestamps of the slots
    granted during the last second.
    """

    def __init__(self, max_requests_per_second: int = 10):
        """Create a limiter that grants at most *max_requests_per_second* slots per second.

        Raises:
            ValueError: if *max_requests_per_second* is less than 1.
        """
        if max_requests_per_second < 1:
            raise ValueError("max_requests_per_second must be at least 1")
        self.max_rps = max_requests_per_second
        self.requests = []
        self._lock = threading.Lock()

    async def acquire(self) -> None:
        """Wait if necessary to respect rate limit, then reserve one slot.

        The thread lock is held only while the window is inspected and is
        released before sleeping. Awaiting while holding a ``threading.Lock``
        would block every other coroutine that calls ``acquire`` on the same
        event loop. After sleeping the window is re-checked, and the slot is
        stamped with the time it was actually granted.
        """
        while True:
            with self._lock:
                now = time.monotonic()

                # Remove requests older than 1 second
                self.requests = [stamp for stamp in self.requests if now - stamp < 1.0]

                if len(self.requests) < self.max_rps:
                    self.requests.append(now)
                    return

                # Time until the oldest slot in the window expires.
                wait = 1.0 - (now - self.requests[0])

            await asyncio.sleep(max(wait, 0.0))

# sec_limiter = SECRateLimiter()

##### Routing

In [None]:
from abc import ABC, abstractmethod
from typing import Dict, List, Set, Optional, Protocol, Tuple, Union
from enum import Enum
import calendar
import re
import json
from flask import Flask, request, jsonify
import torch
import pandas as pd
import os
import jsonpickle
from dataclasses import dataclass
from dateutil.parser import parse as dtparse
from datetime import date, datetime
from functools import lru_cache
from sec_cik_mapper import StockMapper          # pip install sec-cik-mapper
# from utilities import *

class FormType(Enum):
    """Closed set of SEC form labels; each member's value is the label string."""
    FORM_10K = "10-K"
    FORM_10Q = "10-Q"
    FORM_8K = "8-K"
    FORM_3 = "3"
    FORM_4 = "4"
    FORM_5 = "5"
    FORM_DEF14A = "DEF14A"  # written without a space

@dataclass(frozen=True)
class SectionIdentifier:
    """Immutable (form, section type, section number) key for one filing section."""
    form_type: FormType
    section_type: str  # ITEM, PART, PROPOSAL, etc.
    section_number: str  # 1A, I, 2.02, etc.

    def __str__(self) -> str:
        # Space-joined label; strip() trims the edge left by an empty part.
        pieces = (self.form_type.value, self.section_type, self.section_number)
        return "{} {} {}".format(*pieces).strip()

@dataclass(frozen=True)
class ConceptMapping:
    """Maps a concept to sections across multiple forms"""
    # Canonical concept name, e.g. "business_overview".
    concept: str
    # Sections (possibly from several form types) associated with the concept.
    sections: frozenset[SectionIdentifier]
    # Weight attached to this mapping; defaults to 1.0.
    confidence: float = 1.0
    # Alternative names for the same concept.
    aliases: frozenset[str] = frozenset()
    temporal_indicators: frozenset[str] = frozenset()  # "quarterly", "annual", "recent"

@dataclass
class RoutingResult:
    """Result of routing with form and section recommendations"""
    primary_targets: List[SectionIdentifier]  # Best matches
    secondary_targets: List[SectionIdentifier]  # Fallback options
    # Score per section identifier; range and scale are not fixed here.
    confidence_scores: Dict[SectionIdentifier, float]
    reasoning: str  # Human-readable explanation

class UniversalConceptRepository:
    """Cross-form concept mappings with temporal and contextual awareness.

    Holds one lookup table keyed by concept name. Every primary concept is
    registered under its own name, and each of its aliases is registered as a
    separate, slightly lower-confidence entry pointing at the same sections.
    Lookups are exact-key (case-sensitive).
    """

    def __init__(self):
        """Create the repository and populate it with the built-in mappings."""
        self._concepts: Dict[str, ConceptMapping] = {}
        self._load_universal_concepts()

    def _load_universal_concepts(self) -> None:
        """Build the built-in concept table and register it in ``self._concepts``.

        Registration rules (see the loop at the end):
        - a primary concept is always stored under its own name, replacing
          whatever was stored under that key before;
        - an alias is stored only if its name is not already a key, so the
          first mapping that declares a shared alias wins;
        - alias entries reuse the parent's sections and temporal indicators,
          carry 0.9x the parent's confidence, and have no aliases of their own.
        """

        # NOTE(review): a few aliases contain upper-case characters ("M&A",
        # "R&D", "AI", "ESG", "R&D_spending"). Keys are stored verbatim, so a
        # consumer that compares against lower-cased text will not hit them.
        mappings = [
            # ===== CORE BUSINESS & STRATEGY =====
            ConceptMapping(
                "business_overview",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "2"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "8.01"),
                ]),
                confidence=0.90,
                aliases=frozenset(["business_description", "company_overview", "operations", "what_does_company_do",
                                 "business_model", "corporate_overview", "business_activities", "primary_business"]),
                temporal_indicators=frozenset(["annual", "detailed", "comprehensive"])
            ),

            ConceptMapping(
                "competitive_positioning",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1A"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),
                ]),
                confidence=0.83,
                aliases=frozenset(["competitive_advantages", "competitive_advantage", "market_position", "competitive_strategy",
                                 "differentiation", "competitive_landscape", "market_share", "competitive_strengths",
                                 "strategic_positioning", "moat", "strategic_advantages"]),
                temporal_indicators=frozenset(["strategic", "current", "annual"])
            ),

            # ===== CAPITAL STRUCTURE & FINANCING =====
            ConceptMapping(
                "debt_and_credit_facilities",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "8"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "2"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "1.01"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "2.03"),
                ]),
                confidence=0.90,
                aliases=frozenset(["debt", "credit_facilities", "borrowings", "loans", "credit_agreements",
                                 "revolving_credit", "term_loans", "bonds", "notes", "debt_covenants",
                                 "credit_rating", "debt_maturity", "interest_expense"]),
                temporal_indicators=frozenset(["quarterly", "annual", "recent", "new"])
            ),

            ConceptMapping(
                "capital_allocation",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "8"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "8.01"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "2.02"),
                ]),
                confidence=0.88,
                aliases=frozenset(["dividends", "share_buybacks", "capital_investments", "acquisitions",
                                 "capex", "capital_expenditures", "cash_deployment", "shareholder_returns",
                                 "investment_strategy", "capital_priorities"]),
                temporal_indicators=frozenset(["annual", "quarterly", "strategic", "recent"])
            ),

            # ===== FINANCIAL PERFORMANCE & METRICS =====
            ConceptMapping(
                "financial_performance",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "8"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "2"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "2.02"),
                ]),
                confidence=0.95,
                aliases=frozenset(["revenue", "earnings", "profitability", "financial_results", "operating_performance",
                                 "financial_metrics", "key_metrics", "performance_indicators", "financial_condition",
                                 "revenue_drivers", "revenue_sources", "revenue_segments", "primary_revenue",
                                 "sales", "income", "profit"]),
                temporal_indicators=frozenset(["quarterly", "annual", "recent", "latest"])
            ),

            ConceptMapping(
                "cash_flow_and_liquidity",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "8"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "2"),
                ]),
                confidence=0.88,
                aliases=frozenset(["cash_flow", "operating_cash_flow", "free_cash_flow", "liquidity", "cash_position",
                                 "working_capital", "cash_management", "liquidity_management", "cash_generation",
                                 "working_capital_changes", "current_assets", "current_liabilities"]),
                temporal_indicators=frozenset(["quarterly", "annual"])
            ),

            ConceptMapping(
                "segment_performance",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "8"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "2"),
                ]),
                confidence=0.92,
                aliases=frozenset(["business_segments", "operating_segments", "geographic_segments",
                                 "product_lines", "divisional_performance", "segment_revenue",
                                 "segment_margins", "segment_profitability"]),
                temporal_indicators=frozenset(["quarterly", "annual"])
            ),

            ConceptMapping(
                "cost_structure",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "8"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "2"),
                ]),
                confidence=0.85,
                aliases=frozenset(["cost_of_goods_sold", "operating_expenses", "sg&a", "cost_reduction",
                                 "efficiency_initiatives", "margin_improvement", "cost_control",
                                 "operating_leverage"]),
                temporal_indicators=frozenset(["quarterly", "annual"])
            ),

            # ===== RISK MANAGEMENT & COMPLIANCE =====
            ConceptMapping(
                "risk_factors",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1A"),
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "1A"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "8.01"),
                ]),
                confidence=0.95,
                aliases=frozenset(["risks", "uncertainties", "threats", "challenges", "risk_management", "risk_factors",
                                 "business_risks", "operational_risks", "market_risks", "credit_risks", "regulatory_risks",
                                 "cybersecurity_risks", "supply_chain_risks", "risk_assessment"]),
                temporal_indicators=frozenset(["annual", "recent", "emerging", "yearly", "comprehensive"])
            ),

            ConceptMapping(
                "regulatory_and_legal",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "3"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1"),
                    # NOTE(review): the only entry that puts "PART II" in section_type and
                    # "ITEM 1" in section_number; confirm the chunker labels 10-Q chunks this way.
                    SectionIdentifier(FormType.FORM_10Q, "PART II", "ITEM 1"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "8.01"),
                ]),
                confidence=0.87,
                aliases=frozenset(["legal_proceedings", "litigation", "regulatory_compliance", "government_regulation",
                                 "lawsuits", "settlements", "regulatory_changes", "compliance_costs", "regulatory_actions",
                                 "legal_issues", "court_cases", "legal_matters"]),
                temporal_indicators=frozenset(["ongoing", "recent", "current"])
            ),

            # ===== GOVERNANCE & EXECUTIVE MANAGEMENT =====
            ConceptMapping(
                "executive_compensation",
                frozenset([
                    SectionIdentifier(FormType.FORM_DEF14A, "ITEM", "11"),
                    SectionIdentifier(FormType.FORM_DEF14A, "ITEM", "12"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "11"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "5.02"),
                ]),
                confidence=0.95,
                aliases=frozenset(["executive_pay", "ceo_compensation", "management_compensation", "salary", "bonus",
                                 "stock_options", "equity_awards", "incentive_plans", "compensation_philosophy",
                                 "pay_for_performance", "ceo_pay", "executive_salary", "compensation"]),
                temporal_indicators=frozenset(["annual", "yearly"])
            ),

            ConceptMapping(
                "corporate_governance",
                frozenset([
                    SectionIdentifier(FormType.FORM_DEF14A, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_DEF14A, "ITEM", "10"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "10"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "5.02"),
                ]),
                confidence=0.90,
                aliases=frozenset(["board_of_directors", "corporate_governance", "board_independence", "board_committees",
                                 "director_qualifications", "governance_policies", "board_oversight", "shareholder_rights",
                                 "board", "directors", "governance", "board_composition", "independent_directors"]),
                temporal_indicators=frozenset(["annual", "current"])
            ),

            ConceptMapping(
                "ownership_structure",
                frozenset([
                    SectionIdentifier(FormType.FORM_DEF14A, "ITEM", "12"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "12"),
                    SectionIdentifier(FormType.FORM_3, "INITIAL", "OWNERSHIP"),
                ]),
                confidence=0.85,
                aliases=frozenset(["beneficial_ownership", "major_shareholders", "shareholding", "ownership",
                                 "equity_ownership", "stock_ownership", "shareholder_structure", "ownership_table"]),
                temporal_indicators=frozenset(["current", "as_of"])
            ),

            # ===== INSIDER ACTIVITY =====
            ConceptMapping(
                "insider_trading",
                frozenset([
                    SectionIdentifier(FormType.FORM_4, "OWNERSHIP", "NON_DERIVATIVE"),
                    SectionIdentifier(FormType.FORM_4, "OWNERSHIP", "DERIVATIVE"),
                    SectionIdentifier(FormType.FORM_5, "ANNUAL", "SUMMARY"),
                    SectionIdentifier(FormType.FORM_3, "INITIAL", "OWNERSHIP"),
                ]),
                confidence=0.90,
                aliases=frozenset(["insider_transactions", "insider_buying", "insider_selling", "officer_trading",
                                 "director_trading", "executive_trading", "stock_transactions", "insider_activity",
                                 "share_purchases", "share_sales", "stock_sales", "stock_purchases"]),
                temporal_indicators=frozenset(["recent", "current", "latest"])
            ),

            # ===== STRATEGIC INITIATIVES & INNOVATION =====
            ConceptMapping(
                "mergers_acquisitions",
                frozenset([
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "1.01"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "2.01"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "8.01"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1"),
                ]),
                confidence=0.92,
                aliases=frozenset(["mergers", "acquisitions", "divestitures", "asset_sales", "strategic_transactions",
                                 "business_combinations", "joint_ventures", "strategic_partnerships", "M&A", "deal_rationale",
                                 "merger", "acquisition", "takeover", "asset_purchase", "divestiture", "spin_off",
                                 "deal", "transaction", "strategic_rationale", "business_combination"]),
                temporal_indicators=frozenset(["recent", "announced", "completed"])
            ),

            ConceptMapping(
                "research_development",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1"),    # Business overview mentions R&D
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),    # MD&A discusses R&D spending
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "8"),    # Financial statements show R&D expenses
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "1"),    # Quarterly financials
                    SectionIdentifier(FormType.FORM_10Q, "ITEM", "2"),    # Quarterly MD&A
                ]),
                confidence=0.85,
                aliases=frozenset([
                    "r&d", "R&D", "research", "development", "r_d", "research_and_development", "technology_development",
                    "rd_spending", "research_costs", "development_costs", "innovation", "new_product_development", "innovation_investment",
                    "research_expenses", "development_expenses", "rd_investment", "R&D_spending", "research_spending",
                ]),
                temporal_indicators=frozenset(["spending", "costs", "expenses", "investment", "annual", "quarterly"])
            ),

            ConceptMapping(
                "technology_and_innovation",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "8.01"),
                ]),
                confidence=0.85,
                aliases=frozenset(["technology", "digital_transformation", "AI", "artificial_intelligence", "automation",
                                 "innovation_strategy", "technology_platform", "digital_capabilities", "tech_infrastructure",
                                 "machine_learning", "automation_strategies", "digital_initiatives", "tech_positioning"]),
                temporal_indicators=frozenset(["strategic", "emerging", "recent"])
            ),

            # ===== ESG & SUSTAINABILITY =====
            ConceptMapping(
                "esg_and_sustainability",
                frozenset([
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1A"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "7"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "8.01"),
                ]),
                confidence=0.80,
                aliases=frozenset(["ESG", "sustainability", "environmental", "social_responsibility", "climate_change",
                                 "carbon_emissions", "diversity_inclusion", "corporate_responsibility", "sustainable_practices",
                                 "climate", "climate_related_risks", "environmental_risks", "carbon_footprint"]),
                temporal_indicators=frozenset(["annual", "strategic", "emerging"])
            ),

            # ===== CORPORATE CHANGES & EVENTS =====
            ConceptMapping(
                "material_agreements",
                frozenset([
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "1.01"),
                    SectionIdentifier(FormType.FORM_10K, "ITEM", "1"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "8.01"),
                ]),
                confidence=0.90,
                aliases=frozenset(["contracts", "agreements", "partnerships", "deals", "material_contracts",
                                 "strategic_agreements", "joint_ventures", "alliances", "licensing_agreements"]),
                temporal_indicators=frozenset(["recent", "new", "latest"])
            ),

            ConceptMapping(
                "management_changes",
                frozenset([
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "5.02"),
                    SectionIdentifier(FormType.FORM_8K, "ITEM", "5.01"),
                    SectionIdentifier(FormType.FORM_DEF14A, "ITEM", "1"),
                ]),
                confidence=0.85,
                aliases=frozenset(["executive_changes", "ceo_change", "management_turnover", "leadership_changes",
                                 "officer_changes", "director_changes", "management_succession", "executive_appointments"]),
                temporal_indicators=frozenset(["recent", "new", "latest"])
            ),

            # ===== SHAREHOLDER MATTERS =====
            ConceptMapping(
                "shareholder_proposals",
                frozenset([
                    SectionIdentifier(FormType.FORM_DEF14A, "PROPOSAL", ""),
                ]),
                confidence=0.95,
                aliases=frozenset(["proposals", "voting_matters", "proxy_voting", "shareholder_votes",
                                 "shareholder_proposals", "ballot_items", "voting_items"]),
                temporal_indicators=frozenset(["upcoming", "annual"])
            ),
        ]


        for mapping in mappings:
            # The primary concept always takes its own key, even if an earlier
            # mapping had already registered that name as an alias.
            self._concepts[mapping.concept] = mapping
            # Also add aliases as separate entries for easier lookup
            for alias in mapping.aliases:
                if alias not in self._concepts:  # Don't overwrite existing primary concepts
                    # Positional order: concept, sections, confidence, aliases,
                    # temporal_indicators. Alias entries are discounted to 90%
                    # of the parent's confidence and carry no aliases themselves.
                    alias_mapping = ConceptMapping(
                        alias, mapping.sections, mapping.confidence * 0.9,
                        frozenset(), mapping.temporal_indicators
                    )
                    self._concepts[alias] = alias_mapping

    def get_concept(self, concept_name: str) -> Optional[ConceptMapping]:
        """Return the mapping stored under exactly ``concept_name``, or None."""
        return self._concepts.get(concept_name)

    def get_all_concepts(self) -> Dict[str, ConceptMapping]:
        """Return a shallow copy of the whole table (primary concepts and aliases)."""
        return self._concepts.copy()


class TemporalMatcher:
    """Matches temporal indicators to determine form preferences.

    ``TEMPORAL_FORM_PREFERENCES`` maps a group of indicator words to the form
    types that should be favoured when any word of the group occurs in a query.
    """

    TEMPORAL_FORM_PREFERENCES = {
        # Recent/current events favor 8-K and most recent periodic reports
        frozenset(["recent", "latest", "current", "new", "announced"]): [
            FormType.FORM_8K, FormType.FORM_10Q, FormType.FORM_4
        ],

        # Quarterly indicators
        frozenset(["quarterly", "quarter", "q1", "q2", "q3", "q4"]): [
            FormType.FORM_10Q, FormType.FORM_8K
        ],

        # Annual indicators
        frozenset(["annual", "yearly", "comprehensive", "detailed"]): [
            FormType.FORM_10K, FormType.FORM_DEF14A, FormType.FORM_5
        ],

        # Voting/governance timing
        frozenset(["upcoming", "proposed", "proxy"]): [
            FormType.FORM_DEF14A
        ]
    }

    def get_temporal_preferences(self, query: str) -> List[FormType]:
        """Return form types preferred based on temporal indicators in query.

        Args:
            query: Free-text user query; matching is case-insensitive.

        Returns:
            Preferred form types in first-seen order, each listed once.
            Groups are visited in the declaration order of
            ``TEMPORAL_FORM_PREFERENCES``. Empty when no indicator occurs.
        """
        query_lower = query.lower()
        collected: List[FormType] = []

        # NOTE: indicators are matched as plain substrings, so "new" also
        # fires inside words such as "renewable".
        for indicators, forms in self.TEMPORAL_FORM_PREFERENCES.items():
            if any(indicator in query_lower for indicator in indicators):
                collected.extend(forms)

        # Several groups can name the same form (e.g. 8-K for both "recent"
        # and "quarterly"); dict.fromkeys drops repeats while keeping order.
        return list(dict.fromkeys(collected))

class CrossFormRouter:
    """Main routing service that determines both form and section.

    A question is matched against the concept repository (by concept name,
    alias, or fuzzy synonym), each mapped section is scored, and the best
    sections are returned together with a short explanation.
    """

    # Terms of at most this many characters ("ai", "esg", "m&a", "r d") must
    # match as a whole token; as bare substrings they would fire inside
    # unrelated words ("main", "their debt"). Longer terms keep plain
    # substring matching so that e.g. "revenue" still matches "revenues".
    _WHOLE_TOKEN_MAX_LEN = 3

    def __init__(self):
        self.repository = UniversalConceptRepository()
        self.temporal_matcher = TemporalMatcher()

        # Enhanced synonym matching
        self.fuzzy_synonyms = {
            "compensation": ["pay", "salary", "remuneration", "earnings", "income"],
            "risk": ["danger", "threat", "uncertainty", "hazard", "exposure"],
            "performance": ["results", "outcomes", "success", "achievement"],
            "ownership": ["shareholding", "stake", "holdings", "equity"],
            "legal": ["court", "lawsuit", "litigation", "judicial"],
            "management": ["leadership", "executives", "administration"],
            "research": ["R&D", "development", "innovation", "technology"],
            "capital": ["liquidity", "cash", "working capital", "funds"],
            "artificial_intelligence": ["AI", "automation", "machine learning", "digital"],
            "competitive": ["advantage", "positioning", "differentiation", "strategy"]
        }

        # Per-instance memo. Decorating the method itself with lru_cache would
        # key a class-level cache on `self` and keep every router (and its
        # repository) alive for as long as its entries stay cached.
        self._route_cached = lru_cache(maxsize=1000)(self._route_uncached)

    def route_question(self, question: str, confidence_threshold: float = 0.3) -> RoutingResult:
        """
        Route a natural language question to specific form sections

        Results are memoised per router instance, so repeated calls with the
        same arguments return the same RoutingResult object; treat it as
        read-only.

        Args:
            question: User's natural language question
            confidence_threshold: Minimum confidence for inclusion

        Returns:
            RoutingResult with prioritized sections and reasoning
        """
        return self._route_cached(question, confidence_threshold)

    def _route_uncached(self, question: str, confidence_threshold: float) -> RoutingResult:
        """Compute the routing for one question without consulting the cache."""
        question_lower = question.lower()

        # Get temporal preferences
        temporal_preferences = self.temporal_matcher.get_temporal_preferences(question)

        # Find matching concepts
        concept_matches = self._find_concept_matches(question_lower)

        # Score and rank sections
        section_scores = self._calculate_section_scores(
            concept_matches, temporal_preferences, question_lower
        )

        # Filter by confidence threshold
        qualified_sections = {
            section: score for section, score in section_scores.items()
            if score >= confidence_threshold
        }

        # Sort by score (stable, so ties keep their scoring order)
        sorted_sections = sorted(qualified_sections.items(), key=lambda x: x[1], reverse=True)

        # Split into primary and secondary targets
        primary_targets = [section for section, _ in sorted_sections[:3]]
        secondary_targets = [section for section, _ in sorted_sections[3:6]]

        # Generate reasoning
        reasoning = self._generate_reasoning(concept_matches, temporal_preferences, primary_targets)

        return RoutingResult(
            primary_targets=primary_targets,
            secondary_targets=secondary_targets,
            confidence_scores=dict(sorted_sections),
            reasoning=reasoning
        )

    @classmethod
    def _mentions(cls, term: str, query_lower: str) -> bool:
        """Return True if ``term`` occurs in the already lower-cased query.

        Underscores in ``term`` are read as spaces and the term is lower-cased,
        so upper-case names such as "M&A" or "AI" can match at all.
        """
        needle = term.replace("_", " ").lower()
        if not needle:
            return False
        if len(needle) > cls._WHOLE_TOKEN_MAX_LEN:
            return needle in query_lower
        # Short term: require non-alphanumeric neighbours on both sides.
        pattern = rf"(?<![a-z0-9]){re.escape(needle)}(?![a-z0-9])"
        return re.search(pattern, query_lower) is not None

    def _fuzzy_confidence(self, concept_name: str, query_lower: str) -> float:
        """Score a concept by the share of its fuzzy synonyms found in the query."""
        synonyms = self.fuzzy_synonyms.get(concept_name)
        if not synonyms:
            return 0.0
        hits = sum(1 for syn in synonyms if self._mentions(syn, query_lower))
        return 0.7 * (hits / len(synonyms))

    def _find_concept_matches(self, query_lower: str) -> List[Tuple[str, ConceptMapping, float]]:
        """Find concepts that match the query with confidence scores.

        Args:
            query_lower: The query, already lower-cased.

        Returns:
            ``(concept_name, mapping, confidence)`` triples in repository
            order (NOT sorted by confidence). Confidence is 1.0 for a name
            match, 0.9 for an alias match, and at most 0.7 for synonym matches.
        """
        matches = []

        for concept_name, mapping in self.repository.get_all_concepts().items():
            if self._mentions(concept_name, query_lower):
                # Direct concept name match
                confidence = 1.0
            elif any(self._mentions(alias, query_lower) for alias in mapping.aliases):
                # Alias matching
                confidence = 0.9
            else:
                # Fuzzy synonym matching (0.0 when the concept has no synonyms)
                confidence = self._fuzzy_confidence(concept_name, query_lower)

            if confidence > 0:
                matches.append((concept_name, mapping, confidence))

        return matches

    def _calculate_section_scores(self, concept_matches: List[Tuple[str, ConceptMapping, float]],
                                temporal_preferences: List[FormType], query: str) -> Dict[SectionIdentifier, float]:
        """Calculate final scores for each section considering all factors.

        A section reachable through several concepts keeps its best score.
        """
        section_scores: Dict[SectionIdentifier, float] = {}

        for _concept_name, mapping, match_confidence in concept_matches:
            base_score = match_confidence * mapping.confidence

            for section in mapping.sections:
                # Apply temporal boost
                temporal_boost = 1.2 if section.form_type in temporal_preferences else 1.0

                # Apply query-specific boosts
                query_boost = self._get_query_specific_boost(section, query)

                final_score = base_score * temporal_boost * query_boost
                section_scores[section] = max(section_scores.get(section, 0.0), final_score)

        return section_scores

    def _get_query_specific_boost(self, section: SectionIdentifier, query: str) -> float:
        """Apply boosts based on query-specific indicators.

        Returns a multiplier >= 1.0; the individual boosts compound.
        """
        boost = 1.0
        query_lower = query.lower()

        # Question type boosts
        if any(word in query_lower for word in ["how much", "what did", "amount"]):
            if "compensation" in query_lower and section.form_type == FormType.FORM_DEF14A:
                boost *= 1.3

        if any(word in query_lower for word in ["recent", "latest", "new"]):
            if section.form_type in [FormType.FORM_8K, FormType.FORM_4]:
                boost *= 1.3

        if "annual" in query_lower:
            if section.form_type in [FormType.FORM_10K, FormType.FORM_DEF14A]:
                boost *= 1.2

        return boost

    def _generate_reasoning(self, concept_matches: List[Tuple[str, ConceptMapping, float]],
                          temporal_preferences: List[FormType],
                          primary_targets: List[SectionIdentifier]) -> str:
        """Generate human-readable explanation of routing decisions."""
        if not primary_targets:
            return "No strong matches found for the query."

        reasoning_parts = []

        # Primary concept identified: the strongest match, not merely the
        # first one found (matches arrive in repository order). max() keeps
        # the earliest entry on ties.
        if concept_matches:
            best_name = max(concept_matches, key=lambda m: m[2] * m[1].confidence)[0]
            top_concept = best_name.replace("_", " ").title()
            reasoning_parts.append(f"Identified primary concept: {top_concept}")

        # Temporal reasoning (drop repeated forms before taking the top two)
        if temporal_preferences:
            forms = [f.value for f in list(dict.fromkeys(temporal_preferences))[:2]]
            reasoning_parts.append(f"Query suggests focus on: {', '.join(forms)}")

        # Section recommendations
        sections = [str(target) for target in primary_targets[:2]]
        reasoning_parts.append(f"Best matches: {', '.join(sections)}")

        return ". ".join(reasoning_parts)


@lru_cache(maxsize=1)
def _shared_router() -> 'CrossFormRouter':
    """Build the router once; each construction reloads every concept mapping."""
    return CrossFormRouter()


def _section_number_matches(wanted: str, actual: str) -> bool:
    """Return True if ``actual`` names the section number ``wanted``.

    ``wanted`` must appear as a complete number inside ``actual``: "1" matches
    "1", "1." or "ITEM 1", but not "1A", "10", "11" or "1.01". An empty
    ``wanted`` matches every chunk (targets such as PROPOSAL carry no number).
    """
    if not wanted:
        return True
    pattern = rf"(?<![0-9A-Za-z.]){re.escape(wanted)}(?![0-9A-Za-z]|\.[0-9])"
    return re.search(pattern, actual) is not None


def route_user_question(question: str, available_chunks: List['FormChunk']) -> Tuple[List['FormChunk'], str]:
    """
    Main integration function - routes user question to relevant chunks across all forms

    Args:
        question: User's natural language question
        available_chunks: All chunks from all forms for the company

    Returns:
        Tuple of (relevant_chunks, explanation)
    """
    routing_result = _shared_router().route_question(question)

    if not routing_result.primary_targets:
        # Fallback to content-length based filtering
        fallback_chunks = [chunk for chunk in available_chunks if len(chunk.content) > 1000]
        return fallback_chunks[:5], "No specific sections identified - returning substantial content chunks"

    relevant_chunks = []

    # First priority: exact matches on form, section type and section number.
    # A plain substring test on the number would let ITEM "1" also collect
    # ITEM 1A, 10, 11, ... chunks.
    for target in routing_result.primary_targets:
        for chunk in available_chunks:
            if (chunk.form_type == target.form_type.value and
                    chunk.section_type == target.section_type and
                    _section_number_matches(target.section_number, chunk.section_number) and
                    chunk not in relevant_chunks):
                relevant_chunks.append(chunk)

    # Second priority: partial matches (form and section type only) if we
    # don't have enough
    if len(relevant_chunks) < 3:
        for target in routing_result.secondary_targets:
            for chunk in available_chunks:
                if (chunk.form_type == target.form_type.value and
                        chunk.section_type == target.section_type and
                        chunk not in relevant_chunks):
                    relevant_chunks.append(chunk)

    return relevant_chunks, routing_result.reasoning


@dataclass(frozen=True)
class TickerContext:
    """Ticker/entity identification extracted from a query (immutable)."""
    tickers: frozenset[str]  # {"AAPL", "MSFT"}
    comparison_mode: bool  # True if comparing multiple entities

class TemporalScope(Enum):
    """Kinds of time frame a query can ask about; trailing comments show sample phrasing."""
    SPECIFIC_YEAR = "specific_year"      # "2022", "2021"
    SPECIFIC_QUARTER = "specific_quarter" # "Q1 2023"
    RECENT = "recent"                    # "recent", "latest"
    HISTORICAL_TREND = "historical"      # "over time", "trend"
    ANNUAL = "annual"                    # "annual", "yearly"
    QUARTERLY = "quarterly"              # "quarterly"

@dataclass(frozen=True)
class TemporalContext:
    """Temporal scope and preferences (immutable).

    Only ``scope`` is required; the remaining fields default to None.
    """
    scope: TemporalScope
    # Calendar or fiscal year the query names, if any.
    specific_year: Optional[int] = None
    specific_quarter: Optional[int] = None  # 1, 2, 3, 4
    year_range: Optional[Tuple[int, int]] = None  # (start_year, end_year)

class DocumentScope(Enum):
    """Document-type restriction for a query; ANY means no restriction.

    Values are lower-case labels without punctuation and therefore differ
    from the corresponding ``FormType`` values (e.g. "10k" vs "10-K").
    """
    ANY = "any"
    FORM_10K = "10k"
    FORM_10Q = "10q"
    FORM_8K = "8k"
    FORM_DEF14A = "def14a"
    FORM_3 = "3"
    FORM_4 = "4"
    FORM_5 = "5"


@dataclass(frozen=True)
class DocumentContext:
    """Document type specification (immutable)."""
    scope: DocumentScope
    # Form types to include / leave out; both default to the empty set.
    must_include: frozenset[FormType] = frozenset()
    exclude: frozenset[FormType] = frozenset()

@dataclass(frozen=True)
class QueryContext:
    """Complete query context: the ticker, temporal and document contexts plus the raw query."""
    ticker_context: TickerContext
    temporal_context: TemporalContext
    document_context: DocumentContext
    # The query text these contexts were derived from.
    original_query: str

class TickerExtractor:
    """Work out which companies a query is about and whether it compares them.

    Tickers come from two sources: company-name regexes applied to the
    lower-cased query, and explicit upper-case symbols that appear in a
    recognisable context (possessive, parentheses, or followed by
    "stock"/"share(s)"/"ticker"). When neither yields anything, a
    keyword-driven default basket is used.
    """

    # Regex (matched against the lower-cased query) -> ticker symbol.
    COMPANY_NAME_MAP = {
        # Technology
        r'\b(?:apple|apple\s+inc\.?)\b': 'AAPL',
        r'\b(?:microsoft|microsoft\s+corp\.?)\b': 'MSFT',
        r'\b(?:google|alphabet|alphabet\s+inc\.?)\b': 'GOOGL',
        r'\b(?:amazon|amazon\.com)\b': 'AMZN',
        r'\b(?:tesla|tesla\s+inc\.?)\b': 'TSLA',
        r'\b(?:meta|facebook|meta\s+platforms)\b': 'META',
        r'\b(?:nvidia|nvidia\s+corp\.?)\b': 'NVDA',
        r'\b(?:netflix|netflix\s+inc\.?)\b': 'NFLX',

        # Financials
        r'\b(?:jpmorgan|jp\s+morgan|jpm)\b': 'JPM',
        r'\b(?:bank\s+of\s+america|bofa)\b': 'BAC',
        r'\b(?:wells\s+fargo)\b': 'WFC',
        r'\b(?:goldman\s+sachs)\b': 'GS',

        # Everything else
        r'\b(?:berkshire\s+hathaway)\b': 'BRK-A',
        r'\b(?:johnson\s+&\s+johnson|j&j)\b': 'JNJ',
        r'\b(?:procter\s+&\s+gamble|p&g)\b': 'PG',
        r'\b(?:coca[\-\s]?cola)\b': 'KO',
    }

    # Symbols accepted as explicit tickers: the keys of the mapper's
    # ticker -> CIK table.
    _mapper = StockMapper()
    VALID_TICKERS = _mapper.ticker_to_cik.keys()

    # Upper-case words that must never be accepted as a ticker.
    EXCLUDED_WORDS = {
        'WHAT', 'ARE', 'THE', 'AND', 'FOR', 'HAS', 'HOW', 'OVER', 'TIME',
        'AT', 'IN', 'ON', 'TO', 'OF', 'IS', 'WAS', 'BEEN', 'HAVE', 'HAD', 'R&D',
        'WILL', 'CAN', 'MAY', 'SHOULD', 'COULD', 'WOULD', 'RISK', 'RISKS', 'M&A',
        'FROM', 'WITH', 'BY', 'AS', 'AN', 'A', 'THIS', 'THAT', 'THESE', 'THOSE'
    }

    # Phrases (matched against the lower-cased query) that signal a comparison.
    COMPARISON_INDICATORS = [
        r'\bcompare\b', r'\bcomparison\b', r'\bvs\.?\b', r'\bversus\b',
        r'\bagainst\b', r'\brelative\s+to\b', r'\bcompared\s+to\b',
        r'\bdifference\s+between\b', r'\bboth\s+\w+\s+and\b', r'\beither\s+\w+\s+or\b'
    ]

    def _default_tickers(self, lowered: str) -> List[str]:
        """Pick the default basket used when the query names no company.

        The choice is made by plain substring checks on the lower-cased
        query: technology, then financials, then a broad cross-industry
        list, and otherwise the "MANGA" set.
        """
        if any(word in lowered for word in ['technology', 'tech']):
            return ["AAPL", "MSFT", "GOOGL", "META", "NVDA"]
        if any(word in lowered for word in ['financial', 'bank']):
            return ["JPM", "BAC", "WFC", "GS"]
        if any(word in lowered for word in ['across industries', 'all companies', 'major companies']):
            return ["AAPL", "MSFT", "GOOGL", "META", "NVDA", "TSLA", "AMZN", "NFLX",
                    "JPM", "BAC", "WFC", "GS", "JNJ", "PG", "KO", "BRK-A"]
        # NOTE(review): this basket uses "GOOG" while COMPANY_NAME_MAP and the
        # other baskets use "GOOGL" - confirm which symbol the store holds.
        return ["META", "AMZN", "NFLX", "GOOG", "AAPL"]

    def extract(self, query: str) -> TickerContext:
        """Return the tickers referenced by ``query`` and its comparison flag.

        Args:
            query: The user's natural-language question.

        Returns:
            A ``TickerContext`` whose ``tickers`` is never empty (a default
            basket is substituted when nothing is recognised).
        """
        lowered = query.lower()

        # Company names are matched case-insensitively via the lowered query.
        tickers = {
            symbol
            for name_pattern, symbol in self.COMPANY_NAME_MAP.items()
            if re.search(name_pattern, lowered)
        }

        # Explicit symbols are matched on the original text so that only
        # upper-case tokens in a ticker-like context qualify.
        explicit_patterns = (
            r'\b([A-Z]{2,5})[\'\s]s\b',      # AAPL's, MSFT's
            r'\(([A-Z]{2,5})\)',             # (AAPL)
            r'\b([A-Z]{2,5})\s+stock\b',     # AAPL stock
            r'\b([A-Z]{2,5})\s+shares?\b',   # AAPL shares
            r'\b([A-Z]{2,5})\s+ticker\b',    # AAPL ticker
        )
        for explicit in explicit_patterns:
            for candidate in re.findall(explicit, query):
                if candidate in self.EXCLUDED_WORDS:
                    continue
                if candidate in self.VALID_TICKERS:
                    tickers.add(candidate)

        # Explicit comparison wording.
        wants_comparison = any(
            re.search(cue, lowered) for cue in self.COMPARISON_INDICATORS
        )

        # Nothing recognised: fall back to a keyword-selected default basket.
        if not tickers:
            tickers.update(self._default_tickers(lowered))

        # More than one ticker always counts as a comparison.
        # NOTE(review): every default basket holds several tickers, so a query
        # that names no company is always flagged as a comparison.
        if len(tickers) > 1:
            wants_comparison = True

        # "<company> and <company>" phrasing.
        paired_names = r'\b(?:apple|microsoft|google|amazon|tesla|meta)\s+and\s+(?:apple|microsoft|google|amazon|tesla|meta)\b'
        if re.search(paired_names, lowered):
            wants_comparison = True

        return TickerContext(
            tickers=frozenset(tickers),
            comparison_mode=wants_comparison
        )

class TemporalExtractor:
    """Classify the time reference of a query into a ``TemporalContext``.

    Checks run in a fixed priority order and the first hit wins:
    specific quarter, specific year, historical trend, recent, annual,
    quarterly. Queries with no recognised cue default to ``RECENT``.
    """

    # Spelled-out quarter names -> quarter number.
    _QUARTER_WORDS = {'first': 1, 'second': 2, 'third': 3, 'fourth': 4}

    # Quarter + year, tried in order. Group 1 is the quarter, group 2 the year.
    _QUARTER_PATTERNS = (
        re.compile(r'\bq([1-4])\s+(20\d{2})\b'),           # Q1 2023
        re.compile(r'\b([1-4])q\s+(20\d{2})\b'),           # 1Q 2023
        re.compile(r'\b(first|second|third|fourth)\s+quarter\s+(20\d{2})\b'),  # first quarter 2023
    )

    # Year next to an annual/performance cue. Group 1 is the year.
    _YEAR_PATTERNS = (
        re.compile(r'\b(20\d{2})\s+(?:annual|year|performance|results)\b'),  # 2022 annual
        re.compile(r'\b(?:annual|year|performance|results)\s+(20\d{2})\b'),  # annual 2022
        re.compile(r'\b(20\d{2})\s+(?:10-?k|annual\s+report)\b'),            # 2022 10-K
    )

    # Keyword-only scopes, in priority order: the first group with any
    # matching pattern decides the scope.
    _KEYWORD_SCOPES = (
        (TemporalScope.HISTORICAL_TREND, (
            re.compile(r'\bover\s+time\b'), re.compile(r'\btrend\b'),
            re.compile(r'\bhistorical\b'), re.compile(r'\bchanged?\b'),
            re.compile(r'\bevolution\b'), re.compile(r'\bhow\s+has\b'),
            re.compile(r'\btracked\b'), re.compile(r'\bprogression\b'),
            re.compile(r'\bdeveloped\b'),
        )),
        (TemporalScope.RECENT, (
            re.compile(r'\brecent\b'), re.compile(r'\blatest\b'),
            re.compile(r'\bcurrent\b'), re.compile(r'\bnew\b'),
            re.compile(r'\blast\b'), re.compile(r'\bmost\s+recent\b'),
            re.compile(r'\bup\s+to\s+date\b'),
        )),
        (TemporalScope.ANNUAL, (
            re.compile(r'\bannual\b'), re.compile(r'\byearly\b'),
            re.compile(r'\bper\s+year\b'), re.compile(r'\bannually\b'),
        )),
        (TemporalScope.QUARTERLY, (
            re.compile(r'\bquarterly\b'), re.compile(r'\bquarter\b'),
            re.compile(r'\bq[1-4]\b'),
        )),
    )

    def extract(self, query: str) -> TemporalContext:
        """Return the temporal context expressed by ``query``.

        Args:
            query: The user's natural-language question. Matching is
                case-insensitive (the query is lower-cased first).

        Returns:
            A ``TemporalContext``. ``specific_year`` is set for
            SPECIFIC_YEAR and SPECIFIC_QUARTER, ``specific_quarter`` only
            for SPECIFIC_QUARTER.
        """
        query_lower = query.lower()

        # 1. Quarter + year ("Q1 2023", "1Q 2023", "first quarter 2023").
        for pattern in self._QUARTER_PATTERNS:
            match = pattern.search(query_lower)
            if match:
                quarter_token, year_token = match.groups()
                if quarter_token.isdigit():
                    quarter = int(quarter_token)
                else:
                    quarter = self._QUARTER_WORDS[quarter_token]
                return TemporalContext(
                    scope=TemporalScope.SPECIFIC_QUARTER,
                    specific_quarter=quarter,
                    specific_year=int(year_token)
                )

        # 2. Year next to an annual/performance cue.
        # NOTE(review): a bare year such as "revenue in 2022" matches none of
        # these patterns and therefore ends up in a keyword scope or RECENT.
        for pattern in self._YEAR_PATTERNS:
            match = pattern.search(query_lower)
            if match:
                return TemporalContext(
                    scope=TemporalScope.SPECIFIC_YEAR,
                    specific_year=int(match.group(1))
                )

        # 3-6. Keyword-only scopes in priority order.
        for scope, patterns in self._KEYWORD_SCOPES:
            if any(pattern.search(query_lower) for pattern in patterns):
                return TemporalContext(scope=scope)

        # No temporal cue at all: default to recent.
        return TemporalContext(scope=TemporalScope.RECENT)

class DocumentExtractor:
    """Detect which SEC form family, if any, a query explicitly asks for."""

    # Insider-ownership phrases shared by the Form 3, 4 and 5 entries below.
    _INSIDER_PHRASES = [
        r'\binsider\s+trading\b', r'\binsider\s+transaction\b',
        r'\bownership\s+change\b', r'\bbeneficial\s+ownership\b'
    ]

    # Scope -> regexes matched against the lower-cased query. Entries are
    # tried in insertion order and the first scope with a hit wins, so the
    # shared insider phrases always resolve to FORM_3.
    DOCUMENT_PATTERNS = {
        DocumentScope.FORM_10K: [
            r'\b10-?k\b', r'\bannual\s+report\b', r'\bannual\s+filing\b'
        ],
        DocumentScope.FORM_10Q: [
            r'\b10-?q\b', r'\bquarterly\s+report\b', r'\bquarterly\s+filing\b'
        ],
        DocumentScope.FORM_8K: [
            r'\b8-?k\b', r'\bcurrent\s+report\b', r'\bmaterial\s+event\b'
        ],
        DocumentScope.FORM_DEF14A: [
            r'\bdef\s*14a\b', r'\bproxy\s+statement\b', r'\bproxy\b'
        ],
        DocumentScope.FORM_3: [r'\bform\s+[3]\b', *_INSIDER_PHRASES],
        DocumentScope.FORM_4: [r'\bform\s+[4]\b', *_INSIDER_PHRASES],
        DocumentScope.FORM_5: [r'\bform\s+[5]\b', *_INSIDER_PHRASES],
    }

    def extract(self, query: str) -> DocumentContext:
        """Return the document scope named in ``query``.

        Args:
            query: The user's natural-language question.

        Returns:
            A ``DocumentContext`` for the first scope in
            ``DOCUMENT_PATTERNS`` with a matching pattern, or one with
            ``DocumentScope.ANY`` when nothing matches.
        """
        lowered = query.lower()

        for candidate_scope, scope_patterns in self.DOCUMENT_PATTERNS.items():
            for scope_pattern in scope_patterns:
                if re.search(scope_pattern, lowered):
                    return DocumentContext(scope=candidate_scope)

        return DocumentContext(scope=DocumentScope.ANY)


class MultiDimensionalRouter:
    """Router with full ticker, temporal, and document awareness.

    Wraps the concept-based ``CrossFormRouter`` and post-processes its
    section targets using the ticker, temporal and document context
    extracted from the query text.
    """

    def __init__(self):
        self.base_router = CrossFormRouter()  # Concept-based section router
        self.ticker_extractor = TickerExtractor()
        self.temporal_extractor = TemporalExtractor()
        self.document_extractor = DocumentExtractor()

    def route_query(self, query: str, available_forms: Optional[Dict[str, Dict[str, List['FormChunk']]]] = None) -> 'EnhancedRoutingResult':
        """
        Route query with full multi-dimensional awareness

        Args:
            query: User's natural language query
            available_forms: Dict[ticker][form_type] -> List[chunks]
                           e.g., {"AAPL": {"10-K": [chunks], "10-Q": [chunks]}}
                           Accepted for interface compatibility; the current
                           filtering logic does not consult it.

        Returns:
            Enhanced routing result with context-aware recommendations
        """
        # Extract all context dimensions
        ticker_context = self.ticker_extractor.extract(query)
        temporal_context = self.temporal_extractor.extract(query)
        document_context = self.document_extractor.extract(query)

        query_context = QueryContext(
            ticker_context=ticker_context,
            temporal_context=temporal_context,
            document_context=document_context,
            original_query=query
        )

        # Get base routing from concept-based router
        base_result = self.base_router.route_question(query)

        # Apply multi-dimensional filtering and prioritization
        enhanced_targets = self._apply_contextual_filtering(
            base_result.primary_targets, query_context, available_forms
        )

        # Generate comprehensive reasoning
        reasoning = self._generate_contextual_reasoning(query_context, enhanced_targets)

        return EnhancedRoutingResult(
            query_context=query_context,
            primary_targets=enhanced_targets,
            base_confidence_scores=base_result.confidence_scores,
            reasoning=reasoning,
            recommended_forms=self._get_recommended_forms(query_context),
            recommended_tickers=list(ticker_context.tickers),
            temporal_strategy=self._get_temporal_strategy(temporal_context)
        )

    def _apply_contextual_filtering(self, base_targets: List[SectionIdentifier],
                                  context: QueryContext,
                                  available_forms: Optional[Dict] = None) -> List[SectionIdentifier]:
        """Filter and rank the base router's targets by query context.

        A target is kept when its form type matches the requested document
        scope and its temporal score exceeds 0.3; survivors are ordered by
        contextual score, highest first. When nothing survives (or there
        was nothing to start with) the unfiltered base targets are
        returned instead, extended by a keyword-driven concept fallback.

        Args:
            base_targets: Targets proposed by the concept router. Never
                modified by this method.
            context: Extracted query context.
            available_forms: Currently unused.

        Returns:
            A new list of section targets.
        """
        filtered_targets = []

        for target in base_targets:
            # Document scope filtering
            if self._matches_document_scope(target.form_type, context.document_context):
                # Temporal preference adjustment
                temporal_score = self._get_temporal_score(target.form_type, context.temporal_context)

                if temporal_score > 0.3:  # Threshold for inclusion
                    filtered_targets.append(target)

        # Sort by contextual relevance
        filtered_targets.sort(
            key=lambda t: self._get_contextual_score(t, context),
            reverse=True
        )

        if base_targets and filtered_targets:
            return filtered_targets

        # Fallback for unmatched queries. Work on a copy so the caller's
        # list (the base router's result) is left untouched.
        fallback_targets = list(base_targets)
        query_lower = context.original_query.lower()

        # Common concepts that the base router might have missed.
        # NOTE(review): "over_time" only matches a literal underscore;
        # presumably "over time" was intended - confirm before changing.
        concept_fallbacks = {
            ("r&d", "research", "development", "spending"): "research_development",
            ("risk", "factor", "threat", "challenge"): "risk_factors",
            ("revenue", "sales", "income", "changed", "over_time"): "revenue_analysis"
        }

        # This class defines no ``concept_mappings`` of its own, so reading
        # ``self.concept_mappings`` unconditionally raised AttributeError.
        # Honour an attribute set on the instance if there is one, else use
        # the wrapped router's.
        # NOTE(review): assumes CrossFormRouter exposes ``concept_mappings``
        # with ``.name`` / ``.sections`` entries - confirm.
        concept_mappings = getattr(self, "concept_mappings", None)
        if concept_mappings is None:
            concept_mappings = getattr(self.base_router, "concept_mappings", None) or []

        for keywords, concept_name in concept_fallbacks.items():
            if any(keyword in query_lower for keyword in keywords):
                # Find the matching ConceptMapping and add its sections
                matching_concept = next(
                    (cm for cm in concept_mappings if cm.name == concept_name),
                    None
                )
                if matching_concept:
                    fallback_targets.extend(matching_concept.sections)
                    break

        return fallback_targets

    def _forms_for_scope(self, scope: DocumentScope) -> Optional[List[FormType]]:
        """Return the form types that satisfy ``scope``.

        Returns ``None`` for scopes with no entry (``DocumentScope.ANY``),
        meaning "no restriction known here".

        ``DocumentScope`` has separate FORM_3 / FORM_4 / FORM_5 members and
        no combined one. ``DocumentExtractor`` resolves the shared insider
        phrases to whichever of the three it tries first, so the three
        scopes are treated as one group covering Forms 3, 4 and 5.
        """
        insider_forms = [FormType.FORM_3, FormType.FORM_4, FormType.FORM_5]
        scope_mapping = {
            DocumentScope.FORM_10K: [FormType.FORM_10K],
            DocumentScope.FORM_10Q: [FormType.FORM_10Q],
            DocumentScope.FORM_8K: [FormType.FORM_8K],
            DocumentScope.FORM_DEF14A: [FormType.FORM_DEF14A],
            DocumentScope.FORM_3: insider_forms,
            DocumentScope.FORM_4: insider_forms,
            DocumentScope.FORM_5: insider_forms,
        }
        return scope_mapping.get(scope)

    def _matches_document_scope(self, form_type: FormType, doc_context: DocumentContext) -> bool:
        """Check if form type matches document scope"""
        if doc_context.scope == DocumentScope.ANY:
            return True

        allowed_forms = self._forms_for_scope(doc_context.scope)
        if allowed_forms is None:
            # Unknown scope: do not filter anything out.
            return True

        return form_type in allowed_forms

    def _get_temporal_score(self, form_type: FormType, temporal_context: TemporalContext) -> float:
        """Calculate temporal alignment score.

        Returns a value in (0, 1]: how well ``form_type`` suits the query's
        temporal scope (annual scopes favour 10-K/DEF 14A, quarterly scopes
        favour 10-Q, recent favours 8-K/Form 4, trends treat all alike).
        """
        scope = temporal_context.scope

        # Specific year/quarter preferences
        if scope in [TemporalScope.SPECIFIC_YEAR, TemporalScope.ANNUAL]:
            if form_type in [FormType.FORM_10K, FormType.FORM_DEF14A]:
                return 1.0
            elif form_type == FormType.FORM_10Q:
                return 0.7  # Quarterly can supplement annual
            else:
                return 0.5

        elif scope in [TemporalScope.SPECIFIC_QUARTER, TemporalScope.QUARTERLY]:
            if form_type == FormType.FORM_10Q:
                return 1.0
            elif form_type == FormType.FORM_8K:
                return 0.8  # 8-K can contain quarterly info
            else:
                return 0.4

        elif scope == TemporalScope.RECENT:
            if form_type in [FormType.FORM_8K, FormType.FORM_4]:
                return 1.0
            elif form_type == FormType.FORM_10Q:
                return 0.8
            else:
                return 0.6

        elif scope == TemporalScope.HISTORICAL_TREND:
            # For trends, we want multiple form types
            return 0.9  # All forms valuable for trends

        return 0.7  # Default score

    def _get_contextual_score(self, target: SectionIdentifier, context: QueryContext) -> float:
        """Calculate overall contextual relevance score.

        The temporal score multiplied by two query-wide bonuses (explicit
        document scope, comparison mode). The bonuses are identical for
        every target of one query, so only the temporal score affects the
        relative ordering.
        """
        base_score = 1.0

        # Temporal alignment
        temporal_score = self._get_temporal_score(target.form_type, context.temporal_context)

        # Document specificity bonus
        doc_specificity = 1.2 if context.document_context.scope != DocumentScope.ANY else 1.0

        # Comparison mode adjustments
        comparison_bonus = 1.1 if context.ticker_context.comparison_mode else 1.0

        return base_score * temporal_score * doc_specificity * comparison_bonus

    def _get_recommended_forms(self, context: QueryContext) -> List[FormType]:
        """Get recommended form types based on context.

        An explicit document scope wins; otherwise the recommendation
        follows the temporal scope.
        """
        if context.document_context.scope != DocumentScope.ANY:
            # Specific document requested. Return a copy so callers cannot
            # alter the mapping's lists; unknown scopes yield an empty list.
            return list(self._forms_for_scope(context.document_context.scope) or [])

        # Temporal-based recommendations
        scope = context.temporal_context.scope

        if scope == TemporalScope.HISTORICAL_TREND:
            return [FormType.FORM_10K, FormType.FORM_10Q, FormType.FORM_8K]  # Multiple for trends
        elif scope in [TemporalScope.SPECIFIC_YEAR, TemporalScope.ANNUAL]:
            return [FormType.FORM_10K, FormType.FORM_DEF14A]
        elif scope in [TemporalScope.SPECIFIC_QUARTER, TemporalScope.QUARTERLY]:
            return [FormType.FORM_10Q]
        elif scope == TemporalScope.RECENT:
            return [FormType.FORM_8K, FormType.FORM_4, FormType.FORM_10Q]

        return [FormType.FORM_10K, FormType.FORM_10Q, FormType.FORM_8K]  # Default

    def _get_temporal_strategy(self, temporal_context: TemporalContext) -> str:
        """Get human-readable temporal strategy"""
        scope = temporal_context.scope

        if scope == TemporalScope.SPECIFIC_YEAR:
            return f"Focus on {temporal_context.specific_year} annual filings"
        elif scope == TemporalScope.SPECIFIC_QUARTER:
            return f"Focus on Q{temporal_context.specific_quarter} {temporal_context.specific_year} quarterly filing"
        elif scope == TemporalScope.HISTORICAL_TREND:
            return "Analyze across multiple periods for trend analysis"
        elif scope == TemporalScope.RECENT:
            return "Prioritize most recent filings"
        elif scope == TemporalScope.ANNUAL:
            return "Focus on annual reports (10-K, DEF 14A)"
        elif scope == TemporalScope.QUARTERLY:
            return "Focus on quarterly reports (10-Q)"

        return "Standard temporal prioritization"

    def _generate_contextual_reasoning(self, context: QueryContext,
                                     targets: List[SectionIdentifier]) -> str:
        """Generate comprehensive reasoning including context.

        Joins one sentence fragment per context dimension (tickers,
        temporal strategy, document scope, up to three target sections)
        with ". ".
        """
        reasoning_parts = []

        # Ticker context
        if context.ticker_context.tickers:
            tickers = ", ".join(sorted(context.ticker_context.tickers))
            if context.ticker_context.comparison_mode:
                reasoning_parts.append(f"Comparison analysis requested for: {tickers}")
            else:
                reasoning_parts.append(f"Analysis focused on: {tickers}")

        # Temporal context
        temporal_strategy = self._get_temporal_strategy(context.temporal_context)
        reasoning_parts.append(f"Temporal strategy: {temporal_strategy}")

        # Document context
        if context.document_context.scope != DocumentScope.ANY:
            reasoning_parts.append(f"Document scope: {context.document_context.scope.value}")

        # Target sections
        if targets:
            target_strs = [str(t) for t in targets[:3]]
            reasoning_parts.append(f"Primary sections: {', '.join(target_strs)}")

        return ". ".join(reasoning_parts)

@dataclass
class EnhancedRoutingResult:
    """Routing outcome returned by ``MultiDimensionalRouter.route_query``."""
    # Ticker, temporal and document context extracted from the query.
    query_context: QueryContext
    # Section targets after contextual filtering and ranking.
    primary_targets: List[SectionIdentifier]
    # Confidence scores as reported by the base (concept) router.
    base_confidence_scores: Dict[SectionIdentifier, float]
    # Human-readable explanation of the routing decision.
    reasoning: str
    # Form types worth searching for this query.
    recommended_forms: List[FormType]
    # Ticker symbols taken from the ticker context.
    recommended_tickers: List[str]
    # Human-readable description of the temporal strategy.
    temporal_strategy: str



#### Storage

In [None]:
import sys
import asyncio
import time
import logging
import json
import hashlib
import zlib
from dataclasses import dataclass, field
from contextlib import asynccontextmanager
import threading
from collections import defaultdict
import numpy as np
import asyncpg
import httpx
# from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

from abc import ABC, abstractmethod
from typing import Dict, List, Set, Optional, Protocol, Tuple, Union, Any
from enum import Enum
import calendar
import re
import json
from flask import Flask, request, jsonify
import torch
import pandas as pd
import os
import jsonpickle
from dateutil.parser import parse as dtparse
from datetime import date, datetime, timedelta
from functools import lru_cache

# from chunking import *
# from routing import *
# from utilities import *

from dotenv import load_dotenv
load_dotenv()


# Configure logging: INFO level on the root logger with a
# timestamp / logger name / level / message format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the storage code below.
logger = logging.getLogger(__name__)

###############################################################################
# 1. TTL Cache (Fixes Memory Leak Issue)
###############################################################################

class TTLCache:
    """Thread-safe, size-bounded cache whose entries expire after a fixed TTL.

    Every entry is stored with an absolute expiry timestamp. Expired
    entries are dropped lazily: on ``get`` of that key and on every
    ``put``. When a *new* key would exceed ``max_size``, the entry that
    expires soonest is evicted.

    Args:
        max_size: Maximum number of live entries. A value of zero or less
            disables storage (``put`` becomes a no-op).
        ttl_seconds: Lifetime of an entry, counted from the ``put`` call.
    """

    def __init__(self, max_size: int = 1000, ttl_seconds: int = 3600):
        self.max_size = max_size
        self.ttl_seconds = ttl_seconds
        # key -> (value, absolute expiry timestamp)
        self._cache: Dict[str, Tuple[Any, float]] = {}
        # Re-entrant so a holder of the lock may call back into the cache.
        self._lock = threading.RLock()

    def get(self, key: str) -> Optional[Any]:
        """Return the live value stored under ``key``, or ``None``.

        ``None`` is returned both for a missing key and for an expired one;
        an expired entry is removed as a side effect.
        """
        with self._lock:
            entry = self._cache.get(key)
            if entry is None:
                return None

            value, expiry = entry
            if time.time() < expiry:
                return value

            del self._cache[key]
            return None

    def put(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key`` with a fresh TTL.

        Overwriting an existing key never evicts another entry; only a new
        key can trigger eviction of the soonest-expiring entry.
        """
        with self._lock:
            if self.max_size <= 0:
                # Nothing can be stored; evicting from an empty cache
                # would otherwise raise ValueError.
                return

            # Clean expired entries
            current_time = time.time()
            expired_keys = [
                k for k, (_, expiry) in self._cache.items() if current_time >= expiry
            ]
            for k in expired_keys:
                del self._cache[k]

            # Enforce the size limit only when the cache is about to grow.
            # ``while`` also covers a ``max_size`` lowered after creation.
            if key not in self._cache:
                while len(self._cache) >= self.max_size:
                    oldest_key = min(self._cache, key=lambda k: self._cache[k][1])
                    del self._cache[oldest_key]

            self._cache[key] = (value, current_time + self.ttl_seconds)

    def clear(self) -> None:
        """Remove every entry."""
        with self._lock:
            self._cache.clear()

# Global caches (replaces problematic @lru_cache).
# Module-level TTLCache instances, each with its own size bound and lifetime.
ticker_cache = TTLCache(max_size=500, ttl_seconds=86400)  # 24h TTL
embedding_cache = TTLCache(max_size=100, ttl_seconds=3600)  # 1h TTL
query_cache = TTLCache(max_size=200, ttl_seconds=1800)  # 30min TTL



###############################################################################
# 3. Production Database (Fixes Connection Pool Issues)
###############################################################################

class ProductionDB:
    """Production database with proper connection management"""

    def __init__(self):
        """Create an unconnected instance; call ``initialize()`` before use."""
        # asyncpg connection pool, assigned by initialize().
        self.pool = None
        # NOTE(review): never assigned in the visible part of this class.
        self.sqlite_conn = None
        # Backend name; initialize() sets it to "cockroach" on success.
        self.provider = None

    async def initialize(self) -> bool:
        """Connect to CockroachDB and make sure the schema exists.

        Reads the DSN from the ``COCKROACH_DATABASE_URL`` environment
        variable, opens an asyncpg pool, runs a probe query and creates
        the ``chunks`` table and its indexes if they are missing.

        Returns:
            True when the database is ready. False when any step failed;
            the failure is logged, no exception propagates, and the
            instance is left unconnected (``pool`` and ``provider`` reset
            to ``None``, any pool that was opened is shut down).
        """
        # NOTE(review): when the variable is unset, asyncpg receives
        # dsn=None - confirm that its default connection parameters are
        # the intended behaviour in that case.
        cockroach_url = os.getenv('COCKROACH_DATABASE_URL')
        pool = None
        try:
            pool = await asyncpg.create_pool(
                cockroach_url,
                min_size=5,
                max_size=50,  # CockroachDB handles more connections
                command_timeout=30,
                max_inactive_connection_lifetime=300
            )
            self.pool = pool
            await self._test_connection()
            self.provider = "cockroach"
            logger.info("✅ Connected to CockroachDB")
            await self._create_schema()
            return True
        except Exception as e:
            logger.warning(f"CockroachDB failed: {e}")
            # Do not leave a half-initialised instance behind: a pool that
            # was opened but failed the probe or schema step would leak
            # its connections and look usable to later callers.
            self.pool = None
            self.provider = None
            if pool is not None:
                try:
                    # terminate() is synchronous and does not wait on the
                    # server, so cleanup cannot hang on a dead connection.
                    pool.terminate()
                except Exception as cleanup_error:
                    logger.warning(f"Failed to shut down connection pool: {cleanup_error}")
            return False


    async def _test_connection(self):
        """Run ``SELECT 1`` on a pooled connection to prove the pool works.

        Any exception from acquiring the connection or running the query
        propagates to the caller.
        """
        async with self.pool.acquire() as conn:
            await conn.fetchval("SELECT 1")

    async def _create_schema(self):
        """Create the ``chunks`` table and its indexes if they do not exist.

        The table has one column per ``FormChunk`` field plus ``embedding``
        (BYTEA) and ``created_at``. Every statement uses ``IF NOT EXISTS``,
        so the method can be run repeatedly. The whole script is sent in a
        single ``execute`` call.
        """
        schema = """
        CREATE TABLE IF NOT EXISTS chunks (
            chunk_id VARCHAR(255) PRIMARY KEY,
            form_type VARCHAR(50) NOT NULL,
            section_type VARCHAR(100) NOT NULL,
            section_number VARCHAR(50) NOT NULL,
            section_title TEXT NOT NULL,
            content TEXT NOT NULL,
            start_pos INTEGER NOT NULL,
            end_pos INTEGER NOT NULL,

            -- Your metadata structure preserved exactly
            cik VARCHAR(20) NOT NULL,
            ticker VARCHAR(10) NOT NULL,
            filing_date DATE,
            fiscal_year INTEGER,
            fiscal_quarter INTEGER,
            content_type VARCHAR(50),
            char_count INTEGER NOT NULL,

            filing_url TEXT,
            document_url TEXT,

            -- Attachment fields from your system
            parent_form_type VARCHAR(50),
            parent_filing_date DATE,
            attachment_number VARCHAR(50),
            attachment_description TEXT,
            is_attachment BOOLEAN DEFAULT FALSE,
            attachment_type VARCHAR(50),

            -- Vector storage (quantized)
            embedding BYTEA,
            created_at TIMESTAMP DEFAULT NOW()
        );

        -- Indexes optimized for your routing queries
        CREATE INDEX IF NOT EXISTS idx_routing_main ON chunks(ticker, form_type, fiscal_year DESC);
        CREATE INDEX IF NOT EXISTS idx_sections ON chunks(form_type, section_type, section_number);
        CREATE INDEX IF NOT EXISTS idx_temporal ON chunks(fiscal_year DESC, fiscal_quarter DESC);
        CREATE INDEX IF NOT EXISTS idx_content_type ON chunks(content_type);
        """
        async with self.pool.acquire() as conn:
            await conn.execute(schema)

    @asynccontextmanager
    async def get_connection(self):
        """Yield a pooled connection for the duration of an ``async with`` block.

        The connection is returned to the pool when the block exits. Any
        exception is logged and re-raised. Because the ``yield`` sits
        inside the ``try``, this includes exceptions raised by the
        caller's own code inside the block, not only failures to acquire
        a connection.
        """
        try:
            async with self.pool.acquire() as conn:
                yield conn
        except Exception as e:
            logger.error(f"Database connection failed: {e}")
            raise

    # Add this helper function to your production code
    def convert_date_fields(self, chunk: FormChunk) -> tuple:
        """Return ``(filing_date, parent_filing_date)`` in a form asyncpg accepts.

        String values are parsed into ``datetime.date``. Strings containing
        ``'T'`` are read as ISO-8601 timestamps (a trailing ``Z`` is
        treated as UTC); other strings are tried against ``YYYY-MM-DD``,
        ``MM/DD/YYYY`` and ``YYYY-MM-DD HH:MM:SS``. Empty values and
        unparseable strings become ``None``. Non-string values are
        returned unchanged. The chunk itself is not modified.
        """
        from datetime import datetime

        plain_layouts = ('%Y-%m-%d', '%m/%d/%Y', '%Y-%m-%d %H:%M:%S')

        def to_date(raw):
            # Missing / empty value.
            if not raw:
                return None
            # Already a date-like object: hand it back untouched.
            if not isinstance(raw, str):
                return raw
            # ISO timestamp, e.g. "2023-01-15T00:00:00Z".
            if 'T' in raw:
                try:
                    return datetime.fromisoformat(raw.replace('Z', '+00:00')).date()
                except (ValueError, TypeError):
                    return None
            # Plain date strings: first layout that parses wins.
            for layout in plain_layouts:
                try:
                    parsed = datetime.strptime(raw, layout)
                except ValueError:
                    continue
                return parsed.date()
            return None

        return to_date(chunk.filing_date), to_date(chunk.parent_filing_date)


    async def store_chunks_batch(self, chunks: List[FormChunk], embeddings: List[bytes]) -> int:
        """Upsert ``chunks`` with their embeddings in one atomic batch.

        Rows are keyed by ``chunk_id``. When a row already exists only its
        ``embedding`` and ``created_at`` are refreshed. The batch is sent
        with ``executemany``, so either every row is written or none is.
        The ``FormChunk`` objects are not modified; date strings are
        converted on the way into the database only.

        Args:
            chunks: Chunks to store.
            embeddings: Serialized embedding for each chunk, same order
                and length as ``chunks``.

        Returns:
            Number of rows written. 0 when ``chunks`` is empty or the two
            lists differ in length (the mismatch is logged).

        Raises:
            Exception: Any database error, after it has been logged.
        """
        if not chunks:
            return 0
        if len(chunks) != len(embeddings):
            logger.warning(
                f"Refusing to store chunks: {len(chunks)} chunks but "
                f"{len(embeddings)} embeddings"
            )
            return 0

        insert_sql = """
            INSERT INTO chunks (
                chunk_id, form_type, section_type, section_number, section_title,
                content, start_pos, end_pos, cik, ticker, filing_date, fiscal_year,
                fiscal_quarter, content_type, char_count, filing_url, document_url,
                parent_form_type, parent_filing_date, attachment_number,
                attachment_description, is_attachment, attachment_type, embedding
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24)
            ON CONFLICT (chunk_id) DO UPDATE SET
                embedding = EXCLUDED.embedding,
                created_at = NOW()
        """

        try:
            # Build every parameter tuple before taking a connection so the
            # connection is held only for the actual write.
            rows = []
            for chunk, embedding in zip(chunks, embeddings):
                # asyncpg needs date objects, not strings, for DATE columns.
                filing_date, parent_filing_date = self.convert_date_fields(chunk)
                rows.append((
                    chunk.chunk_id, chunk.form_type, chunk.section_type,
                    chunk.section_number, chunk.section_title, chunk.content,
                    chunk.start_pos, chunk.end_pos, chunk.cik, chunk.ticker,
                    filing_date, chunk.fiscal_year, chunk.fiscal_quarter,
                    chunk.content_type, chunk.char_count, chunk.filing_url,
                    chunk.document_url, chunk.parent_form_type, parent_filing_date,
                    chunk.attachment_number, chunk.attachment_description,
                    chunk.is_attachment, chunk.attachment_type, embedding,
                ))

            async with self.get_connection() as conn:
                await conn.executemany(insert_sql, rows)

        except Exception as e:
            logger.error(f"Failed to store chunks: {e}")
            raise

        return len(rows)

    async def search_with_routing(self, routing_result: EnhancedRoutingResult, limit: int = 20) -> List[FormChunk]:
        """Fetch candidate chunks that match a routing result.

        Builds a filtered ``SELECT`` over the ``chunks`` table from the
        router's recommended tickers, recommended form types, specific fiscal
        year and (at most the first three) primary section targets, then
        rebuilds ``FormChunk`` objects from the returned rows.

        Every filter value is passed as a bound parameter (``?`` for the
        sqlite provider, ``$n`` otherwise); nothing taken from the routing
        result is spliced into the SQL text.

        Args:
            routing_result: Router output. Reads ``recommended_tickers``,
                ``recommended_forms``, ``primary_targets`` and
                ``query_context.temporal_context.specific_year``.
            limit: Maximum number of rows to return.

        Returns:
            Matching chunks ordered by fiscal year (descending, NULLs last)
            and then by descending character count.

        Raises:
            Exception: Any error raised while querying or rebuilding the
                chunks is logged and re-raised unchanged.
        """
        is_sqlite = self.provider == "sqlite"
        conditions = ["1=1"]
        params = []

        def bind(value) -> str:
            """Register ``value`` as a query parameter and return its placeholder."""
            params.append(value)
            # sqlite placeholders are positional; the other provider numbers
            # them, and the number is the parameter's 1-based position.
            return "?" if is_sqlite else f"${len(params)}"

        def membership(column: str, values) -> str:
            """Return a "column is one of values" condition for the provider."""
            if is_sqlite:
                return f"{column} IN ({','.join(bind(v) for v in values)})"
            # The non-sqlite path sends the whole list as one array parameter.
            return f"{column} = ANY({bind(list(values))})"

        # Ticker filter
        if routing_result.recommended_tickers:
            conditions.append(membership("ticker", routing_result.recommended_tickers))

        # Form type filter
        if routing_result.recommended_forms:
            form_values = [f.value for f in routing_result.recommended_forms]
            conditions.append(membership("form_type", form_values))

        # Fiscal year filter
        specific_year = routing_result.query_context.temporal_context.specific_year
        if specific_year:
            conditions.append(f"fiscal_year = {bind(specific_year)}")

        # Section targeting: any of the first three primary targets may match.
        section_clauses = []
        for target in (routing_result.primary_targets or [])[:3]:
            form_ph = bind(target.form_type.value)
            # format() yields the same text an f-string would embed, so the
            # compared value is unchanged; it is now bound, not interpolated.
            section_ph = bind(format(target.section_type))
            section_clauses.append(
                f"(form_type = {form_ph} AND section_type = {section_ph})"
            )
        if section_clauses:
            conditions.append(f"({' OR '.join(section_clauses)})")

        # Coerce so that only an integer can ever reach the SQL text.
        row_limit = int(limit)

        query = f"""
            SELECT chunk_id, form_type, section_type, section_number, section_title,
                   content, start_pos, end_pos, cik, ticker, filing_date, fiscal_year,
                   fiscal_quarter, content_type, char_count, filing_url, document_url,
                   parent_form_type, parent_filing_date, attachment_number,
                   attachment_description, is_attachment, attachment_type, embedding
            FROM chunks
            WHERE {' AND '.join(conditions)}
            ORDER BY fiscal_year DESC NULLS LAST, char_count DESC
            LIMIT {row_limit}
        """

        # Columns copied into FormChunk; "embedding" is selected but is not a
        # FormChunk field, so it is left out here.
        chunk_fields = (
            "form_type", "section_type", "section_number", "section_title",
            "content", "start_pos", "end_pos", "cik", "ticker", "filing_date",
            "fiscal_year", "fiscal_quarter", "chunk_id", "content_type",
            "char_count", "filing_url", "document_url", "parent_form_type",
            "parent_filing_date", "attachment_number", "attachment_description",
            "is_attachment", "attachment_type",
        )

        chunks = []
        try:
            async with self.get_connection() as conn:
                if is_sqlite:
                    async with conn.cursor() as cursor:
                        await cursor.execute(query, params)
                        raw_rows = await cursor.fetchall()
                        column_names = [desc[0] for desc in cursor.description]
                        rows = [dict(zip(column_names, raw)) for raw in raw_rows]
                else:
                    rows = [dict(record) for record in await conn.fetch(query, *params)]

                for row in rows:
                    chunks.append(FormChunk(**{name: row[name] for name in chunk_fields}))

        except Exception as e:
            logger.error(f"Search failed: {e}")
            raise

        return chunks

###############################################################################
# 4. Memory-Efficient Vector Store (Fixes Memory Issues)
###############################################################################

class VectorStore:
    """Embedding helper that stores vectors as int8-quantized, zlib-compressed bytes.

    The sentence-transformers model is created lazily on first use.
    Embeddings are looked up in / written to the module-level
    ``embedding_cache`` keyed by the MD5 hex digest of the input text.
    """

    # Checkpoint loaded by ``_get_model``.
    MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
    # Length of the zero vector stored when encoding a text fails
    # (presumably the output size of MODEL_NAME -- TODO confirm).
    EMBEDDING_DIM = 384
    # Only this many leading characters of a text are passed to the encoder.
    MAX_TEXT_CHARS = 2048
    # Content types whose lexical score is multiplied by TYPE_BOOST.
    BOOSTED_CONTENT_TYPES = frozenset({'risk_factors', 'mda', 'business_overview'})
    TYPE_BOOST = 1.2

    def __init__(self):
        self.model = None
        self._device = "cpu"  # Conservative default

    def _get_model(self) -> "SentenceTransformer":
        """Return the encoder, constructing it on the first call."""
        if self.model is None:
            self.model = SentenceTransformer(self.MODEL_NAME)
        return self.model

    def _quantize_embedding(self, embedding: np.ndarray) -> bytes:
        """L2-normalize ``embedding``, quantize it to int8 and zlib-compress it.

        int8 uses one byte per component instead of four for float32.
        """
        # The epsilon keeps an all-zero vector from dividing by zero.
        normalized = embedding / (np.linalg.norm(embedding) + 1e-8)
        # Round to nearest; a bare astype() would truncate toward zero and
        # bias every component, doubling the worst-case quantization error.
        quantized = np.rint(normalized * 127).astype(np.int8)
        return zlib.compress(quantized.tobytes(), level=6)

    def _dequantize_embedding(self, compressed_bytes: bytes) -> np.ndarray:
        """Invert ``_quantize_embedding``: decompress and rescale to float32."""
        raw = zlib.decompress(compressed_bytes)
        return np.frombuffer(raw, dtype=np.int8).astype(np.float32) / 127.0

    async def embed_texts_batch(self, texts: List[str]) -> List[bytes]:
        """Return one quantized embedding per input text, in input order.

        Cached embeddings are reused. Uncached texts are truncated to
        ``MAX_TEXT_CHARS`` characters and encoded one at a time in the
        default executor so the event loop is not blocked. If encoding a
        text raises, a warning is logged and a quantized zero vector is
        returned for that text (and not cached).
        """
        # Always called from a coroutine, so a running loop is guaranteed.
        loop = asyncio.get_running_loop()
        embeddings = []

        for text in texts:
            cache_key = hashlib.md5(text.encode()).hexdigest()
            cached = embedding_cache.get(cache_key)
            if cached:
                embeddings.append(cached)
                continue

            model = self._get_model()
            truncated = text[:self.MAX_TEXT_CHARS]  # Prevent memory issues

            try:
                # Bind the text as a default so the callable does not depend
                # on the loop variable's later values.
                vector = await loop.run_in_executor(
                    None,
                    lambda t=truncated: model.encode([t], show_progress_bar=False)[0]
                )
                quantized = self._quantize_embedding(vector)
                embedding_cache.put(cache_key, quantized)
            except Exception as e:
                logger.warning(f"Embedding failed for text: {e}")
                # Fallback: zero embedding
                quantized = self._quantize_embedding(
                    np.zeros(self.EMBEDDING_DIM, dtype=np.float32)
                )
            embeddings.append(quantized)

        return embeddings

    async def similarity_search(
        self,
        query_text: str,
        candidate_chunks: List["FormChunk"],
        top_k: int = 10
    ) -> List[Tuple["FormChunk", float]]:
        """Rank candidate chunks against the query and return the best ``top_k``.

        The score is lexical: the fraction of distinct lower-cased,
        whitespace-separated query words that also occur in the chunk
        content, multiplied by ``TYPE_BOOST`` for chunks whose
        ``content_type`` is in ``BOOSTED_CONTENT_TYPES``. No embedding is
        computed here: the chunks carry no vectors to compare against, so
        embedding the query would only load the model and discard the result.

        Returns:
            ``(chunk, score)`` pairs sorted by descending score; ties keep
            the order of ``candidate_chunks``. Empty if there are no
            candidates.
        """
        if not candidate_chunks:
            return []

        # Loop-invariant: tokenize the query once.
        query_words = set(query_text.lower().split())
        denominator = max(len(query_words), 1)

        scored_chunks = []
        for chunk in candidate_chunks:
            content_words = set(chunk.content.lower().split())
            overlap = len(query_words.intersection(content_words))

            type_boost = self.TYPE_BOOST if chunk.content_type in self.BOOSTED_CONTENT_TYPES else 1.0
            scored_chunks.append((chunk, (overlap / denominator) * type_boost))

        scored_chunks.sort(key=lambda pair: pair[1], reverse=True)
        return scored_chunks[:top_k]


###############################################################################
# 6. Production Ingestion Pipeline (Uses Your Logic Exactly)
###############################################################################

class IngestionPipeline:
    """Fetches a company's filings, chunks them, embeds the chunks and stores them."""

    # Form types ingested when the caller does not specify any.
    DEFAULT_FORMS = ("10-K", "10-Q", "8-K")
    # Filings whose stripped text is shorter than this are skipped.
    MIN_TEXT_CHARS = 10

    def __init__(self, db: ProductionDB, vector_store: VectorStore):
        self.db = db
        self.vector_store = vector_store

    async def _ingest_filing(self, filing) -> Optional[int]:
        """Chunk, embed and store one filing.

        Returns the count reported by ``store_chunks_batch``, or ``None``
        when the filing was skipped because it has (almost) no text.
        """
        main_text = filing.text()
        if len(main_text.strip()) < self.MIN_TEXT_CHARS:
            return None

        base_meta = build_base_metadata(filing, main_text)

        # chunk_form receives the form type positionally, so it must not
        # also arrive through the metadata keyword arguments.
        main_meta = base_meta.copy()
        main_meta.pop('form_type', None)
        main_chunks = chunk_form(main_text, filing.form, **main_meta)

        attachment_chunks = process_filing_attachments(filing, base_meta)
        all_chunks = main_chunks + attachment_chunks

        chunk_texts = [chunk.content[:2048] for chunk in all_chunks]
        embeddings = await self.vector_store.embed_texts_batch(chunk_texts)

        return await self.db.store_chunks_batch(all_chunks, embeddings)

    async def ingest_company(
        self,
        ticker: str,
        forms: Optional[List[str]] = None,
        limit: int = 5
    ) -> Dict[str, Any]:
        """Ingest up to ``limit`` filings of the given form types for ``ticker``.

        Args:
            ticker: Company ticker passed to ``Company``.
            forms: Form types to fetch; defaults to ``DEFAULT_FORMS``.
            limit: Maximum number of filings to process. (Previously this
                argument was accepted but a fixed 5 was always used.)

        Returns:
            A dict with ``ticker``, ``chunks_processed``,
            ``filings_processed``, ``errors`` (list of messages) and
            ``processing_time`` (seconds). Failures are recorded in
            ``errors`` rather than raised.
        """
        if forms is None:
            forms = list(self.DEFAULT_FORMS)

        start_time = time.time()
        result = {
            "ticker": ticker,
            "chunks_processed": 0,
            "filings_processed": 0,
            "errors": []
        }

        try:
            company = Company(ticker)
            filings = company.get_filings(form=forms).head(limit)

            logger.info(f"Processing {len(filings)} filings for {ticker}")

            for filing in filings:
                try:
                    stored_count = await self._ingest_filing(filing)
                    if stored_count is None:
                        continue

                    result["chunks_processed"] += stored_count
                    result["filings_processed"] += 1

                    logger.info(f"Processed {filing.form} for {ticker}: {stored_count} chunks")

                except Exception as e:
                    # One bad filing must not abort the rest of the batch.
                    result["errors"].append(f"Failed to process {filing.form}: {str(e)}")
                    continue

        except Exception as e:
            result["errors"].append(f"Failed to get filings for {ticker}: {str(e)}")

        result["processing_time"] = time.time() - start_time
        return result


#### Pipeline

In [None]:
import sys
import asyncio
import time
import logging
import json
import hashlib
import zlib
from dataclasses import dataclass, field
from contextlib import asynccontextmanager
import threading
from collections import defaultdict
import numpy as np
import asyncpg
import httpx

from abc import ABC, abstractmethod
from typing import Dict, List, Set, Optional, Protocol, Tuple, Union, Any
from enum import Enum
import calendar
import re
import json
from flask import Flask, request, jsonify
import torch
import pandas as pd
import os
import jsonpickle
from dateutil.parser import parse as dtparse
from datetime import date, datetime, timedelta
from functools import lru_cache

# from chunking import *
# from routing import *
# from utilities import *
# from storage import *

# from groq import AsyncGroq
from google import genai

from dotenv import load_dotenv
load_dotenv()


@dataclass
class LLMResponse:
    """Outcome of one text-generation attempt by an LLM provider."""

    # Generated text, or a human-readable message when generation failed.
    text: str
    # Label of the source of ``text`` (this file uses "gemini", "error"
    # and "fallback").
    provider: str
    # False when ``text`` is an error/fallback message rather than a model answer.
    success: bool = True

###############################################################################
# 7. Complete QA Engine (Uses Your Router Exactly)
###############################################################################

@dataclass
class QAResult:
    """Answer to one question together with its provenance and diagnostics."""

    # Answer text shown to the user (may be an apology/diagnostic message).
    answer: str
    # One metadata dict per chunk supplied to the LLM as context.
    sources: List[Dict[str, Any]]
    # Heuristic confidence score; values used in this file lie in [0.0, 1.0].
    confidence: float
    # Router diagnostics, e.g. reasoning text or an error message.
    routing_info: Dict[str, Any]
    # Provider label of the LLM response, or "none" when no LLM call was made.
    provider_used: str
    # Wall-clock seconds spent producing this result.
    processing_time: float


class LLMProvider:
    """Gemini-backed text generation with a canned fallback reply.

    ``generate`` never raises for provider failures: it returns an
    ``LLMResponse`` whose ``success`` flag is False instead. No rate
    limiting is applied by this class.
    """

    # Model identifier sent with every generation request.
    MODEL_NAME = "gemini-2.5-flash"

    def __init__(self):
        self.gemini_client = genai.Client()
        # Not read by any method of this class; kept for compatibility.
        self._last_request = {}

    @staticmethod
    def _flatten_messages(messages: List[Dict[str, str]]) -> str:
        """Collapse chat messages into a single prompt string.

        System messages are joined with newlines and placed first, followed
        by a blank line and the newline-joined user messages. Messages with
        any other role are ignored.
        """
        system_part = "\n".join(m["content"] for m in messages if m["role"] == "system")
        user_part = "\n".join(m["content"] for m in messages if m["role"] == "user")
        return f"{system_part}\n\n{user_part}" if system_part else user_part

    async def generate(self, messages: List[Dict[str, str]]) -> LLMResponse:
        """Generate a reply for ``messages``.

        Args:
            messages: Chat messages, each a dict with ``role`` and
                ``content`` keys.

        Returns:
            * provider ``"error"``, ``success=False`` when the
              ``GEMINI_API_KEY`` environment variable is unset or empty;
            * provider ``"gemini"`` with the model's text on success;
            * provider ``"fallback"``, ``success=False`` when the request
              raises or yields no text.
        """
        if not os.getenv("GEMINI_API_KEY"):
            return LLMResponse(
                text="No API key configured for LLM provider.",
                provider="error",
                success=False
            )

        try:
            prompt = self._flatten_messages(messages)

            # The client call used here is synchronous; run it in a worker
            # thread so the event loop stays responsive while it waits.
            response = await asyncio.to_thread(
                self.gemini_client.models.generate_content,
                model=self.MODEL_NAME,
                contents=prompt,
            )

            answer_text = response.text
            if answer_text:
                return LLMResponse(text=answer_text, provider="gemini")

            # An empty/None text is not a usable answer; do not report it
            # as a success.
            logger.warning("Gemini failed: response contained no text")

        except Exception as e:
            logger.warning(f"Gemini failed: {e}")

        return LLMResponse(
            text="I apologize, but I'm experiencing technical difficulties with LLM providers. Please try again later.",
            provider="fallback",
            success=False
        )


class SECQASystem:
    """Facade that wires the database, vector store, LLM, ingestion and QA engine.

    ``initialize()`` must succeed before any other public coroutine is
    used; those methods raise ``RuntimeError`` otherwise.
    """

    # Form types ingested when the caller does not specify any.
    DEFAULT_FORMS = ("10-K", "10-Q", "8-K")

    def __init__(self):
        self.db = ProductionDB()
        self.vector_store = VectorStore()
        self.llm = LLMProvider()
        # Created in initialize(), once the database is ready.
        self.ingestion = None
        self.qa = None
        self.initialized = False

    async def initialize(self) -> bool:
        """Initialize the database and build the dependent components.

        Returns:
            True on success; False (after logging) if database
            initialization reports failure or anything raises.
        """
        try:
            if not await self.db.initialize():
                logger.error("Database initialization failed")
                return False

            self.ingestion = IngestionPipeline(self.db, self.vector_store)
            self.qa = QAEngine(self.db, self.vector_store, self.llm)

            self.initialized = True
            logger.info("✅ SEC QA System initialized successfully")
            return True

        except Exception as e:
            logger.error(f"Initialization failed: {e}")
            return False

    def _check_initialized(self):
        """Raise ``RuntimeError`` unless ``initialize()`` has succeeded."""
        if not self.initialized:
            raise RuntimeError("System not initialized. Call initialize() first.")

    async def ingest_companies(
        self,
        tickers: List[str],
        forms: Optional[List[str]] = None,
        limit_per_company: int = 5
    ) -> Dict[str, Any]:
        """Ingest several companies one after another.

        Args:
            tickers: Tickers to ingest, processed in order.
            forms: Form types to fetch; defaults to ``DEFAULT_FORMS``.
            limit_per_company: Forwarded to ``ingest_company`` as its limit.

        Returns:
            Mapping of ticker to that company's ingestion result dict. A
            company whose ingestion raises gets a result with zero counts
            and a single error message.
        """
        self._check_initialized()

        # Resolve the default here so an explicit list is always passed on.
        if forms is None:
            forms = list(self.DEFAULT_FORMS)
        tickers = list(tickers)
        last_index = len(tickers) - 1

        results = {}

        for index, ticker in enumerate(tickers):
            try:
                outcome = await self.ingestion.ingest_company(ticker, forms, limit_per_company)
                results[ticker] = outcome

                logger.info(f"Ingested {ticker}: {outcome['chunks_processed']} chunks")

                # Brief pause between companies; nothing follows the last one.
                if index < last_index:
                    await asyncio.sleep(1)

            except Exception as e:
                results[ticker] = {
                    "ticker": ticker,
                    "chunks_processed": 0,
                    "filings_processed": 0,
                    "errors": [f"Failed to process {ticker}: {str(e)}"],
                    "processing_time": 0
                }

        total_chunks = sum(r.get('chunks_processed', 0) for r in results.values())
        logger.info(f"✅ Batch ingestion complete: {total_chunks} total chunks")

        return results

    async def ask_question(self, question: str, max_sources: int = 8) -> QAResult:
        """Answer ``question`` by delegating to the QA engine."""
        self._check_initialized()
        return await self.qa.ask(question, max_sources)

    async def get_status(self) -> Dict[str, Any]:
        """Report row counts, cache sizes and a static feature list.

        Returns:
            A dict with ``status`` "healthy" plus statistics, or ``status``
            "error" plus the error message if gathering them raises.
        """
        self._check_initialized()

        try:
            async with self.db.get_connection() as conn:
                chunk_count = await conn.fetchval("SELECT COUNT(*) FROM chunks")
                ticker_count = await conn.fetchval("SELECT COUNT(DISTINCT ticker) FROM chunks")

            cache_stats = {
                "ticker_cache_size": len(ticker_cache._cache),
                "embedding_cache_size": len(embedding_cache._cache),
                "query_cache_size": len(query_cache._cache)
            }

            return {
                "status": "healthy",
                "database_provider": self.db.provider,
                "chunks_stored": chunk_count,
                "companies_indexed": ticker_count,
                "cache_stats": cache_stats,
                "features": [
                    "Your FormChunk chunking with full metadata",
                    "Your MultiDimensionalRouter with context awareness",
                    "TTL caching to prevent memory leaks",
                    "SEC rate limiting",
                    "Quantized embeddings for efficiency",
                    "Multi-provider LLM fallback"
                ]
            }
        except Exception as e:
            return {
                "status": "error",
                "error": str(e)
            }


class QAEngine:
    """Question answering over stored filing chunks: route, retrieve, rank, generate."""

    # Filing years fetched by smart_ingest_for_query when the query names
    # no specific year.
    # NOTE(review): these are hard-coded calendar years and will go stale;
    # confirm whether they should be derived from the current date.
    _HISTORICAL_YEARS = (2020, 2021, 2022, 2023, 2024)
    _RECENT_YEARS = (2023, 2024)
    # Filings whose stripped text is shorter than this are skipped.
    _MIN_FILING_CHARS = 100

    def __init__(self, db: ProductionDB, vector_store: VectorStore, llm: LLMProvider):
        self.db = db
        self.vector_store = vector_store
        self.llm = llm
        self.router = MultiDimensionalRouter()

    async def _ingest_filing(self, filing) -> Optional[int]:
        """Chunk, embed and store one filing.

        Returns the count reported by ``store_chunks_batch``, or ``None``
        when the filing is skipped for having too little text.
        """
        main_text = filing.text()
        if len(main_text.strip()) < self._MIN_FILING_CHARS:
            return None

        base_meta = build_base_metadata(filing, main_text)
        # chunk_form receives the form type positionally, so it must not
        # also arrive through the metadata keyword arguments.
        main_meta = base_meta.copy()
        main_meta.pop('form_type', None)
        main_chunks = chunk_form(main_text, filing.form, **main_meta)
        attachment_chunks = process_filing_attachments(filing, base_meta)

        all_chunks = main_chunks + attachment_chunks

        chunk_texts = [chunk.content for chunk in all_chunks]
        # embed_texts_batch is the vector store's (async) embedding entry
        # point; the earlier call to a non-existent ``embed_texts`` raised
        # for every filing, so nothing was ever stored.
        embeddings = await self.vector_store.embed_texts_batch(chunk_texts)

        return await self.db.store_chunks_batch(all_chunks, embeddings)

    async def smart_ingest_for_query(self, question: str) -> Dict[str, Any]:
        """Fetch and store filings the router considers relevant to ``question``.

        Routes the question, picks the filing years to fetch (the specific
        year if the router found one, otherwise the historical or recent
        year list depending on the temporal scope) and ingests every
        matching filing of the recommended forms for each recommended
        ticker. Per-filing, per-year and per-ticker failures are logged and
        skipped.

        Returns:
            Dict with ``new_chunks`` (total stored), ``router_reasoning``
            and ``temporal_strategy``.
        """
        routing_result = self.router.route_query(question)

        temporal_ctx = routing_result.query_context.temporal_context
        tickers = routing_result.recommended_tickers
        forms = [f.value for f in routing_result.recommended_forms]

        logger.info(f"🎯 Router identified need for: {tickers}, {forms}, {temporal_ctx.scope.value}")

        # Existing coverage per (ticker, fiscal_year).
        # NOTE(review): this only feeds the log line below; it does not yet
        # narrow which years are fetched -- confirm the intended use.
        async with self.db.pool.acquire() as conn:
            existing_data = await conn.fetch("""
                SELECT ticker, fiscal_year, COUNT(*) as chunk_count
                FROM chunks
                WHERE ticker = ANY($1) AND form_type = ANY($2)
                GROUP BY ticker, fiscal_year
                ORDER BY fiscal_year DESC
            """, tickers, forms)
        logger.info(f"Existing coverage: {len(existing_data)} ticker/year groups")

        if temporal_ctx.specific_year:
            needed_years = {temporal_ctx.specific_year}
        elif temporal_ctx.scope.value == "historical":
            needed_years = set(self._HISTORICAL_YEARS)
        else:
            needed_years = set(self._RECENT_YEARS)

        total_new_chunks = 0

        for ticker in tickers:
            try:
                company = Company(ticker)

                # Newest year first, in a deterministic order.
                for year in sorted(needed_years, reverse=True):
                    try:
                        # NOTE(review): no per-year cap is applied to the
                        # number of filings returned here.
                        year_filings = company.get_filings(
                            form=forms,
                            filing_date=f"{year}-01-01:{year}-12-31"
                        )

                        for filing in year_filings:
                            try:
                                stored_count = await self._ingest_filing(filing)
                                if stored_count is None:
                                    continue
                                total_new_chunks += stored_count

                                logger.info(f"📊 {ticker} {year}: +{stored_count} chunks")

                            except Exception as e:
                                logger.error(f"Error processing {ticker} {year} {filing.form}: {e}")
                                continue

                    except Exception as e:
                        logger.warning(f"No {year} data for {ticker}: {e}")
                        continue

            except Exception as e:
                logger.error(f"Error fetching {ticker}: {e}")
                continue

        return {
            "new_chunks": total_new_chunks,
            "router_reasoning": routing_result.reasoning,
            "temporal_strategy": temporal_ctx.scope.value
        }

    @staticmethod
    def _no_answer(answer: str, confidence: float, routing_info: Dict[str, Any],
                   start_time: float) -> QAResult:
        """Build a source-less result for the paths that never reach the LLM."""
        return QAResult(
            answer=answer,
            sources=[],
            confidence=confidence,
            routing_info=routing_info,
            provider_used="none",
            processing_time=time.time() - start_time
        )

    @staticmethod
    def _build_context(ranked_chunks: List[Tuple[Any, float]]) -> Tuple[str, List[Dict[str, Any]]]:
        """Turn ranked chunks into the LLM context text and the source list.

        Each chunk becomes a numbered ``[i] TICKER FORM (FY.. Q..) - title``
        header followed by its full content; the same number is the ``id``
        of the matching source entry.
        """
        context_parts = []
        sources = []

        for i, (chunk, score) in enumerate(ranked_chunks, 1):
            header = f"[{i}] {chunk.ticker} {chunk.form_type}"

            if chunk.fiscal_year:
                header += f" (FY{chunk.fiscal_year}"
                if chunk.fiscal_quarter:
                    header += f" Q{chunk.fiscal_quarter}"
                header += ")"

            header += f" - {chunk.section_title}"
            context_parts.append(f"{header}:\n{chunk.content}")

            sources.append({
                "id": str(i),
                "ticker": chunk.ticker,
                "form_type": chunk.form_type,
                "fiscal_year": chunk.fiscal_year,
                "fiscal_quarter": chunk.fiscal_quarter,
                "section": f"{chunk.section_type} {chunk.section_number}".strip(),
                "section_title": chunk.section_title,
                "filing_date": chunk.filing_date,
                "content_type": chunk.content_type,
                "document_url": chunk.document_url or chunk.filing_url,
                "is_attachment": chunk.is_attachment,
                "similarity_score": score
            })

        return "\n\n".join(context_parts), sources

    async def ask(self, question: str, max_sources: int = 8) -> QAResult:
        """Answer ``question`` from stored filings.

        Steps: return a cached result if present; route the question;
        retrieve candidate chunks; if fewer than half of ``max_sources``
        were found, try on-demand ingestion and retrieve again; rank the
        candidates; build the context; ask the LLM.

        Only answers from a successful LLM call are cached, so a transient
        provider failure is not replayed on the next identical question.
        A failure of the on-demand ingestion is logged and the answer is
        built from whatever was already retrieved.

        Args:
            question: Natural-language question.
            max_sources: Maximum number of chunks given to the LLM.

        Returns:
            A ``QAResult``; diagnostic answers with empty ``sources`` are
            returned when routing, retrieval or ranking yields nothing.
        """
        start_time = time.time()

        cache_key = hashlib.md5(question.encode()).hexdigest()
        cached_result = query_cache.get(cache_key)
        if cached_result:
            logger.info(f"Cache hit for question: {question[:50]}...")
            return cached_result

        logger.info(f"Processing question: {question}")

        # 1. Routing
        routing_result = self.router.route_query(question)
        reasoning_info = {"reasoning": routing_result.reasoning}

        logger.info(f"Routing: {routing_result.reasoning}")

        if not routing_result.primary_targets:
            return self._no_answer(
                "I couldn't identify specific sections relevant to your question. Please try rephrasing.",
                0.0, reasoning_info, start_time
            )

        # 2. Retrieval
        try:
            candidate_chunks = await self.db.search_with_routing(routing_result, limit=max_sources * 2)
        except Exception as e:
            logger.error(f"Database search failed: {e}")
            return self._no_answer(
                "I experienced a technical issue accessing the database. Please try again.",
                0.2, {"error": str(e)}, start_time
            )

        # 2.5 Too few candidates: try to ingest targeted data, then retry.
        if len(candidate_chunks) < max_sources // 2:
            logger.info("🎯 Insufficient data - using router to fetch targeted data")
            try:
                ingest_result = await self.smart_ingest_for_query(question)
                logger.info(f"✅ Router-driven ingestion: +{ingest_result['new_chunks']} chunks")

                candidate_chunks = await self.db.search_with_routing(routing_result, limit=max_sources * 2)
            except Exception as e:
                # Best effort: keep the candidates from the first search.
                logger.warning(f"Router-driven ingestion failed: {e}")

        if not candidate_chunks:
            return self._no_answer(
                "I couldn't find relevant information in the database for your question.",
                0.3, reasoning_info, start_time
            )

        logger.info(f"Found {len(candidate_chunks)} candidate chunks")

        # 3. Ranking
        try:
            ranked_chunks = await self.vector_store.similarity_search(
                question,
                candidate_chunks,
                top_k=max_sources
            )
        except Exception as e:
            logger.warning(f"Semantic search failed: {e}")
            # Fallback to routing order
            ranked_chunks = [(chunk, 0.5) for chunk in candidate_chunks[:max_sources]]

        if not ranked_chunks:
            return self._no_answer(
                "I found some potentially relevant information but couldn't rank it properly.",
                0.3, reasoning_info, start_time
            )

        # 4. Context
        context, sources = self._build_context(ranked_chunks)

        # 5. Generation
        messages = [
            {
                "role": "system",
                "content": (
                    "You are a financial research assistant analyzing SEC filings. "
                    "Provide accurate, detailed answers based strictly on the provided context. "
                    "Always cite sources using [1], [2], etc. numbers. "
                    "Be specific about which filing and section information comes from. "
                    "If information is incomplete, clearly state what's missing."
                )
            },
            {
                "role": "user",
                "content": f"Question: {question}\n\nContext:\n{context}\n\nAnswer with citations:"
            }
        ]

        try:
            llm_response = await self.llm.generate(messages)

            # Heuristic confidence: base on LLM success, +10% each for
            # having at least three sources and a resolved ticker; cap at 1.
            confidence = 0.8 if llm_response.success else 0.4
            if len(ranked_chunks) >= 3:
                confidence *= 1.1
            if routing_result.recommended_tickers:
                confidence *= 1.1

            confidence = min(confidence, 1.0)

        except Exception as e:
            logger.error(f"LLM generation failed: {e}")
            llm_response = LLMResponse(
                text="I found relevant information but couldn't generate a proper response due to technical issues.",
                provider="error",
                success=False
            )
            confidence = 0.3

        result = QAResult(
            answer=llm_response.text,
            sources=sources,
            confidence=confidence,
            routing_info={
                "reasoning": routing_result.reasoning,
                "recommended_tickers": routing_result.recommended_tickers,
                "recommended_forms": [f.value for f in routing_result.recommended_forms],
                "temporal_strategy": routing_result.temporal_strategy
            },
            provider_used=llm_response.provider,
            processing_time=time.time() - start_time
        )

        # Cache only real answers; an error/fallback message must not be
        # served again from the cache.
        if llm_response.success:
            query_cache.put(cache_key, result)

        logger.info(f"Generated answer using {llm_response.provider} in {result.processing_time:.1f}s")

        return result


#### ..

####..

In [12]:
!python3 -V

# Session configuration via environment variables. The values below look
# like placeholders: replace them with real settings before running, and do
# not commit real keys or connection strings with the notebook.
# NOTE(review): EDGAR_IDENTITY is presumably the identity string the EDGAR
# client library sends to the SEC -- confirm the expected format.
os.environ['EDGAR_IDENTITY'] = 'CompanyName CompanyEmail'
os.environ['GEMINI_API_KEY'] = 'GEMINI_API_KEY'
os.environ['COCKROACH_DATABASE_URL'] = 'COCKROACH_DATABASE_URL'


Python 3.11.13


In [None]:
import sys
import asyncio
import time
import logging
import json
import hashlib
import zlib
from dataclasses import dataclass, field
from contextlib import asynccontextmanager
import threading
from collections import defaultdict
import numpy as np
import asyncpg
import httpx

from abc import ABC, abstractmethod
from typing import Dict, List, Set, Optional, Protocol, Tuple, Union, Any
from enum import Enum
import calendar
import re
import json
from flask import Flask, request, jsonify
import torch
import pandas as pd
import os
import jsonpickle
from dateutil.parser import parse as dtparse
from datetime import date, datetime, timedelta
from functools import lru_cache

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# from chunking import *
# from routing import *
# from utilities import *
# from storage import *
# from pipeline import *
# from edgar import *

from dotenv import load_dotenv
load_dotenv()

###############################################################################
# 9. Demo Script
###############################################################################

async def demo_system():
    """Run an end-to-end demo of the SEC QA system, printing to stdout.

    Steps, in order:
      1. Load ``.env`` values, construct a ``SECQASystem`` and initialize
         it; return early if initialization reports failure.
      2. Ingest a small batch of companies (``limit_per_company=5``) and
         print either the errors or the chunk count / time per ticker.
      3. Ask a fixed list of questions (``max_sources=5``) and print the
         confidence, routing reasoning, numbered source URLs, provider,
         processing time and answer for each. A failure on one question is
         printed and does not stop the remaining questions.
      4. Print every entry of the system status except ``"features"``.

    Returns:
        None. All results are reported via ``print``.
    """
    # Load .env *before* the system is built so any values it supplies are
    # already in the environment while the system initializes.
    load_dotenv()

    # Initialize system
    system = SECQASystem()

    if not await system.initialize():
        print("❌ System initialization failed")
        return

    print("🚀 SEC QA System initialized successfully!")

    # Ingest a few companies (small batch for demo).
    # A larger batch to try: ["AAPL", "MSFT", "TSLA"]
    print("\n📊 Ingesting sample companies...")
    companies = ["AAPL"]

    results = await system.ingest_companies(companies, limit_per_company=5)

    for ticker, result in results.items():
        if result['errors']:
            print(f"  ⚠️ {ticker}: {len(result['errors'])} errors")
            for error in result['errors']:
                print(f"    - {error}")
        else:
            print(f"  ✅ {ticker}: {result['chunks_processed']} chunks in {result['processing_time']:.1f}s")

    # Demo questions that exercise the routing logic
    print("\n❓ Demo questions:")

    test_questions = [
        "What are Apple's main risk factors?",
        "What is the trend in Apple's recent revenue ?",
        # "Compare Apple and Microsoft's R&D spending"
    ]

    for question in test_questions:
        print(f"\n🤔 {question}")
        try:
            result = await system.ask_question(question, max_sources=5)
            print(f"   📈 Confidence: {result.confidence:.2f}")
            print(f"   🎯 Routing: {result.routing_info['reasoning']}")
            # Number sources from 1 so they read like citation indices.
            numbered_sources = [
                [position, source['document_url']]
                for position, source in enumerate(result.sources, start=1)
            ]
            print(f"   📚 Sources: ({len(result.sources)}) sources: {numbered_sources}")
            print(f"   🤖 Provider: {result.provider_used}")
            print(f"   ⏱️ Time: {result.processing_time:.1f}s")
            print(f"   💬 Answer: {result.answer}...")

        except Exception as e:
            # Demo boundary: report the failure and move on to the next question.
            print(f"   ❌ Error: {e}")

    # Show system status
    print("\n📊 System Status:")
    status = await system.get_status()
    for key, value in status.items():
        if key != "features":
            print(f"   {key}: {value}")

    print("\n✅ Demo complete!")

required_vars = ["EDGAR_IDENTITY", "GEMINI_API_KEY", "COCKROACH_DATABASE_URL"]
missing = [var for var in required_vars if not os.getenv(var)]

if missing:
    print(f"❌ Missing required environment variables: {missing}")
    print("Set them like:")
    for var in missing:
        if var == "EDGAR_IDENTITY":
            print(f'export {var}="YourName your.email@domain.com"')
        else:
            print(f'export {var}="your_api_key_here"')
    sys.exit(1)

await demo_system()



🚀 SEC QA System initialized successfully!

📊 Ingesting sample companies...


  ✅ AAPL: 84 chunks in 28.1s

❓ Demo questions:

🤔 What are Apple's main risk factors?
   📈 Confidence: 0.97
   🎯 Routing: Analysis focused on: AAPL. Temporal strategy: Prioritize most recent filings. Primary sections: 8-K ITEM 8.01, 10-Q ITEM 1A, 10-K ITEM 1A
   📚 Sources: (5) sources: [[1, 'https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000032019325000073/aapl-20250628.htm'], [2, 'https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000032019325000057/aapl-20250329.htm'], [3, 'https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000032019325000057/aapl-20250329.htm'], [4, 'https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000114036125027340/ef20052355_8k.htm'], [5, 'https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000114036125027340/ef20052355_8k.htm']]
   🤖 Provider: gemini
   ⏱️ Time: 15.7s
   💬 Answer: The provided context indicates that a comprehensive discussion of Apple's risk factors is located in Part I, Item 1A of the Annual Report on Form 10-K for the