In [1]:
# imports
import datetime
import html
import textwrap
from pathlib import Path

# packages
from IPython.display import HTML, display

# kelvin imports
from kelvin.nlp.llm.engines.openai_engine import OpenAIEngine
from kelvin.nlp.llm.summarize.recursive_split_summarizer import RecursiveSplitSummarizer, ChunkUnit
from kelvin.nlp.llm.summarize.text_memoizer import TextMemoizer
from kelvin.ocr.tesseract.local_engine import LocalEngine
from kelvin.research.edgar.feed.updater import EDGARFeedUpdater
from kelvin.research.edgar.filing.parser import FilingParser, uudecode

In [2]:
# Two OpenAI-backed engines are created for this example: GPT-3.5 and GPT-4.
llm_gpt35 = OpenAIEngine(model="gpt-3.5-turbo")
llm_gpt4 = OpenAIEngine(model="gpt-4")

# The default summarizer runs on the GPT-3.5 engine, chunks by
# ChunkUnit.SECTIONS, and is configured with summary_token_length=300.
summarizer = RecursiveSplitSummarizer(
    engine=llm_gpt35,
    chunk_unit=ChunkUnit.SECTIONS,
    summary_token_length=300,
)

In [3]:
# Kelvin supports multiple OCR engines, including both local (tesseract, PaddlePaddle)
# and remote (AWS, Azure, Kelvin) engines.
# This notebook uses the local engine imported from kelvin.ocr.tesseract.local_engine.
ocr_engine = LocalEngine()

In [4]:
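# For recurring use cases, you can keep a local cache of EDGAR feed files to dramatically reduce search/processing time.
# Uncomment the line below to download the filings for a quarter up front (here: 2022, 1).
# Note that `edgar_feed` is created in the next cell, so run that cell first.
# edgar_feed.download_all_filings(2022, 1)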

In [5]:
# Set up an EDGAR feed class
edgar_feed = EDGARFeedUpdater()

# Iterate through the filings for 2022 Q1; this demo stops after the first filing.
for file_name, filing_buffer in edgar_feed.stream_all_filings(year=2022, quarter=1):
    # parse the filing into 1+ documents
    filing = FilingParser(filing_buffer)

    # get filer/company name, form type and filing date for display,
    # falling back to the alternate metadata key when the first is absent
    filer_name = filing.metadata.get("CONFORMED-NAME", filing.metadata.get("COMPANY-NAME", None))
    form_type = filing.metadata.get("TYPE", filing.metadata.get("FORM-TYPE", None))
    filing_date = filing.metadata.get("FILING-DATE", None)

    # Filing metadata is external text, so escape it before embedding it in HTML;
    # otherwise characters such as "&" or "<" corrupt the rendered heading.
    # str() keeps the previous rendering of missing values ("None").
    display(HTML(
        f"<h2>{html.escape(str(filer_name))}: "
        f"{html.escape(str(form_type))} {html.escape(str(filing_date))}</h2>"
    ))

    for doc in filing.stream():
        # header for each document/exhibit
        doc_sequence = doc["metadata"].get("SEQUENCE", None)
        doc_description = doc["metadata"].get("DESCRIPTION", None)
        display(HTML(
            f"<h3>{html.escape(str(doc_description))} "
            f"(#{html.escape(str(doc_sequence))})</h3>"
        ))

        if doc_description == "GRAPHIC":
            # graphics are uudecoded and passed through OCR instead of the summarizer
            summary = ocr_engine.process_sync(uudecode(doc['raw'])).decode('utf-8')
        else:
            # get summary of the document text from the LLM summarizer
            summary = summarizer.get_summary(
                doc['text'],
                context_type=f"EDGAR {form_type} filing",
            )

        # Wrap first, then escape: the 80-column width applies to the visible
        # characters, and any "<", ">" or "&" in the OCR/LLM output is shown
        # literally inside <pre> rather than being interpreted as markup.
        display(HTML(f"<pre>{html.escape(textwrap.fill(summary, 80))}</pre>"))

    # stop after the first filing
    break