In [23]:
from __future__ import annotations
import sec_parser as sp
import sec_downloader as sd
import jupyter_black
import tiktoken

# Activate the jupyter_black extension for this notebook session
# (presumably auto-formats code cells with Black when they run — see jupyter_black docs).
jupyter_black.load()

In [2]:
# Download HTML
# The Downloader is built from two identifying strings (an organisation name and
# a contact e-mail). NOTE(review): presumably these are sent to SEC EDGAR as the
# requester identity — confirm against the sec_downloader documentation.
dl = sd.Downloader("alphanome.ai", "info@alphanome.ai")
# Fetch the HTML of the most recent 10-Q filing for ticker AAPL (network call).
html = dl.get_latest_html("10-Q", "AAPL")

In [3]:
# Parse HTML
# Turn the raw filing HTML into semantic elements, then arrange them into a tree.
elements = sp.Edgar10QParser().parse(html)
tree = sp.TreeBuilder().build(elements)
# Flatten one level: gather the children of every node the tree yields.
top_level_sections = []
for part in tree:
    top_level_sections.extend(part.children)

In [4]:
# Filter MD&A section
# Keep only the top-level sections whose heading text mentions "management"
# (case-insensitive); exactly one such section is expected.
mdna_top_level_sections = [
    section
    for section in top_level_sections
    if "management" in section.semantic_element.text.lower()
]
# Fail with a readable diagnostic rather than a bare AssertionError, so that a
# filing with zero or several matching sections is easy to debug.
assert len(mdna_top_level_sections) == 1, (
    f"Expected exactly one MD&A section, found {len(mdna_top_level_sections)}: "
    f"{[section.semantic_element.text for section in mdna_top_level_sections]}"
)
mdna_top_level_section = mdna_top_level_sections[0]

In [5]:
# Convert to markdown (Step 1: Get levels)
# Distinct levels of every title element found below the MD&A section.
title_levels = {
    node.semantic_element.level
    for node in mdna_top_level_section.get_descendants()
    if isinstance(node.semantic_element, sp.TitleElement)
}
levels = sorted(title_levels)
# The lowest level maps to "##", the next to "###", and so on.
level_to_markdown = {
    level: "#" * depth for depth, level in enumerate(levels, start=2)
}
level_to_markdown

{3: '##', 4: '###'}

In [6]:
# Convert to markdown (Step 2: Extract text)
# Collect one newline-terminated fragment per rendered element and join them
# once at the end; the section's own heading becomes the "#" title.
markdown_parts = [f"# {mdna_top_level_section.semantic_element.text}\n"]
for node in mdna_top_level_section.get_descendants():
    element = node.semantic_element
    if isinstance(element, sp.TextElement):
        # Plain text is emitted verbatim.
        markdown_parts.append(f"{element.text}\n")
    elif isinstance(element, sp.TitleElement):
        # Titles get the heading prefix that matches their level.
        markdown_parts.append(f"{level_to_markdown[element.level]} {element.text}\n")
    elif isinstance(element, sp.TableElement):
        # Tables are replaced by their bracketed summary.
        markdown_parts.append(f"[{element.get_summary()}]\n")
    # Any other element type contributes nothing to the output.
markdown = "".join(markdown_parts)

In [21]:
def get_lines(text, start=None, end=None, max_line_length=80):
    """Return a slice of the lines of ``text`` with over-long lines shortened.

    Args:
        text: String to split on ``"\\n"``.
        start: Slice start applied to the list of lines (``None`` = beginning).
        end: Slice end applied to the list of lines (``None`` = through the end).
        max_line_length: Lines longer than this are cut to this many characters
            and suffixed with ``"..."``.

    Returns:
        The selected (and possibly shortened) lines joined with ``"\\n"``.
    """
    shortened = []
    for raw_line in text.split("\n")[start:end]:
        if len(raw_line) > max_line_length:
            raw_line = raw_line[:max_line_length] + "..."
        shortened.append(raw_line)
    return "\n".join(shortened)


# Preview the document: the first 13 lines, an ellipsis marker, then the last 13 lines.
preview_head = get_lines(markdown, end=13)
preview_tail = get_lines(markdown, start=-13)
print(preview_head, "...", preview_tail, sep="\n")

# Item 2.    Management’s Discussion and Analysis of Financial Condition and Res...
## Available Information
The Company periodically provides certain information for investors on its corpo...
## Business Seasonality and Product Introductions
The Company has historically experienced higher net sales in its first quarter c...
## Fiscal Period
The Company’s fiscal year is the 52- or 53-week period that ends on the last Sat...
## Quarterly Highlights
Weakness in foreign currencies relative to the U.S. dollar had an unfavorable im...
### Macroeconomic Conditions
Macroeconomic conditions, including inflation, changes in interest rates, and cu...
## Segment Operating Performance
The following table shows net sales by reportable segment for the three- and nin...
...
## Provision for Income Taxes
Provision for income taxes, effective tax rate and statutory federal income tax ...
[Table with 3 rows, 16 numbers, and 228 characters.]
The Company’s effective tax rate for the third quarter and firs

In [26]:
# Name of the tiktoken encoding used for token counting below.
# NOTE(review): per the constant's name this is the encoding associated with
# GPT-4 — confirm against the tiktoken documentation.
GPT4_ENCODING = "cl100k_base"


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that ``string`` encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence produced by the encoding's ``encode``.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)


# Token count of the generated markdown under the encoding named by GPT4_ENCODING.
num_tokens_from_string(string=markdown, encoding_name=GPT4_ENCODING)

2834