In [1]:
# =============================================================================
# Cell 1 — Imports & Configuration
# =============================================================================

import os
import re
import torch
import chromadb
from dotenv import load_dotenv
from tqdm import tqdm
from edgar import *
from doc2dict import html2dict, unnest_dict
from sentence_transformers import SentenceTransformer

# Load environment variables from .env
load_dotenv()

# Mirror HUGGING_FACE_TOKEN into HF_TOKEN only when it is actually set.
# Assigning an empty string would overwrite a valid HF_TOKEN that is already
# present in the environment.
hf_token = os.getenv("HUGGING_FACE_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

# Set SEC EDGAR identity
# SEC EDGAR requires an identity string (name + email).
# Both values are read from the EDGAR_IDENTITY_NAME / EDGAR_IDENTITY_EMAIL
# environment variables (e.g. supplied through .env) so that personal details
# are not committed to version control.
identity_name = os.getenv("EDGAR_IDENTITY_NAME")
identity_email = os.getenv("EDGAR_IDENTITY_EMAIL")

if identity_name and identity_email:
    set_identity(f"{identity_name} {identity_email}")
    # Deliberately do not echo the name/email: cell outputs are saved with the
    # notebook and would leak them when the file is shared.
    print("SEC EDGAR identity set from environment variables.")
else:
    print("Warning: SEC EDGAR identity not set. Please provide name and email in environment variables.")

# ----- Constants -----
EMBEDDING_MODEL_NAME = "google/embeddinggemma-300m"
CHUNK_TOKEN_LIMIT = 500     # Target chunk size (tokens ≈ whitespace-split words); longer segments are split
CHUNK_TOLERANCE = 50        # Slack above the target: a chunk may grow to CHUNK_TOKEN_LIMIT + CHUNK_TOLERANCE tokens
TOP_K = 5                   # Number of results to return

# ----- Device -----
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
if DEVICE == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Setting SEC EDGAR identity: Michael Mccallum, mike.mccalum@indigo.com
Using device: cuda
GPU: NVIDIA GeForce GTX 1650


In [None]:
# =============================================================================
# Cell 2 — Fetch SEC Filing
# =============================================================================

# Company and form type to index; later cells reuse both for metadata and
# for the ChromaDB collection name.
TICKER = "INTC"
FORM_TYPE = "10-K"

company = Company(TICKER)
filings = company.get_filings(form=FORM_TYPE)

# Retrieve the most recent filing
filing = filings[0]
print(f"Filing: {filing}")
print(f"Filed: {filing.filing_date}")

# Download the HTML content
html_content = filing.html()
# Fail fast with a clear message if no HTML came back, instead of a TypeError
# from len() here or a confusing parser failure in the next cell.
if not html_content:
    raise ValueError(
        f"No HTML content available for {TICKER} {FORM_TYPE} filed {filing.filing_date}"
    )
print(f"\nHTML length: {len(html_content):,} characters")

Filing: Filing(company='INTEL CORP', cik=50863, form='10-K', filing_date='2026-01-23', accession_no='0000050863-26-000011')
Filed: 2026-01-23

HTML length: 3,320,720 characters


In [3]:
# =============================================================================
# Cell 3 — Parse HTML to Dictionary & Extract Segments
# =============================================================================

def _as_items(value):
    """Return value as a list of items, treating a bare string or scalar as one item."""
    # Iterating a string would yield single characters, which is never wanted here.
    if isinstance(value, str) or not hasattr(value, "__iter__"):
        return [value]
    return list(value)


def _row_to_text(row):
    """Render one table row as a single line, joining its cells with ' | '."""
    # A row that is already a string (or a scalar) is emitted whole.
    if isinstance(row, str) or not hasattr(row, "__iter__"):
        return str(row)
    return " | ".join(str(cell) for cell in row)


def _table_to_text(table):
    """
    Flatten the value stored under a node's 'table' key into newline-separated text.

    A dict contributes, in order: title, preamble, one line per data row,
    one line per footnote, postamble (each only when present and truthy).
    A list is treated as a plain list of rows. Any other type yields ''.
    """
    lines = []

    if isinstance(table, dict):
        for key in ("title", "preamble"):
            if table.get(key):
                lines.append(str(table[key]))
        if table.get("data"):
            lines.extend(
                _row_to_text(row) for row in _as_items(table["data"]) if row is not None
            )
        if table.get("footnotes"):
            lines.extend(str(note) for note in _as_items(table["footnotes"]))
        if table.get("postamble"):
            lines.append(str(table["postamble"]))
    elif isinstance(table, list):
        lines.extend(_row_to_text(row) for row in table if row is not None)

    return "\n".join(lines).strip()


def extract_segments(dct, path="", segments=None):
    """
    Recursively traverse the doc2dict output and extract text segments
    with their full hierarchical path (e.g. 'Part I > Item 1 > Business').

    Args:
        dct: Current node. Anything that is not a dict is ignored.
        path: ' > '-joined titles of the ancestors of this node.
        segments: Accumulator list; created on the first call and mutated
            in place by recursive calls.

    Returns:
        The accumulator list. Each segment is a dict with keys 'path',
        'type' ('text', 'textsmall' or 'table') and 'content'. Nodes
        without a titled ancestor get the path '(root)'.
    """
    if segments is None:
        segments = []

    if not isinstance(dct, dict):
        return segments

    # Extend the path with this node's title, if it has a non-blank one
    current_path = path
    if "title" in dct and isinstance(dct["title"], str):
        title = dct["title"].strip()
        if title:
            current_path = f"{path} > {title}" if path else title

    segment_path = current_path or "(root)"

    # Plain text content
    for key in ("text", "textsmall"):
        if key in dct and isinstance(dct[key], str):
            text = dct[key].strip()
            if text:
                segments.append({
                    "path": segment_path,
                    "type": key,
                    "content": text,
                })

    # Table content — converted to a readable string representation
    if "table" in dct:
        table_text = _table_to_text(dct["table"])
        if table_text:
            segments.append({
                "path": segment_path,
                "type": "table",
                "content": table_text,
            })

    # Recurse into nested contents, passing the extended path down
    contents = dct.get("contents", {})
    if isinstance(contents, dict):
        for child in contents.values():
            extract_segments(child, current_path, segments)

    return segments


# Convert the filing HTML into doc2dict's nested dictionary
parsed = html2dict(html_content)

# Unwrap the top-level 'document' entry when there is one; otherwise walk
# the parsed dict itself
root = parsed.get("document", parsed)
all_segments = []
if isinstance(root, dict):
    for node in root.values():
        # extract_segments appends into all_segments in place
        extract_segments(node, segments=all_segments)

print(f"Extracted {len(all_segments):,} raw segments")

# Preview the first five segments (first 120 characters, flattened to one line)
for seg in all_segments[:5]:
    one_line = seg["content"][:120].replace("\n", " ")
    print(f"\n[{seg['type']}] {seg['path']}")
    print(f"  {one_line}...")

Extracted 768 raw segments

[table] UNITED STATES SECURITIES AND EXCHANGE COMMISSION > Washington, D.C. 20549
  Washington, D.C. 20549...

[table] FORM 10-K > (Mark One)
  (Mark One) ☑ | ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934  | For the fiscal ye...

[table] FORM 10-K > Commission File Number: 000-06217
  Commission File Number: 000-06217...

[table] INTEL CORPORATION
  (Exact name of registrant as specified in its charter) Delaware (State or other jurisdiction of incorporation or organiz...

[table] INTEL CORPORATION
  Title of each class | Trading symbol | Name of each exchange on which registered Common stock, $0.001 par value | INTC |...


In [4]:
# =============================================================================
# Cell 4 — Chunk Long Segments (sentence-boundary aware)
# =============================================================================

def token_count(text):
    """Return the number of whitespace-separated words in text (a rough token count)."""
    words = text.split()
    return len(words)


def chunk_segment(segment, limit=CHUNK_TOKEN_LIMIT, tolerance=CHUNK_TOLERANCE):
    """
    Split one segment into sentence-aligned chunks.

    A segment of at most `limit` tokens is returned unchanged (the same dict,
    wrapped in a list). Longer segments are split after '.', '!' or '?'
    followed by whitespace, and sentences are packed greedily: the current
    chunk is closed only when adding the next sentence would take it above
    `limit + tolerance` tokens. Consequently:

    - a chunk may hold up to `limit + tolerance` tokens, so `limit` alone is
      not a hard cap;
    - a single sentence longer than that is kept whole in its own chunk;
    - sentences inside a chunk are re-joined with single spaces, so the
      original whitespace at split points is not preserved.

    Returns a list of dicts with keys 'path', 'type' and 'content'; 'path'
    and 'type' are copied from the input segment.
    """
    body = segment["content"]

    # Nothing to do for segments that already fit
    if token_count(body) <= limit:
        return [segment]

    ceiling = limit + tolerance

    def build(parts):
        # New chunk dict inheriting path/type from the source segment
        return {
            "path": segment["path"],
            "type": segment["type"],
            "content": " ".join(parts),
        }

    pieces = []
    pending = []
    pending_size = 0

    # Split after sentence-ending punctuation followed by whitespace
    for sent in re.split(r"(?<=[.!?])\s+", body):
        size = token_count(sent)

        # Close the open chunk before it would overflow the ceiling; an empty
        # open chunk is never closed, so an over-long sentence stays whole.
        if pending and pending_size + size > ceiling:
            pieces.append(build(pending))
            pending = []
            pending_size = 0

        pending.append(sent)
        pending_size += size

    # Emit whatever is left over
    if pending:
        pieces.append(build(pending))

    return pieces


# Apply chunking to all segments
chunks = []
for seg in all_segments:
    chunks.extend(chunk_segment(seg))

print(f"Total chunks after splitting: {len(chunks):,}")

# The statistics below (and the embedding cell after this one) need at least
# one chunk; stop here with a clear message rather than an opaque
# "min() arg is an empty sequence".
if not chunks:
    raise ValueError("No chunks were produced; check that the filing parsed into segments.")

# Show token distribution
token_counts = [token_count(c["content"]) for c in chunks]
print(f"Token range: {min(token_counts)} – {max(token_counts)}")
print(f"Mean tokens per chunk: {sum(token_counts) / len(token_counts):.0f}")

# Show how many exceed the target size. chunk_segment lets a chunk grow to
# CHUNK_TOKEN_LIMIT + CHUNK_TOLERANCE tokens and keeps over-long single
# sentences whole, so a non-zero count here is expected.
over_limit = sum(1 for t in token_counts if t > CHUNK_TOKEN_LIMIT)
print(f"Chunks exceeding {CHUNK_TOKEN_LIMIT} tokens: {over_limit}")

Total chunks after splitting: 357
Token range: 1 – 550
Mean tokens per chunk: 86
Chunks exceeding 500 tokens: 5


In [5]:
# =============================================================================
# Cell 5 — Embed & Store in ChromaDB
# =============================================================================

# Load the embedding model on the selected device (GPU when available, else CPU)
model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=DEVICE)
print(f"Loaded '{EMBEDDING_MODEL_NAME}' on {DEVICE}")

# Prepare texts, per-chunk metadata and stable ids (one entry per chunk, same order)
texts = [c["content"] for c in chunks]
metadatas = [
    {
        "path": c["path"],
        "type": c["type"],
        "ticker": TICKER,
        "form_type": FORM_TYPE,
    }
    for c in chunks
]
ids = [f"{TICKER}_{FORM_TYPE}_{i}" for i in range(len(chunks))]

# Generate embeddings (batched encoding on DEVICE)
print(f"Embedding {len(texts):,} chunks...")
embeddings = model.encode(texts, batch_size=8, show_progress_bar=True, convert_to_numpy=True)
print(f"Embedding shape: {embeddings.shape}")

# Initialise ChromaDB (persistent local storage)
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection_name = f"{TICKER}_{FORM_TYPE}".lower().replace("-", "_")

# Delete an existing collection of the same name (for clean re-runs).
# The existence check replaces a blanket `except Exception: pass`, so real
# failures (locked database, permissions, ...) are no longer hidden.
# list_collections() yields plain names in some ChromaDB releases and
# Collection objects in others, so both are accepted.
existing_names = {
    entry if isinstance(entry, str) else entry.name
    for entry in chroma_client.list_collections()
}
if collection_name in existing_names:
    chroma_client.delete_collection(name=collection_name)

collection = chroma_client.create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"},
)

# Insert everything into the freshly created collection
collection.add(
    ids=ids,
    embeddings=embeddings.tolist(),
    documents=texts,
    metadatas=metadatas,
)
print(f"\nStored {collection.count():,} chunks in collection '{collection_name}'")

Loading weights:   0%|          | 0/314 [00:00<?, ?it/s]

Loaded 'google/embeddinggemma-300m' on cuda
Embedding 357 chunks...


Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Embedding shape: (357, 768)

Stored 357 chunks in collection 'aapl_10_k'


In [6]:
# =============================================================================
# Cell 6 — Semantic Search Function
# =============================================================================

def semantic_search(query, top_k=TOP_K):
    """
    Embed the query, print the top-k most relevant chunks and return the raw results.

    Args:
        query: Natural-language search string.
        top_k: Maximum number of hits to request from the collection.

    Returns:
        The dict returned by `collection.query`, including documents,
        metadatas and distances.

    Uses the module-level `model` and `collection`. The printed score is
    `1 - distance`, which is a cosine similarity only if the collection
    uses cosine distance.
    """
    preview_chars = 200

    query_embedding = model.encode([query], convert_to_numpy=True).tolist()

    results = collection.query(
        query_embeddings=query_embedding,
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )

    print(f"Query: \"{query}\"\n")
    print(f"{'Rank':<5} {'Score':<8} {'Section Path'}")
    print("=" * 80)

    # Results are nested one level per query; index 0 is our single query
    hits = zip(results["distances"][0], results["metadatas"][0], results["documents"][0])
    for rank, (distance, meta, doc) in enumerate(hits, start=1):
        similarity = 1 - distance

        # Only mark the preview as truncated when text was actually cut off
        preview = doc[:preview_chars]
        if len(doc) > preview_chars:
            preview += "..."

        print(f"\n#{rank:<4} {similarity:.4f}  [{meta['type']}] {meta['path']}")
        print(f"     {preview}")

    return results

In [7]:
# =============================================================================
# Cell 7 — Demo Queries
# =============================================================================

# Example queries — adjust to match the filing content
demo_queries = (
    "What are the main risk factors?",
    "Revenue and net income figures",
    "Supply chain and manufacturing operations",
)
separator = "\n" + "=" * 80 + "\n"

# Run each query in turn, printing a separator line between result blocks
for position, demo_query in enumerate(demo_queries):
    if position:
        print(separator)
    _ = semantic_search(demo_query)

Query: "What are the main risk factors?"

Rank  Score    Section Path

#1    0.4158  [text] TABLE OF CONTENTS > Internal-Use Software > Item 7A.    Quantitative and Qualitative Disclosures About Market Risk
     The Company is exposed to economic risk from interest rates and foreign exchange rates. The Company uses various strategies to manage these risks; however, they may still impact the Company’s consolid...

#2    0.4064  [table] introduction
     introduction...

#3    0.3901  [text] TABLE OF CONTENTS > Wearables, Home and Accessories > Item 1C.    Cybersecurity
     For a discussion of the Company’s cybersecurity-related risks, see Item 1A of this Form 10-K under the heading “Risk Factors.”...

#4    0.3832  [text] TABLE OF CONTENTS > Wearables, Home and Accessories > Item 1A.    Risk Factors
     The following summarizes factors that could have a material adverse effect on the Company’s business, reputation, results of operations, financial condition and stock price. The Compan