In [3]:
%pip install lxml

Collecting lxml
  Using cached lxml-6.0.2-cp311-cp311-win_amd64.whl.metadata (3.7 kB)
Using cached lxml-6.0.2-cp311-cp311-win_amd64.whl (4.0 MB)
Installing collected packages: lxml
Successfully installed lxml-6.0.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import logging
import json
from pathlib import Path
from typing import Optional, List, Dict
from lxml import etree
import pandas as pd

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
log = logging.getLogger("xbrl_extract")


In [5]:
def find_instance_file(raw_root: Path, company_hint: Optional[str] = None) -> Optional[Path]:
    """Find the main XBRL instance document (facts) under raw_root."""
    patterns = ["**/*_htm.xml", "**/*-htm.xml", "**/*_xbrl.xml", "**/*-xbrl.xml"]
    if company_hint:
        patterns = [f"**/*{company_hint}*_htm.xml", f"**/*{company_hint}*-htm.xml"] + patterns
    for pat in patterns:
        matches = list(raw_root.glob(pat))
        if matches:
            return matches[0]
    return None


In [6]:
def parse_facts(instance_file: Path) -> List[Dict]:
    """Parse facts from an XBRL instance file."""
    parser = etree.XMLParser(remove_comments=True, recover=True, huge_tree=True)
    tree = etree.parse(str(instance_file), parser=parser)
    root = tree.getroot()

    facts = []
    for elem in root.iter():
        if elem.get("contextRef"):  # treat as fact
            qname = elem.tag.split("}")[-1]  # strip namespace
            value = elem.text.strip() if elem.text else None
            facts.append({
                "qname": qname,
                "value_raw": value,
                "contextRef": elem.get("contextRef"),
                "unitRef": elem.get("unitRef"),
                "decimals": elem.get("decimals"),
                "precision": elem.get("precision")
            })
    return facts


In [8]:
# Define raw folder
raw_root = Path("../data/raw").resolve()

# Locate instance file
instance_file = find_instance_file(raw_root, company_hint="nvda")
if not instance_file:
    raise FileNotFoundError("No XBRL instance file found under ../data/raw/")
else:
    log.info(f"Parsing instance: {instance_file}")

# Parse facts
facts = parse_facts(instance_file)
df = pd.DataFrame(facts)

# Save outputs
doc_stem = instance_file.parent.name
out_dir = Path("../data/xbrl") / doc_stem
out_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(out_dir / "facts.csv", index=False)
with open(out_dir / "facts.json", "w", encoding="utf-8") as f:
    json.dump(facts, f, indent=2, ensure_ascii=False)

log.info(f"Extracted {len(facts)} facts → {out_dir}")
df.head(10)  # show preview


2025-09-25 18:18:44,193 - INFO - Parsing instance: C:\Users\kritt\OneDrive\Documents\GitHub\Project-Lantern-PDFPARSER\data\raw\NVIDIA_10-K_2024-02-21\nvda-20240128_htm.xml
2025-09-25 18:18:44,378 - INFO - Extracted 1387 facts → ..\data\xbrl\NVIDIA_10-K_2024-02-21


Unnamed: 0,qname,value_raw,contextRef,unitRef,decimals,precision
0,EntityCentralIndexKey,0001045810,c-1,,,
1,DocumentFiscalYearFocus,2024,c-1,,,
2,DocumentFiscalPeriodFocus,FY,c-1,,,
3,AmendmentFlag,false,c-1,,,
4,TrdArrDuration,P350D,c-4,,,
5,PropertyPlantAndEquipmentUsefulLife,P3Y,c-46,,,
6,PropertyPlantAndEquipmentUsefulLife,P4Y,c-47,,,
7,PropertyPlantAndEquipmentUsefulLife,P5Y,c-49,,,
8,WarrantyLiabilityTermOfWarranties,P1Y,c-52,,,
9,PropertyPlantAndEquipmentUsefulLife,P3Y,c-54,,,


In [9]:
def parse_contexts(instance_file: Path) -> Dict[str, dict]:
    """Parse <context> elements: entity + period + dimensions."""
    parser = etree.XMLParser(remove_comments=True, recover=True, huge_tree=True)
    tree = etree.parse(str(instance_file), parser=parser)
    root = tree.getroot()

    nsmap = {k: v for k, v in root.nsmap.items() if k}
    xbrli_uri = nsmap.get("xbrli", "http://www.xbrl.org/2003/instance")

    contexts = {}
    for ctx in root.findall(f".//{{{xbrli_uri}}}context"):
        cid = ctx.get("id")

        # entity identifier
        ent_ident = ctx.find(f".//{{{xbrli_uri}}}entity/{{{xbrli_uri}}}identifier")
        entity = ent_ident.text.strip() if ent_ident is not None and ent_ident.text else None

        # period
        instant = ctx.find(f".//{{{xbrli_uri}}}period/{{{xbrli_uri}}}instant")
        start = ctx.find(f".//{{{xbrli_uri}}}period/{{{xbrli_uri}}}startDate")
        end = ctx.find(f".//{{{xbrli_uri}}}period/{{{xbrli_uri}}}endDate")

        contexts[cid] = {
            "entity": entity,
            "instant": instant.text.strip() if instant is not None else None,
            "startDate": start.text.strip() if start is not None else None,
            "endDate": end.text.strip() if end is not None else None,
        }
    return contexts


In [10]:
# Parse contexts
contexts = parse_contexts(instance_file)
log.info(f"Parsed {len(contexts)} contexts")

# Merge with facts DataFrame
df["entity"] = df["contextRef"].map(lambda c: contexts.get(c, {}).get("entity"))
df["instant"] = df["contextRef"].map(lambda c: contexts.get(c, {}).get("instant"))
df["startDate"] = df["contextRef"].map(lambda c: contexts.get(c, {}).get("startDate"))
df["endDate"] = df["contextRef"].map(lambda c: contexts.get(c, {}).get("endDate"))

df.head(15)


2025-09-25 18:20:58,505 - INFO - Parsed 280 contexts


Unnamed: 0,qname,value_raw,contextRef,unitRef,decimals,precision,entity,instant,startDate,endDate
0,EntityCentralIndexKey,0001045810,c-1,,,,1045810,,2023-01-30,2024-01-28
1,DocumentFiscalYearFocus,2024,c-1,,,,1045810,,2023-01-30,2024-01-28
2,DocumentFiscalPeriodFocus,FY,c-1,,,,1045810,,2023-01-30,2024-01-28
3,AmendmentFlag,false,c-1,,,,1045810,,2023-01-30,2024-01-28
4,TrdArrDuration,P350D,c-4,,,,1045810,,2023-10-30,2024-01-28
5,PropertyPlantAndEquipmentUsefulLife,P3Y,c-46,,,,1045810,2023-01-31,,
6,PropertyPlantAndEquipmentUsefulLife,P4Y,c-47,,,,1045810,2023-02-28,,
7,PropertyPlantAndEquipmentUsefulLife,P5Y,c-49,,,,1045810,2023-01-31,,
8,WarrantyLiabilityTermOfWarranties,P1Y,c-52,,,,1045810,,2023-01-30,2024-01-28
9,PropertyPlantAndEquipmentUsefulLife,P3Y,c-54,,,,1045810,2024-01-28,,


In [11]:
def parse_units(instance_file: Path) -> Dict[str, str]:
    """Parse <unit> elements into human-readable measures (USD, shares, etc)."""
    parser = etree.XMLParser(remove_comments=True, recover=True, huge_tree=True)
    tree = etree.parse(str(instance_file), parser=parser)
    root = tree.getroot()

    nsmap = {k: v for k, v in root.nsmap.items() if k}
    xbrli_uri = nsmap.get("xbrli", "http://www.xbrl.org/2003/instance")

    units = {}
    for u in root.findall(f".//{{{xbrli_uri}}}unit"):
        uid = u.get("id")
        measure = u.find(f".//{{{xbrli_uri}}}measure")
        if uid and measure is not None and measure.text:
            units[uid] = measure.text.strip()
    return units


In [12]:
# Parse units
units = parse_units(instance_file)
log.info(f"Parsed {len(units)} units")

# Merge with DataFrame
df["unit"] = df["unitRef"].map(lambda u: units.get(u))

df.head(20)


2025-09-25 18:23:19,175 - INFO - Parsed 6 units


Unnamed: 0,qname,value_raw,contextRef,unitRef,decimals,precision,entity,instant,startDate,endDate,unit
0,EntityCentralIndexKey,0001045810,c-1,,,,1045810,,2023-01-30,2024-01-28,
1,DocumentFiscalYearFocus,2024,c-1,,,,1045810,,2023-01-30,2024-01-28,
2,DocumentFiscalPeriodFocus,FY,c-1,,,,1045810,,2023-01-30,2024-01-28,
3,AmendmentFlag,false,c-1,,,,1045810,,2023-01-30,2024-01-28,
4,TrdArrDuration,P350D,c-4,,,,1045810,,2023-10-30,2024-01-28,
5,PropertyPlantAndEquipmentUsefulLife,P3Y,c-46,,,,1045810,2023-01-31,,,
6,PropertyPlantAndEquipmentUsefulLife,P4Y,c-47,,,,1045810,2023-02-28,,,
7,PropertyPlantAndEquipmentUsefulLife,P5Y,c-49,,,,1045810,2023-01-31,,,
8,WarrantyLiabilityTermOfWarranties,P1Y,c-52,,,,1045810,,2023-01-30,2024-01-28,
9,PropertyPlantAndEquipmentUsefulLife,P3Y,c-54,,,,1045810,2024-01-28,,,


In [13]:
# Key US-GAAP concepts to validate
key_metrics = [
    "Revenues",
    "NetIncomeLoss",
    "EarningsPerShareBasic",
    "EarningsPerShareDiluted",
    "Assets",
    "Liabilities",
    "StockholdersEquity"
]

df_key = df[df["qname"].isin(key_metrics)].copy()

# Sort by concept + endDate for readability
df_key = df_key.sort_values(["qname", "endDate"])

df_key


Unnamed: 0,qname,value_raw,contextRef,unitRef,decimals,precision,entity,instant,startDate,endDate,unit
166,Assets,65728000000,c-9,usd,-6,,0001045810,2024-01-28,,,iso4217:USD
167,Assets,41182000000,c-10,usd,-6,,0001045810,2023-01-29,,,iso4217:USD
105,EarningsPerShareBasic,3.91,c-8,usdPerShare,2,,0001045810,,2021-02-01,2022-01-30,iso4217:USD
595,EarningsPerShareBasic,3.91,c-8,usdPerShare,2,,0001045810,,2021-02-01,2022-01-30,iso4217:USD
104,EarningsPerShareBasic,1.76,c-7,usdPerShare,2,,0001045810,,2022-01-31,2023-01-29,iso4217:USD
...,...,...,...,...,...,...,...,...,...,...,...
301,StockholdersEquity,13132000000,c-42,usd,-6,,0001045810,2024-01-28,,,iso4217:USD
302,StockholdersEquity,0,c-43,usd,-6,,0001045810,2024-01-28,,,iso4217:USD
303,StockholdersEquity,27000000,c-44,usd,-6,,0001045810,2024-01-28,,,iso4217:USD
304,StockholdersEquity,29817000000,c-45,usd,-6,,0001045810,2024-01-28,,,iso4217:USD


In [14]:
import re

def try_parse_numeric(val: str):
    if val is None:
        return None
    s = str(val).strip().replace(",", "")
    # handle (1234) as -1234
    if re.fullmatch(r"\(\s*\d+(\.\d+)?\s*\)", s):
        s = "-" + s.strip("()")
    try:
        return float(s)
    except ValueError:
        return None

df_key["value_num"] = df_key["value_raw"].map(try_parse_numeric)

df_key[["qname", "value_raw", "value_num", "unit", "startDate", "endDate"]].head(15)


Unnamed: 0,qname,value_raw,value_num,unit,startDate,endDate
166,Assets,65728000000.0,65728000000.0,iso4217:USD,,
167,Assets,41182000000.0,41182000000.0,iso4217:USD,,
105,EarningsPerShareBasic,3.91,3.91,iso4217:USD,2021-02-01,2022-01-30
595,EarningsPerShareBasic,3.91,3.91,iso4217:USD,2021-02-01,2022-01-30
104,EarningsPerShareBasic,1.76,1.76,iso4217:USD,2022-01-31,2023-01-29
594,EarningsPerShareBasic,1.76,1.76,iso4217:USD,2022-01-31,2023-01-29
103,EarningsPerShareBasic,12.05,12.05,iso4217:USD,2023-01-30,2024-01-28
422,EarningsPerShareBasic,0.05,0.05,iso4217:USD,2023-10-30,2024-01-28
593,EarningsPerShareBasic,12.05,12.05,iso4217:USD,2023-01-30,2024-01-28
108,EarningsPerShareDiluted,3.85,3.85,iso4217:USD,2021-02-01,2022-01-30


In [None]:
# Ensure datetime
df_key["endDate"] = pd.to_datetime(df_key["endDate"], errors="coerce")

rules = []

# EPS Diluted <= EPS Basic (latest period only)
eps_check = df_key[df_key["qname"].str.contains("EarningsPerShare")]
latest_end = df_key["endDate"].max()
eps_latest = eps_check[eps_check["endDate"] == latest_end]

if not eps_latest.empty:
    eps_basic = eps_latest[eps_latest["qname"] == "EarningsPerShareBasic"]["value_num"].max()
    eps_diluted = eps_latest[eps_latest["qname"] == "EarningsPerShareDiluted"]["value_num"].max()
    if eps_diluted is not None and eps_basic is not None:
        if eps_diluted <= eps_basic:
            rules.append("✅ EPS Diluted ≤ EPS Basic")
        else:
            rules.append("❌ EPS Diluted > EPS Basic")

# Assets = Liabilities + Equity
assets = df_key[df_key["qname"] == "Assets"]["value_num"].max()
equity = df_key[df_key["qname"] == "StockholdersEquity"]["value_num"].max()
liab = df_key[df_key["qname"] == "Liabilities"]["value_num"].max()
if all(v is not None for v in [assets, equity, liab]):
    if abs(assets - (equity + liab)) < 1e6:
        rules.append("✅ Assets = Liabilities + Equity (balance sheet matches)")
    else:
        rules.append("❌ Balance sheet mismatch")

rules



  eps_latest = eps_check[df_key["endDate"] == latest_end]


['EPS Diluted ≤ EPS Basic',
 'Assets = Liabilities + Equity (balance sheet matches)']