In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.argv = [sys.argv[0]]

In [4]:
from alphaledger.config import settings
from alphaledger.universe import load_from_yaml
from alphaledger.sec import EDGARFetcher, load_ticker_to_cik_mapping
import pathlib

edgar_fetcher = EDGARFetcher(settings.sec_user_agent)
universe_path = pathlib.Path(settings.universe_dir, settings.universe_name + ".yaml")

universe = load_from_yaml(universe_path)
uni_tickers = universe.get_tickers()

print(uni_tickers)

INFO:alphaledger.universe:Automatically triggering filings load for universe 'Cloud Computing Giants'...
INFO:alphaledger.universe:No recent filings file found at /Users/forrest.murray/Documents/alphaledger/output/filings/Cloud Computing Giants/sec_filings.parquet or force_fetch=True. Fetching from SEC.
INFO:alphaledger.universe:Fetching filings from SEC EDGAR...
INFO:alphaledger.universe:Using 16 CIKs for fetching.
INFO:alphaledger.sec:Searching for AMZN (CIK: 0001018724) filings between 2020-2025
INFO:alphaledger.sec:Fetching filings index from: https://data.sec.gov/submissions/CIK0001018724.json
INFO:alphaledger.sec:Successfully fetched filings index for CIK 0001018724
INFO:alphaledger.sec:Found 6 10-K filings for AMZN between 2020-2025
INFO:alphaledger.sec:Searching for MSFT (CIK: 0000789019) filings between 2020-2025
INFO:alphaledger.sec:Fetching filings index from: https://data.sec.gov/submissions/CIK0000789019.json
INFO:alphaledger.sec:Successfully fetched filings index for CIK 

['AMZN', 'MSFT', 'GOOGL', 'IBM', 'ORCL', 'CRM', 'BABA', 'CSCO', 'AVGO', 'SAP', 'ADBE', 'WDAY', 'ZM', 'NOW', 'NET', 'DDOG']


In [5]:
ticker_to_cik_mapping = load_ticker_to_cik_mapping(uni_tickers)
filings_df = edgar_fetcher.fetch_filings_for_universe(universe, ticker_to_cik_mapping)

INFO:alphaledger.sec:Searching for AMZN (CIK: 0001018724) filings between 2020-2025
INFO:alphaledger.sec:Fetching filings index from: https://data.sec.gov/submissions/CIK0001018724.json
INFO:alphaledger.sec:Successfully fetched filings index for CIK 0001018724
INFO:alphaledger.sec:Found 6 10-K filings for AMZN between 2020-2025
INFO:alphaledger.sec:Searching for MSFT (CIK: 0000789019) filings between 2020-2025
INFO:alphaledger.sec:Fetching filings index from: https://data.sec.gov/submissions/CIK0000789019.json
INFO:alphaledger.sec:Successfully fetched filings index for CIK 0000789019
INFO:alphaledger.sec:Found 5 10-K filings for MSFT between 2020-2025
INFO:alphaledger.sec:Searching for GOOGL (CIK: 0001652044) filings between 2020-2025
INFO:alphaledger.sec:Fetching filings index from: https://data.sec.gov/submissions/CIK0001652044.json
INFO:alphaledger.sec:Successfully fetched filings index for CIK 0001652044
INFO:alphaledger.sec:Found 4 10-K filings for GOOGL between 2020-2025
INFO:alp

In [7]:
edgar_fetcher.save_filings_to_disk(filings_df, settings.output_dir / "filings")

Saved 73 filings to /Users/forrest.murray/Documents/alphaledger/output/filings/sec_filings_universe.parquet


'/Users/forrest.murray/Documents/alphaledger/output/filings/sec_filings_universe.parquet'

In [6]:
edgar_fetcher.save_filings_to_disk(filings_df, settings.output_dir / "filings", file_format="delta")

INFO:alphaledger.sec:Saved 73 filings to /Users/forrest.murray/Documents/alphaledger/output/filings/sec_filings_universe.delta


'/Users/forrest.murray/Documents/alphaledger/output/filings/sec_filings_universe.delta'

In [36]:
import pathlib
filings_df = edgar_fetcher.load_filings_from_disk("/Users/forrest.murray/Documents/alphaledger/output/filings/sec_filings_universe.delta")

INFO:alphaledger.sec:Loading Delta file: /Users/forrest.murray/Documents/alphaledger/output/filings/sec_filings_universe.delta


In [7]:
import logging
from xbrl.cache import HttpCache
from xbrl.instance import XbrlParser, XbrlInstance

from alphaledger.config import settings
# just to see which files are downloaded
# logging.basicConfig(level=logging.INFO)

cache: HttpCache = HttpCache('./cache')
cache.set_headers({'From': 'forrest.murray@databricks.com', 'User-Agent': settings.sec_user_agent})
parser = XbrlParser(cache)

# schema_url = "https://www.sec.gov/Archives/edgar/data/1018724/000101872424000008/amzn-20231231.htm"
# inst: XbrlInstance = parser.parse_instance(schema_url)

In [42]:
from alphaledger.process_xbrl import IXBRLDocumentParser

ixbrl_parser = IXBRLDocumentParser("./cache/www.sec.gov/Archives/edgar/data/1018724/000101872424000008/amzn-20231231.htm", gaap_facts)

document = ixbrl_parser.parse()


TypeError: IXBRLDocumentParser.__init__() takes 2 positional arguments but 3 were given

In [13]:
from alphaledger.process_xbrl import (
    IXBRLDocumentParser,
    TARGET_SCHEMA_NUMERIC_POLARS,
    TARGET_SCHEMA_TEXT_POLARS
)
import polars as pl

# Initialize two accumulators with correct schemas
all_numeric_facts_df = pl.DataFrame(schema=TARGET_SCHEMA_NUMERIC_POLARS)
all_text_facts_df = pl.DataFrame(schema=TARGET_SCHEMA_TEXT_POLARS)

# Initialize parser once (assuming 'cache' is your HttpCache object)
doc_parser = IXBRLDocumentParser(cache)

for record in filings_df.iter_rows(named=True):
    url = record["xbrl_instance_url"]
    print(f"Processing: {url}")
    try:
        inst = parser.parse_instance(url)
        if not inst:
            print(f"  Skipping {url}: Instance parsing failed.")
            continue

        # Parse the document structure
        document = doc_parser.parse(xbrl_instance=inst)

        # Create the two separate DataFrames
        numeric_df = document.to_numeric_dataframe()
        text_df = document.to_text_dataframe()

        # Concatenate into respective accumulators
        if not numeric_df.is_empty():
            all_numeric_facts_df = pl.concat([all_numeric_facts_df, numeric_df], how="vertical") # Use 'vertical' - schemas should match
        if not text_df.is_empty():
            all_text_facts_df = pl.concat([all_text_facts_df, text_df], how="vertical") # Use 'vertical'

    except Exception as e:
        print(f"  ERROR processing {url}: {e}")
        # import traceback
        # traceback.print_exc()

print("Processing complete.")
print(f"Final Numeric DF shape: {all_numeric_facts_df.shape}")
print(f"Final Text DF shape: {all_text_facts_df.shape}")


Processing: https://www.sec.gov/Archives/edgar/data/0001018724/000101872425000004/amzn-20241231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001018724/000101872424000008/amzn-20231231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001018724/000101872423000004/amzn-20221231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001018724/000101872422000005/amzn-20211231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001018724/000101872421000004/amzn-20201231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001018724/000101872420000004/amzn-20191231x10k.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000789019/000095017024087843/msft-20240630.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000789019/000095017023035122/msft-20230630.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000789019/000156459022026876/msft-10k_20220630.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000789019/000156459021039151/msft-10k_20210630.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000789019/000156459020034944/msft-10k_20200630.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001652044/000165204425000014/goog-20241231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001652044/000165204424000022/goog-20231231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001652044/000165204423000016/goog-20221231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001652044/000165204422000019/goog-20211231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000051143/000005114325000015/ibm-20241231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000051143/000005114324000012/ibm-20231231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000051143/000155837023002376/ibm-20221231x10k.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000051143/000155837022001584/ibm-20211231x10k.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000051143/000155837021001489/ibm-20201231x10k.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000051143/000155837020001334/ibm-20191231x10k2af531.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001341439/000095017024075605/orcl-20240531.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001341439/000095017023028914/orcl-20230531.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001341439/000156459022023675/orcl-10k_20220531.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001341439/000156459021033616/orcl-10k_20210531.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001341439/000156459020030125/orcl-10k_20200531.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001108524/000110852425000006/crm-20250131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001108524/000110852424000005/crm-20240131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001108524/000110852423000011/crm-20230131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000858877/000085887724000017/csco-20240727.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000858877/000085887723000023/csco-20230729.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000858877/000085887722000013/csco-20220730.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000858877/000085887721000013/csco-20210731.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000858877/000085887720000010/csco-2020725x10k.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001730168/000173016824000139/avgo-20241103.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001730168/000173016823000096/avgo-20231029.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001730168/000173016822000118/avgo-20221030.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001730168/000173016821000153/avgo-20211031.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001730168/000173016820000226/avgo-20201101.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000796343/000079634325000004/adbe-20241129.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000796343/000079634324000006/adbe-20231201.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000796343/000079634323000007/adbe-20221202.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000796343/000079634322000032/adbe-20211203.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000796343/000079634321000004/adbe-20201127.htm




Processing: https://www.sec.gov/Archives/edgar/data/0000796343/000079634320000013/adbe10kfy19.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001327811/000132781125000056/wday-20250131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001327811/000132781124000044/wday-20240131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001327811/000132781123000024/wday-20230131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001327811/000132781122000030/wday-20220131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001327811/000132781121000020/wday-20210131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001327811/000132781120000022/wday-20200131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001585521/000158552125000042/zm-20250131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001585521/000158552124000030/zm-20240131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001585521/000158552123000035/zm-20230131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001585521/000158552122000037/zm-20220131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001585521/000158552121000048/zm-20210131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001585521/000158552120000095/zm-20200131.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001373715/000137371525000010/now-20241231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001373715/000137371524000030/now-20231231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001373715/000137371523000035/now-20221231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001373715/000137371522000024/now-20211231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001373715/000137371521000061/now-20201231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001477333/000147733325000043/cloud-20241231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001477333/000147733324000013/cloud-20231231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001477333/000147733323000017/cloud-20221231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001477333/000147733322000008/cloud-20211231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001477333/000147733321000009/cloud-20201231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001477333/000147733320000010/cloud-20191231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001561550/000156155025000025/ddog-20241231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001561550/000156155024000009/ddog-20231231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001561550/000156155023000006/ddog-20221231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001561550/000156155022000009/ddog-20211231.htm




Processing: https://www.sec.gov/Archives/edgar/data/0001561550/000156459021009770/ddog-10k_20201231.htm
  ERROR processing https://www.sec.gov/Archives/edgar/data/0001561550/000156459021009770/ddog-10k_20201231.htm: not well-formed (invalid token): line 7, column 2
Processing complete.
Final Numeric DF shape: (42969, 13)
Final Text DF shape: (6620, 11)


In [14]:
all_numeric_facts_df.write_delta(settings.output_dir / "numeric_facts")

In [15]:
all_text_facts_df.write_delta(settings.output_dir / "text_facts")

In [51]:
from alphaledger.formatter import MarkdownFormatter

with open("output.md", "w") as f:
    f.write(document.to_string(MarkdownFormatter()))


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  


In [11]:
facts_df = document.to_dataframe(format="polars")
facts_df

concept_name,concept_namespace,fact_value,fact_type,section_name,context_entity,metadata,unit
str,str,f64,str,str,str,struct[1],str
"""DerivativeAssetStatementOfFina…","""elts""",,"""TextFact""","""Document Start""","""0001018724""",{null},
"""DerivativeAssetStatementOfFina…","""elts""",,"""TextFact""","""Document Start""","""0001018724""",{null},
"""PropertyPlantAndEquipmentUsefu…","""elts""",,"""TextFact""","""Document Start""","""0001018724""",{null},
"""FinanceLeaseRightOfUseAssetSta…","""elts""",,"""TextFact""","""Document Start""","""0001018724""",{null},
"""FinanceLeaseRightOfUseAssetSta…","""elts""",,"""TextFact""","""Document Start""","""0001018724""",{null},
…,…,…,…,…,…,…,…
"""Depreciation""","""elts""",9.8760e9,"""NumericFact""","""Document Start""","""0001018724""",{-6},"""iso4217:USD"""
"""Depreciation""","""elts""",1.2531e10,"""NumericFact""","""Document Start""","""0001018724""",{-6},"""iso4217:USD"""
"""Depreciation""","""elts""",2.2909e10,"""NumericFact""","""Document Start""","""0001018724""",{-6},"""iso4217:USD"""
"""Depreciation""","""elts""",2.4924e10,"""NumericFact""","""Document Start""","""0001018724""",{-6},"""iso4217:USD"""


In [17]:
from alphaledger.universe import Universe
from alphaledger.config import settings

universe = Universe(settings.universe_name)
# filings_df = universe.get_filings()

# numeric_df, text_df = process_filing_urls(filings_df)

INFO:alphaledger.universe:Loading universe definition from: /Users/forrest.murray/Documents/alphaledger/universes/sectors/cloud_computing.yaml
INFO:alphaledger.universe:Found existing filings metadata file: /Users/forrest.murray/Documents/alphaledger/output/sec_filings/Cloud_Computing_Giants.delta. Initializing LazyFrame.
INFO:alphaledger.universe:Initialized Universe 'Cloud_Computing_Giants' from /Users/forrest.murray/Documents/alphaledger/universes/sectors/cloud_computing.yaml with 16 securities. Filings metadata status: Detected.


In [18]:
from alphaledger.config import settings

settings.verbose = True
ibm_facts = universe.get_security_numeric_facts("IBM").collect()

INFO:alphaledger.universe:Cache miss: No cached numeric facts found for IBM at /Users/forrest.murray/Documents/alphaledger/output/numeric_facts/Cloud_Computing_Giants.delta. Processing required.
INFO:alphaledger.universe:Fetching numeric facts for IBM in universe 'Cloud_Computing_Giants'...
INFO:alphaledger.universe:Found 6 filings for IBM. Processing XBRL...
INFO:alphaledger.process_xbrl:[process_filing_urls_direct] Starting direct fact extraction.
INFO:alphaledger.process_xbrl:[_direct_fact_to_row] Fact EntityPublicFloat: Found InstantContext - Instant: 2024-06-30 (Type: <class 'datetime.date'>)
INFO:alphaledger.process_xbrl:[_direct_fact_to_row] Fact EntityCommonStockSharesOutstanding: Found InstantContext - Instant: 2025-02-10 (Type: <class 'datetime.date'>)
INFO:alphaledger.process_xbrl:[_direct_fact_to_row] Fact ValuationAllowancesAndReservesBalance: Found InstantContext - Instant: 2023-12-31 (Type: <class 'datetime.date'>)
INFO:alphaledger.process_xbrl:[_direct_fact_to_row] Fact

In [19]:
ibm_facts

concept_name,concept_namespace,fact_type,period_instant,period_start,period_end,context_id,context_entity,context_scenario,filing_date,report_date,fact_value,unit,metadata,ticker
str,str,str,date,date,date,str,str,str,date,date,f64,str,struct[2],str
"""EntityPublicFloat""","""2024""","""NumericFact""",2024-06-30,,,,,,2025-02-25,2024-12-31,1.5920e11,"""iso4217:USD""","{-8,null}","""IBM"""
"""EntityCommonStockSharesOutstan…","""2024""","""NumericFact""",2025-02-10,,,,,,2025-02-25,2024-12-31,9.27264332e8,"""xbrli:shares""","{null,null}","""IBM"""
"""ValuationAllowancesAndReserves…","""elts""","""NumericFact""",2023-12-31,,,,,,2025-02-25,2024-12-31,4.31e8,"""iso4217:USD""","{-6,null}","""IBM"""
"""ValuationAllowancesAndReserves…","""elts""","""NumericFact""",,2024-01-01,2024-12-31,,,,2025-02-25,2024-12-31,-1.1e7,"""iso4217:USD""","{-6,null}","""IBM"""
"""ValuationAllowancesAndReserves…","""elts""","""NumericFact""",,2024-01-01,2024-12-31,,,,2025-02-25,2024-12-31,1.46e8,"""iso4217:USD""","{-6,null}","""IBM"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ValuationAllowancesAndReserves…","""elts""","""NumericFact""",2016-12-31,,,,,,2020-02-25,2019-12-31,4.81e8,"""iso4217:USD""","{-6,null}","""IBM"""
"""ValuationAllowancesAndReserves…","""elts""","""NumericFact""",,2017-01-01,2017-12-31,,,,2020-02-25,2019-12-31,1.0060e9,"""iso4217:USD""","{-6,null}","""IBM"""
"""ValuationAllowancesAndReserves…","""elts""","""NumericFact""",,2017-01-01,2017-12-31,,,,2020-02-25,2019-12-31,1.0560e9,"""iso4217:USD""","{-6,null}","""IBM"""
"""ValuationAllowancesAndReserves…","""elts""","""NumericFact""",,2017-01-01,2017-12-31,,,,2020-02-25,2019-12-31,2e7,"""iso4217:USD""","{-6,null}","""IBM"""


In [None]:
import polars as pl

# universe.fetch_or_load_filings()

# numeric_df.filter(pl.col("context_entity").is_in(["0001018724"]))\

# numeric_df.head(1)

INFO:alphaledger.universe:Found recent filings file (2 days old) at: /Users/forrest.murray/Documents/alphaledger/output/sec_filings_sectors/cloud_computing.delta
INFO:alphaledger.universe:Loading filings from /Users/forrest.murray/Documents/alphaledger/output/sec_filings_sectors/cloud_computing.delta...
INFO:alphaledger.sec:Loading Delta file: /Users/forrest.murray/Documents/alphaledger/output/sec_filings_sectors/cloud_computing.delta
INFO:alphaledger.universe:Successfully loaded 27 filings from disk.


In [15]:
universe.get_all_securities()

[]