In [1]:
import sys
from pathlib import Path

# Resolve the project root as the parent of the kernel's working directory.
# NOTE(review): this is only correct when the kernel is started from the
# notebooks folder; from any other cwd, root_dir points somewhere else.
root_dir = Path.cwd().parent  # Go one folder up from /notebooks
print("Root directory:", root_dir)

# Add the root to sys.path only if it is missing, so re-running this cell
# does not append duplicate entries.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# Smoke test: confirm the project modules import. These come after the
# sys.path update above, presumably because the `src` package sits directly
# under root_dir and is not otherwise installed — TODO confirm.
from src.data_prep.fsds_loader import load_fsds_from_zip
from src.data_prep.extractor_bs import extract_balance_sheets
from src.data_prep.extractor_is import extract_income_statements
from src.data_prep.extractor_cf import extract_cash_flows
from src.data_prep.silver_shares import build_silver_shares
print("Imports successful!")

Root directory: /Users/agalyaayyadurai/Documents/Dissertation/Automated-Equity-Valuation
Imports successful!


### Filings Dataframe per Zip

In [2]:
from pathlib import Path
from src.data_prep.filings_index import build_filings_index

# Source archive for the quarter and the bronze-layer parquet target for the
# filings index built from it.
zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/bronze/filings/year_quarter=2025Q2/filings.parquet")

# out_path is handed to the builder; this cell performs no write of its own.
# NOTE(review): presumably build_filings_index persists the frame to out_path
# and creates missing parent folders — confirm against its implementation.
filings_df = build_filings_index(zip_path, out_path=out_path)

# Every summary line carries a label so the recorded output can be read
# without looking back at the code that produced it.
print("Shape:", filings_df.shape)
print("Unique SIC codes:", filings_df["sic"].nunique())
print("Unique companies:", filings_df["cik"].nunique())
print("Form breakdown:\n", filings_df["form"].value_counts())

(1252, 9)


### Balance Sheet Dataframe per zip

In [5]:
from pathlib import Path
from src.data_prep.extractor_bs import extract_balance_sheets

# Source archive for the quarter and the bronze-layer parquet target for the
# balance-sheet facts.
zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/bronze/bs/year_quarter=2025Q2/bs.parquet")

# Run the extractor on the quarterly archive; the result is kept in memory
# and only written to disk at the end of this cell.
bs_df = extract_balance_sheets(zip_path)

# NOTE(review): in the recorded run, head() shows cik/name/form/fy/fp/period
# as NaN for the first rows — check whether the extractor is populating the
# filing-metadata columns as intended.
print("Shape:", bs_df.shape)               # (rows, columns); the recorded 2025q2 run gave (1278344, 17)
print("Columns:", bs_df.columns.tolist())  # recorded run lists adsh, tag, version, ddate, ..., sic, source_zip
print("\nUnique filings:", bs_df['adsh'].nunique())
print("Unique tags:", bs_df['tag'].nunique())
print(bs_df.head())

# Create the partition folder if needed, then write the frame without its
# index and report how many rows were saved.
out_path.parent.mkdir(parents=True, exist_ok=True)
bs_df.to_parquet(out_path, index=False)
print(f"Saved {len(bs_df):,} rows to {out_path}")


Extracted 1,278,344 BS facts from 2025q2.zip | filings=7,006
Shape: (1278344, 17)
Columns: ['adsh', 'tag', 'version', 'ddate', 'qtrs', 'uom', 'coreg', 'value', 'cik', 'name', 'form', 'fy', 'fp', 'period', 'filed', 'sic', 'source_zip']

Unique filings: 7006
Unique tags: 10674
                   adsh                                   tag       version  \
0  0000002488-25-000047                AccountsPayableCurrent  us-gaap/2025   
1  0000002488-25-000047                         AssetsCurrent  us-gaap/2025   
2  0000002488-25-000047               CommonStockSharesIssued  us-gaap/2025   
3  0000002488-25-000047            DeferredIncomeTaxAssetsNet  us-gaap/2025   
4  0000002488-25-000047  IntangibleAssetsNetExcludingGoodwill  us-gaap/2025   

      ddate qtrs     uom coreg         value  cik name form   fy   fp period  \
0  20250331    0     USD   NaN  2.206000e+09  NaN  NaN  NaN  NaN  NaN    NaN   
1  20250331    0     USD   NaN  2.159500e+10  NaN  NaN  NaN  NaN  NaN    NaN   
2  202503

### Income Statement Dataframe per zip

In [6]:
from pathlib import Path
from src.data_prep.extractor_is import extract_income_statements

# Source archive for the quarter and the bronze-layer parquet target for the
# income-statement facts.
zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/bronze/is/year_quarter=2025Q2/is.parquet")
is_df = extract_income_statements(zip_path)

# Size and coverage summary, followed by a preview of the first rows.
print("Shape:", is_df.shape)
print("Unique filings:", is_df["adsh"].nunique())
print("Unique tags:", is_df["tag"].nunique())
print(is_df.head())


# Create the partition folder if needed and write the frame without its
# index. The confirmation line matches the balance-sheet and cash-flow cells,
# so a completed write is visible in the cell output.
out_path.parent.mkdir(parents=True, exist_ok=True)
is_df.to_parquet(out_path, index=False)
print(f"Saved {len(is_df):,} rows to {out_path}")

Extracted 711,229 IS facts from 2025q2.zip | filings=7,002
Shape: (711229, 17)
Unique filings: 7002
Unique tags: 9728
                   adsh                                    tag       version  \
0  0000002488-25-000047             CostOfGoodsAndServicesSold  us-gaap/2025   
1  0000002488-25-000047  IncomeLossFromEquityMethodInvestments  us-gaap/2025   
2  0000002488-25-000047                IncomeTaxExpenseBenefit  us-gaap/2025   
3  0000002488-25-000047                        InterestExpense  us-gaap/2025   
4  0000002488-25-000047                    OperatingIncomeLoss  us-gaap/2025   

      ddate qtrs  uom coreg         value  cik name form   fy   fp period  \
0  20240331    1  USD   NaN  2.913000e+09  NaN  NaN  NaN  NaN  NaN    NaN   
1  20250331    1  USD   NaN  7.000000e+06  NaN  NaN  NaN  NaN  NaN    NaN   
2  20250331    1  USD   NaN  1.230000e+08  NaN  NaN  NaN  NaN  NaN    NaN   
3  20240331    1  USD   NaN  2.500000e+07  NaN  NaN  NaN  NaN  NaN    NaN   
4  20240331    1

### Cash Flow Statement Dataframe per zip

In [2]:
from pathlib import Path
from src.data_prep.extractor_cf import extract_cash_flows

# Source archive for the quarter and the bronze-layer parquet target for the
# cash-flow facts.
zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/bronze/cf/year_quarter=2025Q2/cf.parquet")
cf_df = extract_cash_flows(zip_path)

# Size and coverage summary, followed by a preview of the first rows.
print("Shape:", cf_df.shape)
print("Unique filings:", cf_df["adsh"].nunique())
print("Unique tags:", cf_df["tag"].nunique())
print(cf_df.head())


# Create the partition folder if needed, write the frame without its index,
# and report how many rows were saved.
out_path.parent.mkdir(parents=True, exist_ok=True)
cf_df.to_parquet(out_path, index=False)
print(f"Saved {len(cf_df):,} rows to {out_path}")

Extracted 597,068 CF facts from 2025q2.zip | filings=6,954
Shape: (597068, 17)
Unique filings: 6954
Unique tags: 29467
                   adsh                                                tag  \
0  0000002488-25-000047  CashAndCashEquivalentsAtCarryingValueIncluding...   
1  0000002488-25-000047                                 InventoryWriteDown   
2  0000002488-25-000047         PaymentsToAcquirePropertyPlantAndEquipment   
3  0000002488-25-000047              PaymentsToAcquireShortTermInvestments   
4  0000002488-25-000047                 ProceedsFromIssuanceOfLongTermDebt   

        version     ddate qtrs  uom coreg         value  cik name form   fy  \
0  us-gaap/2025  20240331    0  USD   NaN  4.190000e+09  NaN  NaN  NaN  NaN   
1  us-gaap/2025  20250331    1  USD   NaN  0.000000e+00  NaN  NaN  NaN  NaN   
2  us-gaap/2025  20250331    1  USD   NaN  2.120000e+08  NaN  NaN  NaN  NaN   
3  us-gaap/2025  20240331    1  USD   NaN  4.330000e+08  NaN  NaN  NaN  NaN   
4  us-gaap/2025  

### SILVER - Shares dataframe per zip

In [2]:
from pathlib import Path

# No manual module reload here: neither `reload` nor `tm` is defined by this
# cell or by any cell above it, so such a call raises NameError on
# Restart Kernel -> Run All. While iterating on the src modules, enable
# `%load_ext autoreload` / `%autoreload 2` once in a setup cell instead.
from src.data_prep.silver_shares import build_silver_shares

# Source archive for the quarter and the silver-layer parquet target for the
# wide shares table.
zip_path = Path("../data/raw/2025q2.zip")
out = Path("../data/silver/shares/year_quarter=2025Q2/shares_wide.parquet")

# The output path is handed to the builder; this cell does not write anything
# itself. NOTE(review): presumably build_silver_shares persists to out_path —
# confirm against its implementation.
shares_2025q2 = build_silver_shares(zip_path, out_path=out)
print(shares_2025q2.head())

                   adsh   cik                            name    fy    period  \
0  0000002488-25-000047  2488      ADVANCED MICRO DEVICES INC  2025  20250331   
1  0000002969-25-000027  2969  AIR PRODUCTS & CHEMICALS, INC.  2025  20250331   
2  0000003499-25-000009  3499                  ALEXANDERS INC  2025  20250331   
3  0000003545-25-000079  3545                     ALICO, INC.  2025  20250331   
4  0000003570-25-000049  3570             CHENIERE ENERGY INC  2025  20250331   

      filed  form   sic  CommonSharesAuth  CommonSharesIssuable  ...  \
0  20250507  10-Q  3674      2.250000e+09                   NaN  ...   
1  20250501  10-Q  2810               NaN                   NaN  ...   
2  20250505  10-Q  6798      1.000000e+07                   NaN  ...   
3  20250513  10-Q  0100      1.500000e+07                   NaN  ...   
4  20250508  10-Q  4924      4.800000e+08                   NaN  ...   

   TreasuryShares  TrustShares  UnitsAuth  UnitsIssued  UnitsOut  WASOBasic  \
0

### Suggest tag map run

In [2]:
from pathlib import Path
from src.tools.suggest_tag_map import suggest_from_zip, save_suggestions

# Build tag-map suggestions from the 2025q2 archive and hand them to
# save_suggestions together with a target path under ../logs/coverage.
# NOTE(review): top_n=60 presumably caps how many candidates are kept per
# group, and the target looks like a path prefix or folder rather than a
# single file — confirm both against src/tools/suggest_tag_map.
sug = suggest_from_zip(Path("../data/raw/2025q2.zip"), top_n=60)
save_suggestions(sug, Path("../logs/coverage/suggestions_2025Q2"))


### SILVER - Extract Balance Sheet into a dataframe