In [1]:
import sys
from pathlib import Path

root_dir = Path.cwd().parent  # Go one folder up from /notebooks
print("Root directory:", root_dir)

if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# testing
from src.data_prep.bronze_extractor.fsds_loader import load_fsds_from_zip
from src.data_prep.bronze_extractor.extractor_bs import extract_balance_sheets
from src.data_prep.bronze_extractor.extractor_is import extract_income_statements
from src.data_prep.bronze_extractor.extractor_cf import extract_cash_flows
print("Imports successful!")

Root directory: /Users/agalyaayyadurai/Automated-Equity-Valuation
Imports successful!


### Filings Dataframe per Zip

In [2]:
from pathlib import Path
from src.data_prep.bronze_extractor.filings_index import build_filings_index

zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/bronze/filings/year_quarter=2025Q2/filings.parquet")

filings_df = build_filings_index(zip_path, out_path=out_path)

print(filings_df.shape)
print(filings_df['sic'].nunique())
print("Unique companies:", filings_df["cik"].nunique())
print("Form breakdown:\n", filings_df["form"].value_counts())

(1252, 9)


### Balance Sheet Dataframe per zip

In [5]:
from pathlib import Path
from src.data_prep.bronze_extractor.extractor_bs import extract_balance_sheets

zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/bronze/bs/year_quarter=2025Q2/bs.parquet")

# Run the extractor
bs_df = extract_balance_sheets(zip_path)

print("Shape:", bs_df.shape)               # e.g. (50000, 15)
print("Columns:", bs_df.columns.tolist())  # should list adsh, tag, ddate, value, etc.
print("\nUnique filings:", bs_df['adsh'].nunique())
print("Unique tags:", bs_df['tag'].nunique())
print(bs_df.head())

# Save output neatly
out_path.parent.mkdir(parents=True, exist_ok=True)
bs_df.to_parquet(out_path, index=False)
print(f"Saved {len(bs_df):,} rows to {out_path}")


Extracted 1,278,344 BS facts from 2025q2.zip | filings=7,006
Shape: (1278344, 17)
Columns: ['adsh', 'tag', 'version', 'ddate', 'qtrs', 'uom', 'coreg', 'value', 'cik', 'name', 'form', 'fy', 'fp', 'period', 'filed', 'sic', 'source_zip']

Unique filings: 7006
Unique tags: 10674
                   adsh                                   tag       version  \
0  0000002488-25-000047                AccountsPayableCurrent  us-gaap/2025   
1  0000002488-25-000047                         AssetsCurrent  us-gaap/2025   
2  0000002488-25-000047               CommonStockSharesIssued  us-gaap/2025   
3  0000002488-25-000047            DeferredIncomeTaxAssetsNet  us-gaap/2025   
4  0000002488-25-000047  IntangibleAssetsNetExcludingGoodwill  us-gaap/2025   

      ddate qtrs     uom coreg         value  cik name form   fy   fp period  \
0  20250331    0     USD   NaN  2.206000e+09  NaN  NaN  NaN  NaN  NaN    NaN   
1  20250331    0     USD   NaN  2.159500e+10  NaN  NaN  NaN  NaN  NaN    NaN   
2  202503

### Income Statement Dataframe per zip

In [6]:
from pathlib import Path
from src.data_prep.bronze_extractor.extractor_is import extract_income_statements

zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/bronze/is/year_quarter=2025Q2/is.parquet")
is_df = extract_income_statements(zip_path)

print("Shape:", is_df.shape)
print("Unique filings:", is_df["adsh"].nunique())
print("Unique tags:", is_df["tag"].nunique())
print(is_df.head())


out_path.parent.mkdir(parents=True, exist_ok=True)
is_df.to_parquet(out_path, index=False)

Extracted 711,229 IS facts from 2025q2.zip | filings=7,002
Shape: (711229, 17)
Unique filings: 7002
Unique tags: 9728
                   adsh                                    tag       version  \
0  0000002488-25-000047             CostOfGoodsAndServicesSold  us-gaap/2025   
1  0000002488-25-000047  IncomeLossFromEquityMethodInvestments  us-gaap/2025   
2  0000002488-25-000047                IncomeTaxExpenseBenefit  us-gaap/2025   
3  0000002488-25-000047                        InterestExpense  us-gaap/2025   
4  0000002488-25-000047                    OperatingIncomeLoss  us-gaap/2025   

      ddate qtrs  uom coreg         value  cik name form   fy   fp period  \
0  20240331    1  USD   NaN  2.913000e+09  NaN  NaN  NaN  NaN  NaN    NaN   
1  20250331    1  USD   NaN  7.000000e+06  NaN  NaN  NaN  NaN  NaN    NaN   
2  20250331    1  USD   NaN  1.230000e+08  NaN  NaN  NaN  NaN  NaN    NaN   
3  20240331    1  USD   NaN  2.500000e+07  NaN  NaN  NaN  NaN  NaN    NaN   
4  20240331    1

### Cash Flow Statement Dataframe per zip

In [2]:
from pathlib import Path
from src.data_prep.bronze_extractor.extractor_cf import extract_cash_flows

zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/bronze/cf/year_quarter=2025Q2/cf.parquet")
cf_df = extract_cash_flows(zip_path)

print("Shape:", cf_df.shape)
print("Unique filings:", cf_df["adsh"].nunique())
print("Unique tags:", cf_df["tag"].nunique())
print(cf_df.head())


out_path.parent.mkdir(parents=True, exist_ok=True)
cf_df.to_parquet(out_path, index=False)
print(f"Saved {len(cf_df):,} rows to {out_path}")

Extracted 597,068 CF facts from 2025q2.zip | filings=6,954
Shape: (597068, 17)
Unique filings: 6954
Unique tags: 29467
                   adsh                                                tag  \
0  0000002488-25-000047  CashAndCashEquivalentsAtCarryingValueIncluding...   
1  0000002488-25-000047                                 InventoryWriteDown   
2  0000002488-25-000047         PaymentsToAcquirePropertyPlantAndEquipment   
3  0000002488-25-000047              PaymentsToAcquireShortTermInvestments   
4  0000002488-25-000047                 ProceedsFromIssuanceOfLongTermDebt   

        version     ddate qtrs  uom coreg         value  cik name form   fy  \
0  us-gaap/2025  20240331    0  USD   NaN  4.190000e+09  NaN  NaN  NaN  NaN   
1  us-gaap/2025  20250331    1  USD   NaN  0.000000e+00  NaN  NaN  NaN  NaN   
2  us-gaap/2025  20250331    1  USD   NaN  2.120000e+08  NaN  NaN  NaN  NaN   
3  us-gaap/2025  20240331    1  USD   NaN  4.330000e+08  NaN  NaN  NaN  NaN   
4  us-gaap/2025  

### SILVER - Shares dataframe per zip

In [2]:
from pathlib import Path

reload(tm)
from src.data_prep.silver_shares import build_silver_shares

zip_path = Path("../data/raw/2025q2.zip")
out = Path("../data/silver/shares/year_quarter=2025Q2/shares_wide.parquet")

shares_2025q2 = build_silver_shares(zip_path, out_path=out)
print(shares_2025q2.head())

                   adsh   cik                            name    fy    period  \
0  0000002488-25-000047  2488      ADVANCED MICRO DEVICES INC  2025  20250331   
1  0000002969-25-000027  2969  AIR PRODUCTS & CHEMICALS, INC.  2025  20250331   
2  0000003499-25-000009  3499                  ALEXANDERS INC  2025  20250331   
3  0000003545-25-000079  3545                     ALICO, INC.  2025  20250331   
4  0000003570-25-000049  3570             CHENIERE ENERGY INC  2025  20250331   

      filed  form   sic  CommonSharesAuth  CommonSharesIssuable  ...  \
0  20250507  10-Q  3674      2.250000e+09                   NaN  ...   
1  20250501  10-Q  2810               NaN                   NaN  ...   
2  20250505  10-Q  6798      1.000000e+07                   NaN  ...   
3  20250513  10-Q  0100      1.500000e+07                   NaN  ...   
4  20250508  10-Q  4924      4.800000e+08                   NaN  ...   

   TreasuryShares  TrustShares  UnitsAuth  UnitsIssued  UnitsOut  WASOBasic  \
0

### sugest tag map run

In [2]:
from pathlib import Path
from src.data_prep.tools.suggest_tag_map import suggest_from_zip, save_suggestions

sug = suggest_from_zip(Path("../data/raw/2025q2.zip"), top_n=60)
save_suggestions(sug, Path("../logs/coverage/suggestions_2025Q2"))


### SILVER - Extract Balance Sheet into a dataframe

In [2]:
from pathlib import Path
from src.data_prep.silver_transformer.transformer_bs import transform_balance_sheet_to_wide
from src.data_prep.config.tag_map_min import BS_NEW

# Example usage
zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/silver/bs/year_quarter=2025Q2/bs.parquet")

bs_df, unknown = transform_balance_sheet_to_wide(zip_path, tag_map=BS_NEW, out_path=out_path)

print(bs_df.shape)
print(bs_df.columns.to_list())
print(bs_df[['TotalAssets', 'TotalLiabilities', 'ShareholdersEquity']])
print(unknown["tag"].value_counts().head(25))

Extracted 1,278,344 BS facts from 2025q2.zip | filings=7,006
(1096, 38)
['adsh', 'cik', 'name', 'fy', 'filed', 'period', 'sic', 'APIC', 'AccountsPayable', 'AccountsReceivable', 'AccumulatedOCI', 'CashAndCashEquivalents', 'CommonStockParValue', 'CommonStockValue', 'Goodwill', 'Intangibles', 'Inventory', 'LeaseLiabilityCurrent', 'LeaseLiabilityNoncurrent', 'LeaseROUAsset', 'LongTermDebt', 'NoncontrollingInterest', 'OtherCurrentLiabilities', 'OtherNoncurrentLiabilities', 'PPandE', 'PreferredStockParValue', 'PreferredStockValue', 'RetainedEarnings', 'ShareholdersEquity', 'ShortTermDebt', 'ShortTermInvestments', 'TemporaryEquity', 'TotalAssets', 'TotalCurrentAssets', 'TotalCurrentLiabilities', 'TotalLiabilities', 'TotalNoncurrentAssets', 'TreasuryStock']
      TotalAssets  TotalLiabilities  ShareholdersEquity
130  7.767200e+07      1.065200e+07        5.314000e+07
194  8.175870e+08               NaN        5.088130e+08
716  1.859600e+10      8.521000e+09        1.007500e+10
119  1.082300e+1

In [6]:
import pandas as pd
# bs = bs_df.copy()

# # 1) Balance sheet identity: Assets ≈ Liabilities + Equity
# bs["liab_plus_equity"] = bs["TotalLiabilities"].fillna(0) + bs["ShareholdersEquity"].fillna(0)
# bs["bs_diff"] = (bs["TotalAssets"] - bs["liab_plus_equity"]).abs()
# bad_bs = bs[bs["bs_diff"] > 1e-3]  # tolerance; adjust for rounding
# print("Out-of-balance filings:", len(bad_bs))

# # 2) Debt composition sanity
# bs["TotalDebt_calc"] = bs["ShortTermDebt"].fillna(0) + bs["LongTermDebt"].fillna(0)

# # 3) Working capital & current ratio (handy for modeling)
# bs["WorkingCapital"] = bs["TotalCurrentAssets"].fillna(0) - bs["TotalCurrentLiabilities"].fillna(0)
# bs["CurrentRatio"] = bs["TotalCurrentAssets"] / bs["TotalCurrentLiabilities"]

# # 4) Intangibles share of assets (useful flag)
# bs["IntangiblesShare"] = (bs["Goodwill"].fillna(0) + bs["Intangibles"].fillna(0)) / bs["TotalAssets"]

# print(bs[["adsh","cik","fy","bs_diff","CurrentRatio","IntangiblesShare"]].head())

# bs_df["bs_diff"] = abs(bs_df["TotalAssets"] - (bs_df["TotalLiabilities"] + bs_df["ShareholdersEquity"]))
# (bs_df["bs_diff"] < 1000).mean()

bs = bs_df.copy()
bs["bs_diff"] = (bs["TotalAssets"] - (bs["TotalLiabilities"].fillna(0) + bs["ShareholdersEquity"].fillna(0) + + bs["TemporaryEquity"].fillna(0))).abs()
print("Out-of-balance (>$1k):", (bs["bs_diff"] > 1_000).sum())

balanced_ratio = (abs(bs["TotalAssets"] - (bs["TotalLiabilities"].fillna(0) + bs["ShareholdersEquity"].fillna(0) + bs["TemporaryEquity"].fillna(0))) < 1000).mean()
print(f"Balanced ratio: {balanced_ratio:.1%}")


Out-of-balance (>$1k): 823
Balanced ratio: 22.7%


In [5]:
dupes = bs_df.groupby(["cik", "fy"]).size().reset_index(name="count")
dupes[dupes["count"] > 1]

Unnamed: 0,cik,fy,count


### SILVER - Transform Income Statement into wide DF

In [3]:
from pathlib import Path
from src.data_prep.silver_transformer.transformer_is import transform_income_statement_to_wide

# zip_path = Path("../data/raw/2025q2.zip")
# out_path = Path("../data/silver/is/year_quarter=2025Q2/is.parquet")

# is_wide, is_unknown = transform_income_statement_to_wide(zip_path, out_path=out_path, return_unknown=True)
print(is_unknown["tag"].value_counts().head(25))
is_wide.shape, is_wide.head()


tag
EarningsPerShareDiluted                                                                   733
EarningsPerShareBasic                                                                     726
InvestmentIncomeInterest                                                                  185
NetIncomeLossAttributableToNoncontrollingInterest                                         170
ProfessionalFees                                                                          156
FairValueAdjustmentOfWarrants                                                             142
NetIncomeLossAvailableToCommonStockholdersBasic                                           122
RestructuringCharges                                                                      115
LaborAndRelatedExpense                                                                     92
AssetImpairmentCharges                                                                     81
IncomeLossFromContinuingOperationsPerBasicShare         

((658, 24),
                      adsh      cik                              name    fy  \
 97   0000950170-25-091151  1000045           OLD MARKET CAPITAL CORP  2024   
 89   0000950170-25-083705  1002047                      NETAPP, INC.  2025   
 303  0001437749-25-017409  1004989                  SPAR GROUP, INC.  2024   
 99   0001005229-25-000149  1005229            COLUMBUS MCKINNON CORP  2024   
 302  0001437749-25-014208  1008586  STREAMLINE HEALTH SOLUTIONS INC.  2024   
 
         filed    period   sic  ComprehensiveIncome  CostOfRevenue  \
 97   20250630  20250331  6153                  NaN            NaN   
 89   20250609  20250430  3572                  NaN   1.959000e+09   
 303  20250516  20241231  7389           -4412000.0   1.583570e+08   
 99   20250528  20250331  3531                  NaN   6.373470e+08   
 302  20250502  20250131  7373                  NaN   6.804000e+06   
 
      DebtExtinguishmentGainLoss  ...  IncomeTaxExpense  InterestExpense  \
 97           

In [5]:
isw = is_wide.copy()
# if your canon names are: Revenues, CostOfRevenue, GrossProfit, OperatingExpenses, OperatingIncomeLoss, NetIncomeLoss
isw["gross_calc"] = isw["Revenue"].fillna(0) - isw["CostOfRevenue"].fillna(0)
isw["gross_err"] = (isw["gross_calc"] - isw["GrossProfit"].fillna(0)).abs()
print("Median gross mismatch:", isw["gross_err"].median())

Median gross mismatch: 0.0


### SILVER - Transform Cash FLow into wide DF

In [6]:
from pathlib import Path
from src.data_prep.silver_transformer.transformer_cf import transform_cash_flow_to_wide

# zip_path = Path("../data/raw/2025q2.zip")
# out_path = Path("../data/silver/cf/year_quarter=2025Q2/cf.parquet")

# cf_wide, cf_unknown = transform_cash_flow_to_wide(zip_path, out_path=out_path, return_unknown=True)
print(cf_wide.columns.to_list())
cf_wide.shape, cf_wide.head()

['adsh', 'cik', 'name', 'fy', 'filed', 'period', 'sic', 'AcquireIntangibles', 'BusinessAcquisitions', 'CFF', 'CFI', 'CFO', 'CapEx', 'ChangeInAP', 'ChangeInAR', 'ChangeInAccruedLiabilities', 'ChangeInDeferredRevenue', 'ChangeInInventory', 'ChangeInPrepaidAndOther', 'DebtIssuance', 'DebtRepayment', 'DepAmortCF', 'EquityIssuance', 'FXEffect', 'IncomeTaxesPaid', 'InterestPaid', 'ProceedsFromSalePPE', 'ROUAssetNoncash', 'ShareRepurchase', 'StockBasedComp']


((657, 30),
                      adsh      cik                              name    fy  \
 98   0000950170-25-091151  1000045           OLD MARKET CAPITAL CORP  2024   
 90   0000950170-25-083705  1002047                      NETAPP, INC.  2025   
 304  0001437749-25-017409  1004989                  SPAR GROUP, INC.  2024   
 100  0001005229-25-000149  1005229            COLUMBUS MCKINNON CORP  2024   
 303  0001437749-25-014208  1008586  STREAMLINE HEALTH SOLUTIONS INC.  2024   
 
         filed    period   sic  AcquireIntangibles  BusinessAcquisitions  \
 98   20250630  20250331  6153                 NaN                   NaN   
 90   20250609  20250430  3572                 NaN                   0.0   
 304  20250516  20241231  7389                 NaN                   NaN   
 100  20250528  20250331  3531                 NaN                   0.0   
 303  20250502  20250131  7373                 NaN                   NaN   
 
              CFF  ...  DebtRepayment   DepAmortCF  Eq

In [7]:
cf = cf_wide.copy()

cf["calc_delta_cash"] = cf[["CFO", "CFI", "CFF"]].sum(axis=1, skipna=True)
cf["delta_cash_err"] = cf["calc_delta_cash"].abs()

print("Median |CFO+CFI+CFF| (proxy for ΔCash):", cf["delta_cash_err"].median())
cf[["CFO", "CFI", "CFF", "calc_delta_cash"]].head()

Median |CFO+CFI+CFF| (proxy for ΔCash): 982000.0


Unnamed: 0,CFO,CFI,CFF,calc_delta_cash
98,-1900000.0,12548000.0,-4801000.0,5847000.0
90,1506000000.0,147000000.0,-828000000.0,825000000.0
304,-665000.0,9881000.0,-1657000.0,7559000.0
100,45612000.0,-19891000.0,-86747000.0,-61026000.0
303,-1514000.0,-858000.0,1365000.0,-1007000.0


### SILVER - Shares Values from Financial Statements in a dataframe

In [2]:
from pathlib import Path
from src.data_prep.silver_transformer.transformer_shares import transform_shares_to_wide

zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/silver/shares/year_quarter=2025Q2/shares_wide.parquet")

shares_wide, shares_unknown = transform_shares_to_wide(zip_path, out_path=out_path, return_unknown=True)
shares_wide.shape, shares_wide.head()


((663, 31),
                      adsh      cik                              name    fy  \
 96   0000950170-25-091151  1000045           OLD MARKET CAPITAL CORP  2024   
 88   0000950170-25-083705  1002047                      NETAPP, INC.  2025   
 305  0001437749-25-017409  1004989                  SPAR GROUP, INC.  2024   
 98   0001005229-25-000149  1005229            COLUMBUS MCKINNON CORP  2024   
 304  0001437749-25-014208  1008586  STREAMLINE HEALTH SOLUTIONS INC.  2024   
 
         filed    period   sic  AntidilutiveExcluded  CommonSharesAuthorized  \
 96   20250630  20250331  6153                   NaN              50000000.0   
 88   20250609  20250430  3572                   NaN             885000000.0   
 305  20250516  20241231  7389                   NaN              47000000.0   
 98   20250528  20250331  3531                   NaN              50000000.0   
 304  20250502  20250131  7373                   NaN              85000000.0   
 
      CommonSharesIssued  ... 

In [3]:
sh = shares_wide.copy()

# 1) Shares must be positive
print("Share columns with negative values (should be ~0):")
share_cols = [c for c in sh.columns if c not in {"adsh","cik","name","fy","filed","period","sic"}]
neg_counts = {c: int((sh[c] < 0).sum()) for c in share_cols if sh[c].dtype != "O"}
{k:v for k,v in neg_counts.items() if v > 0}

# 2) Diluted >= Basic (when both exist)
if {"WASO_Basic","WASO_Diluted"}.issubset(set(share_cols)):
    ratio = (sh["WASO_Diluted"] >= sh["WASO_Basic"]).mean()
    print("Diluted >= Basic (% filings):", f"{ratio:.1%}")

# 3) Compare BS outstanding vs WA shares (not equal, but same order of magnitude)
cands = [c for c in share_cols if "Outstanding" in c or "WASO" in c]
sh[cands].describe().T.head(10)

Share columns with negative values (should be ~0):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CommonSharesOutstanding,644.0,2172613000.0,29241560000.0,100.0,7272570.0,27199110.0,100025800.0,550834300000.0
PreferredSharesOutstanding,373.0,5576391.0,66624190.0,0.0,0.0,0.0,100000.0,1248203000.0
UnitsOutstanding,4.0,34538340.0,65009320.0,46280.3,1199815.0,3047164.0,36385690.0,132012800.0
WASOAdjustmentsDiluted,5.0,3527003000.0,7884126000.0,0.0,0.0,0.0,4458169.0,17630550000.0
WASOBasic,547.0,2176484000.0,30910350000.0,4078.0,6900000.0,25899470.0,98131720.0,540817600000.0
WASOCombinedBasicDiluted,14.0,69625150.0,75433150.0,2978540.0,18251810.0,55441300.0,78307840.0,263337500.0
WASODiluted,547.0,2113382000.0,30895190000.0,0.0,7028380.0,27410230.0,100767500.0,540817600000.0
WASOLimitedPartnership,3.0,73362670.0,56670110.0,19147730.0,43941860.0,68736000.0,100470100.0,132204300.0
WASOProFormaDiluted,1.0,7426401000.0,,7426401000.0,7426401000.0,7426401000.0,7426401000.0,7426401000.0
WarrantsRightsOutstanding,1.0,4899497.0,,4899497.0,4899497.0,4899497.0,4899497.0,4899497.0


### SILVER - Transform metadata dataframe

In [6]:
from pathlib import Path
from src.data_prep.silver_transformer.transformer_metadata import transform_metadata_to_wide

zip_path = Path("../data/raw/2025q2.zip")
out_path = Path("../data/silver/meta/year_quarter=2025Q2/meta.parquet")

meta_df = transform_metadata_to_wide(zip_path, out_path=out_path)
print(meta_df.head())
print(len(meta_df))

                     adsh      cik  \
0    0000950170-25-091151  1000045   
541  0001140361-25-016316  1000177   
644  0001140361-25-015768  1001290   
596  0001558370-25-005643  1001807   
133  0000950170-25-083705  1002047   

                                                  name  form    fy  fp  \
0                              OLD MARKET CAPITAL CORP  10-K  2024  FY   
541                        NORDIC AMERICAN TANKERS LTD  20-F  2024  FY   
644                                      CREDICORP LTD  20-F  2024  FY   
596  PERUSAHAAN PERSEROAN PERSERO PT TELEKOMUNIKASI...  20-F  2024  FY   
133                                       NETAPP, INC.  10-K  2025  FY   

       period     filed   sic industry  
0    20250331  20250630  6153  Unknown  
541  20241231  20250429  4400  Unknown  
644  20241231  20250425  6029  Unknown  
596  20241231  20250428  4813  Unknown  
133  20250430  20250609  3572  Unknown  
1218


### GOLD - Merged IS BS CF for one zip

In [3]:
from pathlib import Path
from src.data_prep.gold_builder.builder_per_zip import build_gold_zip
from src.data_prep.gold_builder.builder_all import build_gold_all
import pandas as pd

# 1) Build Gold for your current ZIP
yq = "2025Q2"
silver_dir = Path("../data/silver")
gold_dir = Path("../data/gold")

gold_z_path = build_gold_zip(yq, silver_dir, gold_dir)
print(gold_z_path)

# 2) Build the multi-year panel after you have more zips
panel_path = build_gold_all(gold_dir, gold_dir / "financials_panel.parquet")
panel = pd.read_parquet(panel_path)
panel.shape, panel.head()

../data/gold/2025Q2_financials.parquet


((1218, 87),
                    adsh      cik  \
 0  0000950170-25-091151  1000045   
 1  0001140361-25-016316  1000177   
 2  0001140361-25-015768  1001290   
 3  0001558370-25-005643  1001807   
 4  0000950170-25-083705  1002047   
 
                                                 name    fy     filed  \
 0                            OLD MARKET CAPITAL CORP  2024  20250630   
 1                        NORDIC AMERICAN TANKERS LTD  2024  20250429   
 2                                      CREDICORP LTD  2024  20250425   
 3  PERUSAHAAN PERSEROAN PERSERO PT TELEKOMUNIKASI...  2024  20250428   
 4                                       NETAPP, INC.  2025  20250609   
 
      period   sic industry source_zip  form  ... InterestPaid  \
 0  20250331  6153  Unknown     2025Q2  10-K  ...      47000.0   
 1  20241231  4400  Unknown     2025Q2  20-F  ...          NaN   
 2  20241231  6029  Unknown     2025Q2  20-F  ...          NaN   
 3  20241231  4813  Unknown     2025Q2  20-F  ...          

### Bronze + Silver + Gold - Run the zip files and add to the existing dataframe

In [4]:
from pathlib import Path
from src.data_prep.silver_transformer.transformer import build_everything_for_zip

yqs = ["2025q1","2024q4", "2024q3", "2024q2", "2024q1", "2023q4", "2023q3"]
raw_dir = Path("../data/raw")
silver_dir = Path("../data/silver")
gold_dir = Path("../data/gold")

# for yq in yqs:
#     print(build_everything_for_zip(yq, raw_dir, silver_dir, gold_dir))

# After multiple zips are built:
from src.data_prep.gold_builder.builder_all import build_gold_all
panel_path = build_gold_all(gold_dir, gold_dir / "financials_panel.parquet")
print(panel_path)

../data/gold/financials_panel.parquet


In [5]:
from pathlib import Path
import pandas as pd

# Path to your gold layer file
gold_file = Path("../data/gold/financials_panel.parquet")

# Check if file exists
if not gold_file.exists():
    print(f"❌ File not found: {gold_file}")
else:
    # Read the Parquet file
    df = pd.read_parquet(gold_file)
    
    # Print basic info
    print("✅ Successfully loaded!")
    print(f"📊 Rows: {df.shape[0]:,}")
    print(f"📈 Columns: {df.shape[1]}")
    print("\n🔍 Preview:")
    print(df.head(5))
    
    # Optional — inspect unique companies and fiscal years
    print("\n🏢 Unique companies (CIKs):", df["cik"].nunique())
    print("📆 Fiscal years covered:", sorted(df["fy"].dropna().unique())[:10], "...")


✅ Successfully loaded!
📊 Rows: 13,706
📈 Columns: 112

🔍 Preview:
                   adsh      cik                         name    fy     filed  \
0  0000950170-24-079650  1000045       NICHOLAS FINANCIAL INC  2023  20240701   
1  0000950170-25-091151  1000045      OLD MARKET CAPITAL CORP  2024  20250630   
2  0001140361-24-022947  1000177  NORDIC AMERICAN TANKERS LTD  2023  20240429   
3  0001140361-25-016316  1000177  NORDIC AMERICAN TANKERS LTD  2024  20250429   
4  0001104659-24-029038  1000184                       SAP SE  2023  20240229   

     period   sic industry source_zip  form  ... bs_balanced_flag  \
0  20240331  6153  Unknown     2024q3  10-K  ...            False   
1  20250331  6153  Unknown     2025q2  10-K  ...            False   
2  20231231  4400  Unknown     2024q2  20-F  ...            False   
3  20241231  4400  Unknown     2025q2  20-F  ...            False   
4  20231231  7372  Unknown     2024q1  20-F  ...            False   

   cf_delta_abs  cf_balanced_flag

In [2]:
from pathlib import Path
import pandas as pd

# Load the Gold panel
gold_file = Path("../data/gold/financials_panel.parquet")
df = pd.read_parquet(gold_file)

# Basic shape info
print(f"✅ Loaded {df.shape[0]:,} rows and {df.shape[1]} columns")

# 1️⃣ Total unique companies
unique_ciks = df["cik"].nunique()
print(f"🏢 Total unique CIKs (companies): {unique_ciks:,}")

# 2️⃣ Fiscal years per company
years_per_cik = df.groupby("cik")["fy"].nunique()

# 3️⃣ Companies with at least 2 unique fiscal years (≈ 2 years of 10-Ks)
multi_year_ciks = years_per_cik[years_per_cik >= 2]
print(f"📈 Companies with ≥ 2 fiscal years: {len(multi_year_ciks):,}")

# 4️⃣ Optional — inspect a few examples
print("\n🔍 Sample of multi-year companies:")
print(df[df["cik"].isin(multi_year_ciks.index)].sort_values(["cik", "fy"]).head(10))


✅ Loaded 13,706 rows and 112 columns
🏢 Total unique CIKs (companies): 7,358
📈 Companies with ≥ 2 fiscal years: 6,199

🔍 Sample of multi-year companies:
                   adsh      cik                         name    fy     filed  \
0  0000950170-24-079650  1000045       NICHOLAS FINANCIAL INC  2023  20240701   
1  0000950170-25-091151  1000045      OLD MARKET CAPITAL CORP  2024  20250630   
2  0001140361-24-022947  1000177  NORDIC AMERICAN TANKERS LTD  2023  20240429   
3  0001140361-25-016316  1000177  NORDIC AMERICAN TANKERS LTD  2024  20250429   
4  0001104659-24-029038  1000184                       SAP SE  2023  20240229   
5  0001104659-25-017815  1000184                       SAP SE  2024  20250227   
6  0000950170-24-027942  1000209     MEDALLION FINANCIAL CORP  2023  20240307   
7  0000950170-25-038693  1000209     MEDALLION FINANCIAL CORP  2024  20250313   
8  0001000228-24-000011  1000228             HENRY SCHEIN INC  2023  20240228   
9  0001000228-25-000014  1000228      

In [3]:
df.columns.to_list()

['adsh',
 'cik',
 'name',
 'fy',
 'filed',
 'period',
 'sic',
 'industry',
 'source_zip',
 'form',
 'fp',
 'APIC',
 'AccountsPayable',
 'AccountsReceivable',
 'AccumulatedOCI',
 'CashAndCashEquivalents',
 'CommonStockParValue',
 'CommonStockValue',
 'Goodwill',
 'Intangibles',
 'Inventory',
 'LeaseLiabilityCurrent',
 'LeaseLiabilityNoncurrent',
 'LeaseROUAsset',
 'LongTermDebt',
 'NoncontrollingInterest',
 'OtherCurrentLiabilities',
 'OtherNoncurrentLiabilities',
 'PPandE',
 'PreferredStockParValue',
 'PreferredStockValue',
 'RetainedEarnings',
 'ShareholdersEquity',
 'ShortTermDebt',
 'ShortTermInvestments',
 'TemporaryEquity',
 'TotalAssets',
 'TotalCurrentAssets',
 'TotalCurrentLiabilities',
 'TotalLiabilities',
 'TotalNoncurrentAssets',
 'TreasuryStock',
 'ComprehensiveIncome',
 'CostOfRevenue',
 'DebtExtinguishmentGainLoss',
 'DepAmort',
 'EquityMethodIncome',
 'GrossProfit',
 'Impairment',
 'IncomeTaxExpense',
 'InterestExpense',
 'NetIncome',
 'OperatingExpenses',
 'OperatingInc