In [2]:
import zipfile

import pandas as pd

zip_path = "/Users/agalyaayyadurai/Documents/Dissertation/Automated-Equity-Valuation/data/raw/2025q2.zip"
folder_path = "/Users/agalyaayyadurai/Documents/Dissertation/Automated-Equity-Valuation/data/raw/"

with zipfile.ZipFile(zip_path, "r") as z:
    for name in z.namelist():
        if name.lower().endswith("sub.txt"):
            z.extract(name, path=folder_path)
            print("Extracted:", folder_path + name)

Extracted: /Users/agalyaayyadurai/Documents/Dissertation/Automated-Equity-Valuation/data/raw/sub.txt


In [12]:
# Reading the sub.txt file with pandas
sub = pd.read_csv(folder_path+"sub.txt", sep = "\t", na_values="\\N", dtype = str, low_memory = False)

# Filtering out companies that file 10-k in 2025Q2
annuals = sub[sub["form"] == "10-K"]
print("Number of 10-K filings:", len(annuals))
print(annuals[["adsh","cik","name","fy","period","filed"]].head(10))

Number of 10-K filings: 624
                    adsh      cik                             name    fy  \
30  0001213900-25-032210  1860484      RELATIVITY ACQUISITION CORP  2024   
37  0001683168-25-002614  1593184                  BIOREGENX, INC.  2024   
47  0001170010-25-000024  1170010                       CARMAX INC  2024   
58  0001096906-25-000593  1848672                 GLIDELOGIC CORP.  2024   
66  0001213900-25-047877  1865506                 ZEO ENERGY CORP.  2024   
67  0000313143-25-000025   313143                 HAEMONETICS CORP  2024   
68  0001477932-25-002710  1132509               NEW MOMENTUM CORP.  2024   
69  0001641172-25-017134  1053369   ELITE PHARMACEUTICALS INC /NV/  2024   
70  0001641172-25-002917  1847345  ASPIRE BIOPHARMA HOLDINGS, INC.  2024   
78  0000353184-25-000044   353184                        AIR T INC  2024   

      period     filed  
30  20241231  20250415  
37  20241231  20250415  
47  20250228  20250411  
58  20250131  20250425  
66  202412

In [15]:
# Exploring the presentation map - pre.txt

pre = pd.read_csv(folder_path+"pre.txt", sep="\t", na_values="\\N", dtype = str, low_memory=False)
print(pre.head())

                   adsh report line stmt inpth rfile  \
0  0000002488-25-000047      2    1   IS     0     H   
1  0000002488-25-000047      2    2   IS     0     H   
2  0000002488-25-000047      2    3   IS     0     H   
3  0000002488-25-000047      2    4   IS     0     H   
4  0000002488-25-000047      2    5   IS     0     H   

                                                 tag               version  \
0  RevenueFromContractWithCustomerExcludingAssess...          us-gaap/2025   
1  CostOfGoodsAndServiceExcludingDepreciationDepl...          us-gaap/2025   
2    AmortizationOfAcquisitionRelatedIntangiblesCOGS  0000002488-25-000047   
3                         CostOfGoodsAndServicesSold          us-gaap/2025   
4                                        GrossProfit          us-gaap/2025   

                                            plabel negating  
0                                      Net revenue        0  
1                                    Cost of sales        0  
2  Amort

In [18]:
#Filter pre records for a chosen filing eg. adsh="0001213900-25-032210"

adsh_id = "0001213900-25-032210"
pre_chosen = pre[pre['adsh'] == adsh_id]

# Viewing the distinct statement types
print(pre_chosen['stmt'].unique())

# Balance Sheet - Rows related to BS for the chosen filing
bs = pre_chosen[pre_chosen["stmt"].str.upper().str.startswith("BS")]
print(len(bs))
print(bs[["report", "line", "stmt", "tag"]].head(35))

['BS' 'IS' 'EQ' 'CF']
34
       report line stmt                                                tag
286779      3   11   BS  CashCashEquivalentsRestrictedCashAndRestricted...
286780      3   12   BS                              PrepaidExpenseCurrent
286781      3   13   BS                         OtherReceivablesNetCurrent
286782      3   14   BS                                      AssetsCurrent
286783      3   15   BS                        AssetsHeldInTrustNoncurrent
286784      3   16   BS                                             Assets
286785      3   18   BS                            OtherLiabilitiesCurrent
286786      3   19   BS                          AccruedLiabilitiesCurrent
286787      3   20   BS       SalesAndExciseTaxPayableCurrentAndNoncurrent
286788      3   21   BS                                TaxesPayableCurrent
286789      3   22   BS                                NotesPayableCurrent
286790      3   23   BS                         FranchiseTaxPayableCurrent


In [21]:
# Extract num.txt

num = pd.read_csv(folder_path+"num.txt", sep = "\t", na_values="\\N", dtype = str, low_memory=False)
print(num.head())

                   adsh                                                tag  \
0  0000002488-25-000047                             AccountsPayableCurrent   
1  0000002488-25-000047                                      AssetsCurrent   
2  0000002488-25-000047  CashAndCashEquivalentsAtCarryingValueIncluding...   
3  0000002488-25-000047                        CommitmentsAndContingencies   
4  0000002488-25-000047                            CommonStockSharesIssued   

        version     ddate qtrs     uom segments coreg             value  \
0  us-gaap/2025  20250331    0     USD      NaN   NaN   2206000000.0000   
1  us-gaap/2025  20250331    0     USD      NaN   NaN  21595000000.0000   
2  us-gaap/2025  20240331    0     USD      NaN   NaN   4190000000.0000   
3  us-gaap/2025  20241231    0     USD      NaN   NaN               NaN   
4  us-gaap/2025  20250331    0  shares      NaN   NaN   1681000000.0000   

  footnote  
0      NaN  
1      NaN  
2      NaN  
3      NaN  
4      NaN  


In [25]:
# Filtering the chosen company

num_chosen = num[num['adsh'] == adsh_id]
bs_tags = bs['tag'].unique().tolist()
bs_num_chosen = num_chosen[num_chosen['tag'].isin(bs_tags)]

print(bs_num_chosen[["adsh", "tag", "ddate", "qtrs", "uom", "value"]].head(5))
print(len(bs_num_chosen))


                         adsh                             tag     ddate qtrs  \
1034692  0001213900-25-032210                          Assets  20241231    0   
1034693  0001213900-25-032210     AssetsHeldInTrustNoncurrent  20231231    0   
1034694  0001213900-25-032210     AssetsHeldInTrustNoncurrent  20231231    0   
1034695  0001213900-25-032210    CommonStockSharesOutstanding  20230228    0   
1034703  0001213900-25-032210  PreferredStockSharesAuthorized  20241231    0   

            uom         value  
1034692     USD   803544.0000  
1034693     USD  1746543.0000  
1034694     USD  1746543.0000  
1034695  shares  4400794.0000  
1034703  shares  1000000.0000  
146


## For the chosen filing, extracting the Balance Sheet in one row

In [33]:
import pandas as pd
from pathlib import Path

root = Path("/Users/agalyaayyadurai/Documents/Dissertation/Automated-Equity-Valuation/data/sec/2025q2")
sub = pd.read_csv(root / "sub.txt", sep="\t", na_values="\\N", dtype=str, low_memory=False)
pre = pd.read_csv(root / "pre.txt", sep="\t", na_values="\\N", dtype=str, low_memory=False)
num = pd.read_csv(root / "num.txt", sep="\t", na_values="\\N", dtype=str, low_memory=False)

In [43]:
adsh_id = "0001213900-25-032210"

# current period for the latest BS filing from sub
period = sub.loc[sub["adsh"] == adsh_id, "period"].iloc[0]

# balance sheet tags from pre
bs_tags = pre.loc[(pre["adsh"] == adsh_id) & (pre["stmt"].str.upper().str.startswith("BS")), "tag"].unique()

# extracting numeric facts for the tags
num_bs = num[(num["adsh"] == adsh_id) & (num["tag"].isin(bs_tags))].copy()

# keepign instant facts and current ddate
num_bs = num_bs[(num_bs["qtrs"] == "0") & (num_bs["ddate"] == period)].copy()

# fallback when period is not available
if num_bs.empty:
    latest_ddate = num[(num["adsh"] == adsh_id) & (num["qtrs"] == 0)]["ddate"].max()
    num_bs = num[(num["adsh"] == adsh_id) & (num["tag"].isin(bs_tags)) & (num["qtrs"] == 0) & (num["ddate"] == latest_ddate)].copy()
    
# keeping monetary usd rows only (no shares yet)
num_bs_money = num_bs[num_bs["uom"].str.lower() == "usd"].copy()

# dropping duplicate tags (foolproof)
num_bs_money = num_bs_money.drop_duplicates(subset=["tag"], keep = "first")

# one row with tags as columns
wide_bs = num_bs_money.pivot_table(index=["adsh", "ddate"], columns = "tag", values = "value", aggfunc = "first").reset_index()

# making values numeric
for col in wide_bs.columns:
    if col not in ["adsh", "ddate"]:
        wide_bs[col] = pd.to_numeric(wide_bs[col], errors = "coerce")

# attaching company metadata
meta_cols = [c for c in ["adsh","cik","name","fy","filed"] if c in sub.columns]
wide_bs = wide_bs.merge(sub.loc[sub["adsh"] == adsh_id, meta_cols].drop_duplicates(), on = "adsh", how = "left")

shares = num_bs[(num_bs["qtrs"] == "0") & (num_bs["uom"].str.lower() == "shares")].copy()
shares_cur = shares[shares["ddate"] == period]
shares_wide = shares_cur.pivot_table(index=["adsh","ddate"], columns="tag", values="value", aggfunc="first").reset_index()
# merge into the monetary wide table:
wide_bs = wide_bs.merge(shares_wide, on=["adsh","ddate"], how="left", suffixes=("", "_shares"))

print(wide_bs.shape)
print(wide_bs)

(1, 34)
                   adsh     ddate  AccruedLiabilitiesCurrent    Assets  \
0  0001213900-25-032210  20241231                  1998193.0  803544.0   

   AssetsCurrent  AssetsHeldInTrustNoncurrent  \
0        34277.0                     769267.0   

   CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents  \
0                                             1674.0               

   CommonStockParOrStatedValuePerShare  CommonStockValue  \
0                               0.0001             424.0   

   FranchiseTaxPayableCurrent  ...      cik                         name  \
0                      8600.0  ...  1860484  RELATIVITY ACQUISITION CORP   

     fy     filed  CommonStockSharesAuthorized  CommonStockSharesIssued  \
0  2024  20250415               100000000.0000             4247499.0000   

   CommonStockSharesOutstanding  PreferredStockSharesAuthorized  \
0                  4247499.0000                    1000000.0000   

   TemporaryEquitySharesAuthorized  TemporaryEq

## Extracting IS for the chosen filing

In [45]:
# 2️⃣ all tags that belong to the Income Statement (stmt starts with "IS")
is_tags = pre.loc[
    (pre["adsh"] == adsh_id) & (pre["stmt"].str.upper().str.startswith("IS")),
    "tag"
].unique()

# 3️⃣ filter num.txt to this filing + these tags
num_is = num[(num["adsh"] == adsh_id) & (num["tag"].isin(is_tags))].copy()

# 4️⃣ keep "duration" rows (qtrs == "4") and matching period
num_is = num_is[(num_is["qtrs"] == "4") & (num_is["ddate"] == period) & (num_is["uom"].str.lower() == "usd")].copy()

# (fallback: if no qtrs==4 exists, pick the latest date available for duration)
if num_is.empty:
    latest_ddate = num[(num["adsh"] == adsh_id) & (num["qtrs"] == "4")]["ddate"].max()
    num_is = num[(num["adsh"] == adsh_id) & (num["tag"].isin(is_tags)) & (num["qtrs"] == "4") & (num["ddate"] == latest_ddate)].copy()

# 5️⃣ remove duplicates per tag
num_is = num_is.drop_duplicates(subset=["tag"], keep="first")

# 6️⃣ pivot to wide format
wide_is = num_is.pivot_table(index=["adsh","ddate"], columns="tag", values="value", aggfunc="first").reset_index()

# 7️⃣ convert numeric columns
for col in wide_is.columns:
    if col not in ["adsh","ddate"]:
        wide_is[col] = pd.to_numeric(wide_is[col], errors="coerce")

# 8️⃣ add metadata
meta_cols = [c for c in ["adsh","cik","name","fy","filed"] if c in sub.columns]
wide_is = wide_is.merge(
    sub.loc[sub["adsh"] == adsh_id, meta_cols].drop_duplicates(),
    on="adsh",
    how="left"
)

print(wide_is.shape)
print(wide_is)  # see first 20 columns

(1, 16)
                   adsh     ddate  EarningsPerShareBasic  \
0  0001213900-25-032210  20241231                   -0.1   

   EarningsPerShareDiluted  FairValueAdjustmentOfWarrants  \
0                     -0.1                        15780.0   

   GainsLossesOnExtinguishmentOfDebt  GeneralAndAdministrativeExpense  \
0                           360114.0                         741798.0   

   IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest  \
0                                          -358581.0                                             

   IncomeTaxExpenseBenefit  InvestmentIncomeInterest  \
0                  81983.0                   38883.0   

   NonoperatingIncomeExpense  OperatingIncomeLoss      cik  \
0                   383217.0            -741798.0  1860484   

                          name    fy     filed  
0  RELATIVITY ACQUISITION CORP  2024  20250415  


## Extracting CF for the chosen filing

In [46]:
# 2️⃣ all tags belonging to the Cash Flow Statement
cf_tags = pre.loc[
    (pre["adsh"] == adsh_id) & (pre["stmt"].str.upper().str.startswith("CF")),
    "tag"
].unique()

# 3️⃣ filter num.txt to this filing + these tags
num_cf = num[(num["adsh"] == adsh_id) & (num["tag"].isin(cf_tags))].copy()

# 4️⃣ keep "duration" rows (qtrs == "4") and matching period
num_cf = num_cf[
    (num_cf["qtrs"] == "4") &
    (num_cf["ddate"] == period) &
    (num_cf["uom"].str.lower() == "usd")
].copy()

# fallback: use latest available if none match
if num_cf.empty:
    latest_ddate = num[(num["adsh"] == adsh_id) & (num["qtrs"] == "4")]["ddate"].max()
    num_cf = num[
        (num["adsh"] == adsh_id) &
        (num["tag"].isin(cf_tags)) &
        (num["qtrs"] == "4") &
        (num["ddate"] == latest_ddate)
    ].copy()

# 5️⃣ remove duplicates per tag
num_cf = num_cf.drop_duplicates(subset=["tag"], keep="first")

# 6️⃣ pivot to wide
wide_cf = num_cf.pivot_table(
    index=["adsh","ddate"],
    columns="tag",
    values="value",
    aggfunc="first"
).reset_index()

# 7️⃣ convert numeric columns
for col in wide_cf.columns:
    if col not in ["adsh","ddate"]:
        wide_cf[col] = pd.to_numeric(wide_cf[col], errors="coerce")

# 8️⃣ merge metadata
meta_cols = [c for c in ["adsh","cik","name","fy","filed"] if c in sub.columns]
wide_cf = wide_cf.merge(
    sub.loc[sub["adsh"] == adsh_id, meta_cols].drop_duplicates(),
    on="adsh",
    how="left"
)

print(wide_cf.shape)
print(wide_cf)

(1, 25)
                   adsh     ddate  \
0  0001213900-25-032210  20241231   

   CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect  \
0                                            -5457.0                                                                

   CashWithdrawnFromTrustAccountInConnectionWithRedemption  \
0                                         -1019230.0         

   FairValueAdjustmentOfWarrants  GainsLossesOnExtinguishmentOfDebt  \
0                        15780.0                           360114.0   

   IncomeTaxesPaid  IncreaseDecreaseInAccruedIncomeTaxesPayable  \
0          24970.0                                      76953.0   

   IncreaseDecreaseInAccruedLiabilities  \
0                              402910.0   

   IncreaseDecreaseInDueToRelatedParties  ...  \
0                                27882.0  ...   

   NetCashProvidedByUsedInInvestingActivities  \
0                                  -1022301.