## Data Cleaning and merging tickers, OHLC data

In [1]:
import sys
from pathlib import Path

# The notebook is executed from the /notebooks folder, so the repository root
# is the parent of the current working directory.
root_dir = Path.cwd().parent
print("Root directory:", root_dir)

# Put the repository root on the import path (once) so that project packages
# can be imported from the cells below.
root_dir_str = str(root_dir)
if sys.path.count(root_dir_str) == 0:
    sys.path.append(root_dir_str)

Root directory: /Users/agalyaayyadurai/Automated-Equity-Valuation


In [2]:
import pandas as pd

# Load the financial statements of all the companies from the gold-layer
# parquet panel. The path is relative to the notebook's working directory;
# `df_raw` is the untouched input handed to the cleaning pipeline.
file_path = Path("../data/gold/financials_panel.parquet")
df_raw = pd.read_parquet(file_path)

In [3]:
from src.data_prep.prep_config import PREP_CONFIG
from src.data_prep.data_prep import process_fundamentals

import pickle

# Run the full cleaning pipeline on the raw panel. The call is unpacked into
# the cleaned frame and a `scalers` object (presumably the fitted scalers used
# by the pipeline -- confirm against `process_fundamentals`).
df_clean, scalers = process_fundamentals(df_raw, config=PREP_CONFIG, verbose=True)

# Persist the cleaned panel (gzip-compressed parquet, index not written).
df_clean.to_parquet("../data/financials_clean.parquet", compression="gzip", index=False)

# Save the scalers so they can be reused for future quarters.
# A context manager guarantees the file handle is flushed and closed; the
# previous `pickle.dump(scalers, open(...))` form left the handle open.
# NOTE: only load this pickle back from a trusted location -- unpickling
# untrusted files can execute arbitrary code.
with open("prep_scalers.pkl", "wb") as scalers_file:
    pickle.dump(scalers, scalers_file)

[WARN] Missing expected columns: ['instance']
[INFO] Extra columns present (will be ignored by pipeline): ['ADSOutstanding', 'APIC', 'AccumulatedOCI', 'AcquireIntangibles', 'AntidilutiveExcluded', 'BusinessAcquisitions', 'CommonSharesAuthorized', 'CommonSharesIssued', 'CommonStockParValue', 'CommonStockValue', 'ComprehensiveIncome', 'DebtExtinguishmentGainLoss', 'DepAmort', 'EquityMethodIncome', 'FXEffect', 'Impairment', 'LeaseLiabilityCurrent', 'LeaseLiabilityNoncurrent', 'LeaseROUAsset', 'NoncontrollingInterest', 'OtherIncomeExpense', 'OtherNoncurrentLiabilities', 'PreferredSharesAuthorized', 'PreferredSharesDesignated', 'PreferredSharesIssued', 'PreferredSharesOutstanding', 'PreferredStockParValue', 'PreferredStockValue', 'PretaxIncome', 'ROUAssetNoncash', 'SharesDesignated', 'SharesIssuable', 'SharesReservedForFutureIssuance', 'ShortTermInvestments', 'TemporaryEquity', 'TemporaryEquityShares', 'TotalNoncurrentAssets', 'TreasuryShares', 'TreasuryStock', 'UnitsIssued', 'UnitsOutstand

In [5]:
# Merging the ticker for each CIK

from src.data_prep.cik_ticker import load_sec_cik_ticker_exchange, infer_cik_tickers_from_fsds_zips
from src.data_extract.bronze_extractor.fsds_loader import load_fsds_from_zip   # adjust this import

# Source A: the SEC CIK -> ticker/exchange mapping, read from the local JSON file.
sec_map = load_sec_cik_ticker_exchange("../data/company_tickers_exchange.json")

# Source B: tickers inferred from the FSDS zip archives found under the
# raw-data folder (per the original note, from the instance-name prefixes).
# The zip loader is handed over as a callable.
zips_root = "../data/raw"
cik_inf = infer_cik_tickers_from_fsds_zips(
    load_fsds_from_zip=load_fsds_from_zip,
    zips_root=zips_root,
)

# Quick sanity check: first rows of the inferred mapping and its size.
inferred_preview = cik_inf.head()
print(inferred_preview)
print("[INFO] Inferred CIKs:", len(cik_inf))


[INFO] Processing zip: ../data/raw/2025q2.zip
[INFO] Processing zip: ../data/raw/2023q4.zip
[INFO] Processing zip: ../data/raw/2018q2.zip
[INFO] Processing zip: ../data/raw/2018q3.zip
[INFO] Processing zip: ../data/raw/2025q1.zip
[INFO] Processing zip: ../data/raw/2018q1.zip
[INFO] Processing zip: ../data/raw/2021q4.zip
[INFO] Processing zip: ../data/raw/2023q2.zip
[INFO] Processing zip: ../data/raw/2018q4.zip
[INFO] Processing zip: ../data/raw/2021q1.zip
[INFO] Processing zip: ../data/raw/2023q3.zip
[INFO] Processing zip: ../data/raw/2023q1.zip
[INFO] Processing zip: ../data/raw/2021q3.zip
[INFO] Processing zip: ../data/raw/2021q2.zip
[INFO] Processing zip: ../data/raw/2019q1.zip
[INFO] Processing zip: ../data/raw/2024q1.zip
[INFO] Processing zip: ../data/raw/2020q4.zip
[INFO] Processing zip: ../data/raw/2022q4.zip
[INFO] Processing zip: ../data/raw/2019q2.zip
[INFO] Processing zip: ../data/raw/2024q2.zip
[INFO] Processing zip: ../data/raw/2024q3.zip
[INFO] Processing zip: ../data/raw

In [6]:
# Attach a ticker to every row of the cleaned panel: the SEC mapping is
# preferred, and the ticker inferred from the FSDS instances is the fallback.

# Normalise the join key to nullable integers on all three frames so the
# merges compare like with like; values that cannot be parsed become <NA>.
df_clean["cik"] = pd.to_numeric(df_clean["cik"], errors="coerce").astype("Int64")
sec_map["cik"] = pd.to_numeric(sec_map["cik"], errors="coerce").astype("Int64")
cik_inf["cik"] = pd.to_numeric(cik_inf["cik"], errors="coerce").astype("Int64")

# Build one-row-per-CIK lookup tables before merging:
#   * rows with a missing CIK are dropped, because pandas matches null keys to
#     each other and would hand a ticker to panel rows whose CIK is <NA>;
#   * rows with a missing ticker are dropped, since they add nothing to a
#     left join and could otherwise shadow a usable row during de-duplication;
#   * duplicate CIKs are collapsed, because a CIK that appears several times
#     in a mapping would duplicate the matching panel rows.
# NOTE(review): when a CIK has several tickers, the first listed one is kept --
# confirm that the mapping order reflects the desired priority.
sec_lookup = (
    sec_map[["cik", "ticker_sec"]]
    .dropna(subset=["cik", "ticker_sec"])
    .drop_duplicates(subset="cik", keep="first")
)
inferred_lookup = (
    cik_inf[["cik", "ticker_inferred"]]
    .dropna(subset=["cik", "ticker_inferred"])
    .drop_duplicates(subset="cik", keep="first")
)

# 1. Merge SEC mapping (validate guards the one-ticker-per-CIK assumption)
df_with_ticker = df_clean.merge(
    sec_lookup,
    on="cik",
    how="left",
    validate="many_to_one",
)
df_with_ticker = df_with_ticker.rename(columns={"ticker_sec": "ticker"})

# 2. Merge instance-inferred mapping
df_with_ticker = df_with_ticker.merge(
    inferred_lookup,
    on="cik",
    how="left",
    validate="many_to_one",
)

# 3. For rows where SEC ticker is missing, fill from inferred
df_with_ticker["ticker"] = df_with_ticker["ticker"].fillna(df_with_ticker["ticker_inferred"])
df_with_ticker = df_with_ticker.drop(columns=["ticker_inferred"])

# The merges must only add a column, never add or remove panel rows.
assert len(df_with_ticker) == len(df_clean), "ticker merge changed the number of panel rows"

# 4. Check coverage (share of rows with a non-null ticker, in percent)
coverage = df_with_ticker["ticker"].notna().mean() * 100
print(f"[INFO] Combined SEC + instance-inferred match rate: {coverage:.2f}%")

df_with_ticker[["cik", "name", "fy", "period", "ticker"]].head()

[INFO] Combined SEC + instance-inferred match rate: 97.94%


Unnamed: 0,cik,name,fy,period,ticker
0,38074,FOREST LABORATORIES INC,2008,2009-03-31,FRX
1,1002638,OPEN TEXT CORP,2009,2009-06-30,OTEX
2,833444,TYCO INTERNATIONAL LTD /BER/,2009,2009-09-30,JCI
3,1385157,TYCO ELECTRONICS LTD.,2009,2009-09-30,TEL
4,1024478,ROCKWELL AUTOMATION INC,2009,2009-09-30,ROK


In [7]:
# Boolean mask of rows that still have no usable ticker: either a null value
# or the literal string "None". Summing the mask gives the number of such rows.
ticker_col = df_with_ticker["ticker"]
missing_count = ticker_col.isna() | ticker_col.eq("None")
missing_count.sum()

1688

## Prep the data with ticker

In [9]:
# Write the ticker-enriched panel to the gold layer for downstream notebooks.
ticker_panel_path = "../data/gold/financials_ticker_panel.parquet"
df_with_ticker.to_parquet(ticker_panel_path)