In [7]:
import polars as pl

import os
import json

In [8]:
# Root folder for all inputs. It is expected to contain the SEC downloads
# `submission/`, `companyfacts/`, `ticker.txt` and `company_tickers.json`.
data_dir = "data"

# Every other location is derived from data_dir; the names on the left line up
# one-to-one with the leaf names on the right.
(
    facts_dir,
    stock_data_dir,
    company_data_dir,
    submissions_dir,
    company_facts_dir,
    csv_file_path,
) = (
    os.path.join(data_dir, leaf_name)
    for leaf_name in (
        "facts",
        "stock_data",
        "company_data",
        "submission",
        "companyfacts",
        "CIK.csv",
    )
)

In [24]:
# Build CIK.csv (ticker, zero-padded 10-digit CIK, company title) the first
# time the notebook runs; later runs reuse the file as a cache.
# NOTE: a CIK.csv produced by an earlier version of this cell is kept as-is.
# Delete the file to force a rebuild with the corrected de-duplication below.
if not os.path.exists(csv_file_path):
    # Explicit string dtypes: an empty record list still yields a frame that
    # can be concatenated, and CIKs keep their leading zeros.
    cik_schema = {"ticker": pl.Utf8, "cik_str": pl.Utf8, "title": pl.Utf8}

    # Both SEC input files sit directly inside data_dir (previously this used
    # `main_dir`, which is not defined anywhere in the notebook).
    company_tickers_file_path = os.path.join(data_dir, "company_tickers.json")

    if not os.path.exists(company_tickers_file_path):
        raise FileNotFoundError("Error: company_tickers.json not found. Cannot proceed.")

    print("company_tickers.json found. Processing...")

    with open(company_tickers_file_path, "r", encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    # One row per JSON entry; the CIK is left-padded to 10 characters.
    json_records = [
        [
            entry.get("ticker"),
            str(entry.get("cik_str")).zfill(10),
            entry.get("title", ""),
        ]
        for entry in json_data.values()
    ]

    df_json = pl.DataFrame(json_records, schema=cik_schema, orient="row")
    df_json.write_csv(csv_file_path)
    print("CIK.csv created from company_tickers.json.")

    ticker_file_path = os.path.join(data_dir, "ticker.txt")

    if os.path.exists(ticker_file_path):
        print("ticker.txt found. Processing and merging...")

        # Each usable line is "<ticker> <cik>"; blank or malformed lines are
        # skipped. ticker.txt carries no company title.
        txt_data = []
        with open(ticker_file_path, "r", encoding="utf-8") as txt_file:
            for line in txt_file:
                parts = line.split()
                if len(parts) == 2:
                    txt_data.append([parts[0], parts[1].zfill(10), ""])

        df_txt = pl.DataFrame(txt_data, schema=cik_schema, orient="row")

        # Merge against the in-memory JSON frame instead of re-reading the CSV:
        # re-reading without a forced string dtype can turn the CIK column
        # numeric and lose the zero padding, so those rows would never match
        # the padded CIKs from ticker.txt.
        # Tickers are compared case-insensitively through a temporary key so
        # the same ticker in different letter case collapses into one row;
        # sorting titles in descending order puts the titled row first.
        df_combined = (
            pl.concat([df_json, df_txt])
            .with_columns(pl.col("ticker").str.to_uppercase().alias("_ticker_key"))
            .sort(["_ticker_key", "title"], descending=[False, True])
            .unique(subset=["_ticker_key", "cik_str"], keep="first", maintain_order=True)
            .drop("_ticker_key")
        )

        # Save back to CSV
        df_combined.write_csv(csv_file_path)
        print("ticker.txt data merged successfully into CIK.csv!")
    else:
        print("ticker.txt not found. Skipping TXT processing.")
else:
    print("cik.csv found.")



cik.csv found.


In [25]:
# Load the ticker/CIK lookup table from disk.
cik_df = pl.read_csv(csv_file_path)

# Normalise CIKs to 10-character, zero-padded strings. The cast comes first
# because the CSV reader may have inferred a numeric dtype for this column.
cik_df = cik_df.with_columns(pl.col("cik_str").cast(pl.Utf8).str.zfill(10))

# Quick sanity check: overall size plus the first 20 rows.
print(f"Total df shape: {cik_df.shape}")
print(cik_df.head(20))

Total df shape: (22122, 3)
shape: (20, 3)
┌────────┬────────────┬─────────────────────────────────┐
│ ticker ┆ cik_str    ┆ title                           │
│ ---    ┆ ---        ┆ ---                             │
│ str    ┆ str        ┆ str                             │
╞════════╪════════════╪═════════════════════════════════╡
│ FBK    ┆ 0001649749 ┆ FB Financial Corp               │
│ GD     ┆ 0000040533 ┆ GENERAL DYNAMICS CORP           │
│ aem    ┆ 0000002809 ┆                                 │
│ PMFAX  ┆ 0001723701 ┆ PIMCO Flexible Municipal Incom… │
│ bgry   ┆ 0001824734 ┆                                 │
│ …      ┆ …          ┆ …                               │
│ kkoyf  ┆ 0001436794 ┆                                 │
│ BRAG   ┆ 0001867834 ┆ Bragg Gaming Group Inc.         │
│ TOI    ┆ 0001799191 ┆ Oncology Institute, Inc.        │
│ fix    ┆ 0001035983 ┆                                 │
│ BDCZ   ┆ 0001114446 ┆ UBS AG                          │
└────────┴────────────┴───────