In [None]:
import os
import pandas as pd

# Build data/filings_list.csv: one row per 10-K entry found in the locally
# stored EDGAR quarterly master.idx files under idx_dir/<year>/<QTRn>/.

# Prefix prepended to the file path stored in each master.idx row. The
# original value was a whitespace-only string, which produced unusable URLs;
# the download log in this notebook shows filings being fetched from
# https://www.sec.gov/Archives/edgar/data/..., so that prefix is restored here.
base_url = "https://www.sec.gov/Archives/"
idx_dir = "data/edgar"  # Replace with your directory
filings_list = []

header_line_count = 11  # Leading lines of every master.idx that are skipped as header
wanted_forms = ['10-K']
filing_columns = ['CIK', 'Company', 'Form', 'Date', 'URL']
output_csv = 'data/filings_list.csv'


def parse_idx_line(line, url_prefix=base_url, forms=wanted_forms):
    """Turn one pipe-delimited master.idx line into a filing record.

    Args:
        line: Raw line from the index file (trailing newline allowed).
        url_prefix: String prepended to the fifth field to form the URL.
        forms: Form types to keep; any other form type is rejected.

    Returns:
        A dict with keys 'CIK', 'Company', 'Form', 'Date', 'URL', or None when
        the line has fewer than five '|'-separated fields or its form type
        (third field) is not in ``forms``.
    """
    parts = line.strip().split('|')
    if len(parts) < 5 or parts[2] not in forms:
        return None
    return {
        'CIK': parts[0],
        'Company': parts[1],
        'Form': parts[2],
        'Date': parts[3],
        'URL': url_prefix + parts[4],
    }


for year in range(1994, 2026):
    for qtr in ["QTR1", "QTR2", "QTR3", "QTR4"]:
        idx_file = f"{idx_dir}/{year}/{qtr}/master.idx"
        if not os.path.exists(idx_file):
            continue  # Quarter not downloaded (or not published yet)
        # NOTE(review): latin-1 is chosen because it can decode any byte
        # sequence, so a stray non-UTF-8 byte in a company name cannot abort
        # the scan -- confirm against the actual encoding of the index files.
        with open(idx_file, 'r', encoding='latin-1') as f:
            for line_no, line in enumerate(f):
                if line_no < header_line_count:
                    continue  # Skip header
                record = parse_idx_line(line)
                if record is not None:
                    filings_list.append(record)

# Explicit columns keep the CSV header stable even when no filing matched.
filings_df = pd.DataFrame(filings_list, columns=filing_columns)
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
filings_df.to_csv(output_csv, index=False)

In [None]:
import requests
import time
import os
from tqdm import tqdm

# Download every filing listed in data/filings_list.csv into
# data/edgar/filings/<CIK>/<year>/<Form>_<Date>.txt, skipping files that are
# already present so the cell can be re-run to resume an interrupted download.

# This cell reloads the CSV itself, so it imports pandas rather than relying
# on an earlier cell having done so.
import pandas as pd

filings_df = pd.read_csv('data/filings_list.csv')
download_dir = "data/edgar/filings"
headers = {'User-Agent': 'WilliamFrank william_dieter@hotmail.com'}

request_timeout = 30        # Seconds before a stalled request is abandoned
max_attempts = 4            # Total tries per URL (first try + retries)
retry_backoff = 2.0         # Base delay in seconds; doubled after each failed try
retry_statuses = {429, 500, 502, 503, 504}  # Statuses treated as transient

total_files = len(filings_df)


def fetch_filing(session, url):
    """Fetch ``url``, retrying transient failures with exponential backoff.

    Args:
        session: A ``requests.Session`` used to issue the GET request.
        url: Address of the filing to download.

    Returns:
        ``(content, None)`` with the response body as bytes on HTTP 200, or
        ``(None, reason)`` where ``reason`` is ``"Status: <code>"`` or
        ``"Error: <exception>"`` describing the last failed attempt.
    """
    reason = "Error: no attempt made"
    for attempt in range(max_attempts):
        try:
            response = session.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # Connection resets / timeouts must not abort a multi-day run.
            reason = f"Error: {exc}"
        else:
            if response.status_code == 200:
                return response.content, None
            reason = f"Status: {response.status_code}"
            if response.status_code not in retry_statuses:
                break  # e.g. 403/404 will not succeed on retry
        if attempt < max_attempts - 1:
            time.sleep(retry_backoff * (2 ** attempt))
    return None, reason


# Wrap iterrows() with tqdm for progress bar; one Session reuses connections.
with requests.Session() as session, \
        tqdm(filings_df.iterrows(), total=total_files, desc="Downloading Filings") as pbar:
    for index, row in pbar:
        cik, form, date, url = row['CIK'], row['Form'], row['Date'], row['URL']
        year = date[:4]
        save_dir = f"{download_dir}/{cik}/{year}"
        os.makedirs(save_dir, exist_ok=True)
        file_path = f"{save_dir}/{form}_{date}.txt"

        # Check if file already exists and is not empty
        if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
            # Only print skips every 100 files to reduce spam
            if index % 100 == 0:
                pbar.write(f"Skipped {file_path} - already exists")
            continue

        # Download the file
        content, failure = fetch_filing(session, url)
        if failure is None:
            # Write to a temporary name and rename, so an interrupted write
            # never leaves a truncated file that the check above would later
            # mistake for a finished download.
            partial_path = f"{file_path}.part"
            with open(partial_path, 'wb') as f:
                f.write(content)
            os.replace(partial_path, file_path)
            # Update tqdm description with current file (optional, less spammy)
            pbar.set_description(f"Downloading {form}_{date}")
        else:
            # Always print failures for debugging
            pbar.write(f"Failed to download {url} - {failure}")

        time.sleep(0.1)  # Rate limit delay

Downloading 10-K_1995-03-27:   1%|          | 2465/228416 [32:46<68:02:47,  1.08s/it]

Failed to download https://www.sec.gov/Archives/edgar/data/66904/0000092122-95-000038.txt - Status: 503


Downloading 10-K_1996-12-23:   3%|▎         | 7797/228416 [1:54:18<59:23:23,  1.03it/s] 

Failed to download https://www.sec.gov/Archives/edgar/data/1005697/0000912057-96-030332.txt - Status: 503


Downloading 10-K_1997-03-31:   5%|▍         | 11098/228416 [2:45:33<51:21:29,  1.18it/s] 

Skipped data/edgar/filings/839947/1997/10-K_1997-03-31.txt - already exists


Downloading 10-K_1998-03-30:   8%|▊         | 17798/228416 [4:27:44<118:06:57,  2.02s/it]

Failed to download https://www.sec.gov/Archives/edgar/data/810830/0000810830-98-000001.txt - Status: 503


Downloading 10-K_1998-02-13:   8%|▊         | 18157/228416 [4:33:14<464:01:04,  7.94s/it]

Failed to download https://www.sec.gov/Archives/edgar/data/846930/0000950132-98-000259.txt - Status: 503


Downloading 10-K_1998-03-11:   8%|▊         | 18330/228416 [4:36:15<591:53:12, 10.14s/it]

Failed to download https://www.sec.gov/Archives/edgar/data/864601/0000892569-98-000878.txt - Status: 503


Downloading 10-K_1998-03-20:   8%|▊         | 19338/228416 [4:51:00<214:21:02,  3.69s/it]

Failed to download https://www.sec.gov/Archives/edgar/data/929900/0000950005-98-000343.txt - Status: 503


Downloading 10-K_1998-03-31:   8%|▊         | 19385/228416 [4:52:04<449:04:39,  7.73s/it]

Failed to download https://www.sec.gov/Archives/edgar/data/933590/0000933590-98-000002.txt - Status: 503


Downloading 10-K_2002-04-01:  20%|██        | 46299/228416 [11:09:17<42:35:04,  1.19it/s]

Skipped data/edgar/filings/81350/2002/10-K_2002-04-01.txt - already exists


Downloading 10-K_2007-03-28:  39%|███▊      | 87990/228416 [21:05:19<32:08:09,  1.21it/s] 

Skipped data/edgar/filings/811785/2007/10-K_2007-03-28.txt - already exists


Downloading 10-K_2008-03-31:  41%|████▏     | 94397/228416 [22:38:37<30:56:56,  1.20it/s]

Skipped data/edgar/filings/1347185/2008/10-K_2008-03-31.txt - already exists


Downloading 10-K_2008-03-31:  43%|████▎     | 97200/228416 [23:18:18<30:37:05,  1.19it/s]

Skipped data/edgar/filings/819975/2008/10-K_2008-03-31.txt - already exists


Downloading 10-K_2009-03-31:  45%|████▌     | 102930/228416 [24:45:23<30:34:24,  1.14it/s]

In [None]:
import os
import json
import pandas as pd

# Build data/cik_ticker_mapping.csv: for every CIK that has a folder under
# data/edgar/filings, look up its ticker in company_tickers_exchange.json and,
# if absent there, in ticker.txt. CIKs found in neither get "Not Found".

# Define file paths (adjust as needed)
project_root = os.getcwd()
json_file = os.path.join(project_root, "data", "edgar", "company_tickers_exchange.json")
txt_file = os.path.join(project_root, "data", "edgar", "ticker.txt")
output_file = os.path.join(project_root, "data", "cik_ticker_mapping.csv")

# Step 1: Load your CIK list (example assumes from a directory; adjust as needed)
filings_dir = os.path.join(project_root, "data", "edgar", "filings")
# Only purely numeric directory names are CIKs; anything else that may sit in
# the folder (e.g. hidden tool directories) would make int() raise below.
cik_folders = [folder for folder in os.listdir(filings_dir)
               if folder.isdecimal() and os.path.isdir(os.path.join(filings_dir, folder))]
cik_list = [f"{int(cik):010d}" for cik in cik_folders]  # Standardize to 10-digit strings
my_ciks_df = pd.DataFrame({"cik": cik_list})

# Step 2: Load JSON file (explicit UTF-8 so the platform default is not used)
with open(json_file, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Step 3: Inspect JSON structure
tickers_list = json_data.get("data", [])
if not tickers_list:
    raise ValueError("No 'data' key found in JSON file or data is empty.")
print("First entry in JSON data:", tickers_list[0])  # Inspect the first entry

# Step 4: Convert JSON data to DataFrame
json_df = pd.DataFrame(tickers_list, columns=["cik", "name", "ticker", "exchange"])

# Standardize CIK to 10-digit strings (works for both int and str values)
json_df["cik"] = json_df["cik"].astype(str).str.zfill(10)

# Step 5: Load TXT file for fallback (tab-separated, no header: ticker, cik)
txt_df = pd.read_csv(txt_file, sep="\t", header=None, names=["ticker", "cik"], dtype=str)
# Rows with no CIK cannot match anything; dropping them also keeps the
# zero-padding from hitting NaN values.
txt_df = txt_df.dropna(subset=["cik"])
txt_df["cik"] = txt_df["cik"].str.zfill(10)

# Step 6: Merge with JSON data (primary source)
merged_json = my_ciks_df.merge(json_df, on="cik", how="left")

# Step 7: Handle CIKs not found in JSON
not_found_ciks = merged_json[merged_json["ticker"].isna()]["cik"].unique()
not_found_df = pd.DataFrame({"cik": not_found_ciks})
merged_txt = not_found_df.merge(txt_df, on="cik", how="left")
merged_txt["name"] = None  # No name available from TXT
merged_txt["exchange"] = None  # No exchange available from TXT

# Step 8: Combine results
found = merged_json.dropna(subset=["ticker"])
final_df = pd.concat([found, merged_txt], ignore_index=True)

# Step 9: Fill missing tickers
final_df["ticker"] = final_df["ticker"].fillna("Not Found")

# Step 10: Order columns
final_df = final_df[["cik", "name", "ticker", "exchange"]]

# Step 11: Save to CSV
final_df.to_csv(output_file, index=False)
print(f"Saved mapping for {len(final_df)} rows to {output_file}")

# Preview the result
print("\nPreview of the mapping:")
print(final_df.head())