In [None]:
import os
import re
from datetime import datetime
from bs4 import BeautifulSoup
from psycopg2 import pool, extras
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import psycopg2

# Configuration: Set to True to process from the bottom (latest years first)
PROCESS_FROM_BOTTOM = True  # Set to False on the other computer

# Database connection parameters
db_params = {
    'dbname': 'StockSelector',
    'user': 'postgres',
    'password': 'postgres',
    'host': 'localhost',  # Update to server IP if running on different machines
    'port': '5432'
}

# Create a threaded connection pool
conn_pool = pool.ThreadedConnectionPool(1, 20, **db_params)

# Precompile the regex pattern for us-gaap tags
us_gaap_regex = re.compile(r'us-gaap:.*', re.I)

# Path to filings folder
filings_folder = r"C:\Users\willi\Documents\GitHub\Stock-Selector\data\edgar\filings"

def get_fiscal_year(period_element):
    date_formats = ['%Y-%m-%d', '%m/%d/%Y', '%d-%b-%Y']
    for tag in ['instant', 'endDate', 'startDate']:
        date_tag = period_element.find(tag)
        if date_tag:
            date_str = date_tag.text.strip()
            for fmt in date_formats:
                try:
                    return datetime.strptime(date_str, fmt).year
                except ValueError:
                    continue
    return None

def process_file(args):
    cik, year_folder, file_path = args
    conn = None
    try:
        conn = conn_pool.getconn()
        with conn.cursor() as cursor:
            # Skip file if already processed
            cursor.execute("SELECT 1 FROM processed_files WHERE file_path = %s", (file_path,))
            if cursor.fetchone():
                return None

            # Read the file
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                return None

            # Quick check for XBRL content
            if '<context' not in content.lower():
                cursor.execute("INSERT INTO processed_files (file_path) VALUES (%s)", (file_path,))
                conn.commit()
                return None

            soup = BeautifulSoup(content, 'lxml')
            context_elements = soup.find_all('context')
            if not context_elements:
                cursor.execute("INSERT INTO processed_files (file_path) VALUES (%s)", (file_path,))
                conn.commit()
                return None

            # Build a mapping of context id to fiscal year for eligible periods
            context_to_year = {}
            for context in context_elements:
                context_id = context.get('id')
                period = context.find('period')
                if period:
                    fiscal_year = get_fiscal_year(period)
                    if fiscal_year and fiscal_year >= 2009:
                        context_to_year[context_id] = fiscal_year

            if not context_to_year:
                cursor.execute("INSERT INTO processed_files (file_path) VALUES (%s)", (file_path,))
                conn.commit()
                return None

            # Collect fiscal years in the file
            file_fiscal_years = set(context_to_year.values())

            # Check if all fiscal years in the file are already in the database for this CIK
            cursor.execute("SELECT DISTINCT fiscal_year FROM financials WHERE cik = %s", (cik,))
            existing_fy = set(row[0] for row in cursor.fetchall())
            if all(fy in existing_fy for fy in file_fiscal_years):
                cursor.execute("INSERT INTO processed_files (file_path) VALUES (%s)", (file_path,))
                conn.commit()
                return None

            # Extract metrics using the precompiled regex
            metrics = []
            for element in soup.find_all(us_gaap_regex):
                try:
                    value = float(element.text.strip())
                except ValueError:
                    continue
                context_ref = element.get('contextref')
                if context_ref in context_to_year:
                    fiscal_year = context_to_year[context_ref]
                    tag = element.name.lower()
                    metrics.append((cik, fiscal_year, tag, value))

            # Insert metrics if any were found
            if metrics:
                extras.execute_batch(cursor,
                    """
                    INSERT INTO financials (cik, fiscal_year, metric, value)
                    VALUES (%s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                    """,
                    metrics
                )
            # Mark the file as processed
            cursor.execute("INSERT INTO processed_files (file_path) VALUES (%s)", (file_path,))
            conn.commit()
            return len(metrics)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None
    finally:
        if conn:
            conn_pool.putconn(conn)

def create_tables():
    conn = None
    try:
        conn = conn_pool.getconn()
        with conn.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS financials (
                    cik TEXT,
                    fiscal_year INTEGER,
                    metric TEXT,
                    value NUMERIC,
                    PRIMARY KEY (cik, fiscal_year, metric)
                );
                CREATE TABLE IF NOT EXISTS processed_files (
                    file_path TEXT PRIMARY KEY,
                    processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
            """)
            conn.commit()
    finally:
        if conn:
            conn_pool.putconn(conn)

# Create tables
create_tables()

# Build the list of files to process
file_list = []
for cik_folder in os.listdir(filings_folder):
    cik_path = os.path.join(filings_folder, cik_folder)
    if not os.path.isdir(cik_path):
        continue
    cik = cik_folder
    for year_folder in os.listdir(cik_path):
        try:
            year_int = int(year_folder)
        except ValueError:
            continue
        # Only process filings from 2009 to 2015 (inclusive)
        if year_int < 2009 or year_int > 2015:
            continue
        year_path = os.path.join(cik_path, year_folder)
        if not os.path.isdir(year_path):
            continue
        files = [f for f in os.listdir(year_path) if '10-K' in f]
        if files:
            file_path = os.path.join(year_path, files[0])
            file_list.append((cik, year_folder, file_path))

# Sort the file_list by year_folder and then cik
file_list = sorted(file_list, key=lambda x: (x[1], x[0]))  # x[1] is year_folder, x[0] is cik

if PROCESS_FROM_BOTTOM:
    file_list = file_list[::-1]
    print("Processing from the bottom (latest years first)")
else:
    print("Processing from the top (earliest years first)")

total_files = len(file_list)
processed_count = 0
metrics_inserted = 0

# Process files in parallel using a thread pool
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(process_file, args) for args in file_list]
    for future in tqdm(as_completed(futures), total=total_files, desc="Processing 10-K files"):
        result = future.result()
        processed_count += 1
        if result is not None:
            metrics_inserted += result
        if processed_count % 100 == 0:
            print(f"Processed {processed_count}/{total_files} files, inserted {metrics_inserted} metrics")

print(f"Processing complete. Total files processed: {processed_count}, Total metrics inserted: {metrics_inserted}")

Processing 10-K files:   0%|          | 109/46863 [00:01<14:00, 55.65it/s]

Processed 100/46863 files, inserted 0 metrics


Processing 10-K files:   0%|          | 205/46863 [00:04<19:46, 39.34it/s]

Processed 200/46863 files, inserted 0 metrics


Processing 10-K files:   1%|          | 305/46863 [00:06<17:53, 43.35it/s]

Processed 300/46863 files, inserted 0 metrics


Processing 10-K files:   1%|          | 409/46863 [00:08<13:12, 58.65it/s]

Processed 400/46863 files, inserted 0 metrics


Processing 10-K files:   1%|          | 507/46863 [00:10<14:20, 53.85it/s]

Processed 500/46863 files, inserted 0 metrics


Processing 10-K files:   1%|▏         | 601/46863 [00:12<15:43, 49.03it/s]

Processed 600/46863 files, inserted 0 metrics


Processing 10-K files:   2%|▏         | 710/46863 [00:14<13:04, 58.85it/s]

Processed 700/46863 files, inserted 0 metrics


Processing 10-K files:   2%|▏         | 801/46863 [00:15<16:08, 47.54it/s]

Processed 800/46863 files, inserted 0 metrics


Processing 10-K files:   2%|▏         | 911/46863 [00:17<15:42, 48.75it/s]

Processed 900/46863 files, inserted 0 metrics


Processing 10-K files:   2%|▏         | 1004/46863 [00:19<12:14, 62.41it/s]

Processed 1000/46863 files, inserted 0 metrics


Processing 10-K files:   2%|▏         | 1111/46863 [00:21<12:24, 61.43it/s]

Processed 1100/46863 files, inserted 0 metrics


Processing 10-K files:   3%|▎         | 1214/46863 [00:23<11:37, 65.41it/s]

Processed 1200/46863 files, inserted 0 metrics


Processing 10-K files:   3%|▎         | 1310/46863 [00:25<12:06, 62.69it/s]

Processed 1300/46863 files, inserted 0 metrics


Processing 10-K files:   3%|▎         | 1415/46863 [00:27<10:01, 75.61it/s]

Processed 1400/46863 files, inserted 0 metrics


Processing 10-K files:   3%|▎         | 1487/46863 [00:28<19:40, 38.43it/s]

Error reading C:\Users\willi\Documents\GitHub\Stock-Selector\data\edgar\filings\1020859\2013\10-K_2013-10-01.txt: 'utf-8' codec can't decode byte 0xa2 in position 6894275: invalid start byte


Processing 10-K files:   3%|▎         | 1507/46863 [00:29<20:30, 36.87it/s]

Processed 1500/46863 files, inserted 0 metrics


Processing 10-K files:   3%|▎         | 1606/46863 [00:31<14:41, 51.37it/s]

Processed 1600/46863 files, inserted 0 metrics


Processing 10-K files:   4%|▎         | 1704/46863 [00:33<19:03, 39.49it/s]

Processed 1700/46863 files, inserted 0 metrics


Processing 10-K files:   4%|▍         | 1805/46863 [00:34<13:50, 54.25it/s]

Processed 1800/46863 files, inserted 0 metrics


Processing 10-K files:   4%|▍         | 1894/46863 [00:36<24:58, 30.00it/s]

Processed 1900/46863 files, inserted 0 metrics


Processing 10-K files:   4%|▍         | 2003/46863 [00:39<19:51, 37.64it/s]

Processed 2000/46863 files, inserted 0 metrics


Processing 10-K files:   4%|▍         | 2105/46863 [00:40<10:54, 68.39it/s]

Processed 2100/46863 files, inserted 0 metrics


Processing 10-K files:   5%|▍         | 2206/46863 [00:42<16:55, 43.98it/s]

Processed 2200/46863 files, inserted 0 metrics


Processing 10-K files:   5%|▍         | 2304/46863 [00:44<17:30, 42.42it/s]

Processed 2300/46863 files, inserted 0 metrics


Processing 10-K files:   5%|▌         | 2406/46863 [00:46<09:31, 77.84it/s]

Processed 2400/46863 files, inserted 0 metrics


Processing 10-K files:   5%|▌         | 2508/46863 [00:47<07:16, 101.53it/s]

Processed 2500/46863 files, inserted 0 metrics


Processing 10-K files:   6%|▌         | 2603/46863 [00:49<10:58, 67.20it/s] 

Processed 2600/46863 files, inserted 0 metrics


Processing 10-K files:   6%|▌         | 2705/46863 [00:51<12:40, 58.04it/s]

Processed 2700/46863 files, inserted 0 metrics


Processing 10-K files:   6%|▌         | 2806/46863 [00:52<11:10, 65.69it/s]

Processed 2800/46863 files, inserted 0 metrics


Processing 10-K files:   6%|▌         | 2906/46863 [00:55<13:32, 54.08it/s]

Processed 2900/46863 files, inserted 0 metrics


Processing 10-K files:   6%|▋         | 3004/46863 [00:57<17:43, 41.22it/s]

Processed 3000/46863 files, inserted 0 metrics


Processing 10-K files:   7%|▋         | 3101/46863 [00:58<10:14, 71.26it/s]

Processed 3100/46863 files, inserted 0 metrics


Processing 10-K files:   7%|▋         | 3205/46863 [01:00<15:22, 47.33it/s]

Processed 3200/46863 files, inserted 0 metrics


Processing 10-K files:   7%|▋         | 3307/46863 [01:02<13:10, 55.13it/s]

Processed 3300/46863 files, inserted 0 metrics


Processing 10-K files:   7%|▋         | 3409/46863 [01:04<16:12, 44.68it/s]

Processed 3400/46863 files, inserted 0 metrics


Processing 10-K files:   8%|▊         | 3523/46863 [01:06<07:53, 91.57it/s]

Processed 3500/46863 files, inserted 0 metrics


Processing 10-K files:   8%|▊         | 3609/46863 [01:07<14:16, 50.52it/s]

Processed 3600/46863 files, inserted 0 metrics


Processing 10-K files:   8%|▊         | 3707/46863 [01:10<16:19, 44.07it/s]

Processed 3700/46863 files, inserted 0 metrics


Processing 10-K files:   8%|▊         | 3796/46863 [01:11<13:34, 52.89it/s]

Processed 3800/46863 files, inserted 0 metrics


Processing 10-K files:   8%|▊         | 3912/46863 [01:13<10:26, 68.59it/s]

Processed 3900/46863 files, inserted 0 metrics


Processing 10-K files:   9%|▊         | 4012/46863 [01:15<12:33, 56.85it/s]

Processed 4000/46863 files, inserted 0 metrics


Processing 10-K files:   9%|▉         | 4106/46863 [01:16<09:04, 78.50it/s]

Processed 4100/46863 files, inserted 0 metrics


Processing 10-K files:   9%|▉         | 4211/46863 [01:18<09:30, 74.72it/s]

Processed 4200/46863 files, inserted 0 metrics


Processing 10-K files:   9%|▉         | 4302/46863 [01:20<13:57, 50.81it/s]

Processed 4300/46863 files, inserted 0 metrics


Processing 10-K files:   9%|▉         | 4401/46863 [01:21<10:37, 66.61it/s]

Processed 4400/46863 files, inserted 0 metrics


Processing 10-K files:  10%|▉         | 4514/46863 [01:24<08:37, 81.82it/s]

Processed 4500/46863 files, inserted 0 metrics


Processing 10-K files:  10%|▉         | 4608/46863 [01:25<11:57, 58.86it/s]

Processed 4600/46863 files, inserted 0 metrics


Processing 10-K files:  10%|█         | 4702/46863 [01:27<12:03, 58.25it/s]

Processed 4700/46863 files, inserted 0 metrics


Processing 10-K files:  10%|█         | 4810/46863 [01:30<10:34, 66.28it/s]

Processed 4800/46863 files, inserted 0 metrics


Processing 10-K files:  10%|█         | 4898/46863 [01:32<15:57, 43.81it/s]

Processed 4900/46863 files, inserted 0 metrics


Processing 10-K files:  11%|█         | 5003/46863 [01:34<13:27, 51.85it/s]

Processed 5000/46863 files, inserted 0 metrics


Processing 10-K files:  11%|█         | 5111/46863 [01:36<12:52, 54.05it/s]

Processed 5100/46863 files, inserted 0 metrics


Processing 10-K files:  11%|█         | 5213/46863 [01:37<11:28, 60.50it/s]

Processed 5200/46863 files, inserted 0 metrics


Processing 10-K files:  11%|█▏        | 5304/46863 [01:39<12:19, 56.22it/s]

Processed 5300/46863 files, inserted 0 metrics


Processing 10-K files:  12%|█▏        | 5411/46863 [01:41<10:57, 63.05it/s]

Processed 5400/46863 files, inserted 0 metrics


Processing 10-K files:  12%|█▏        | 5506/46863 [01:43<11:36, 59.42it/s]

Processed 5500/46863 files, inserted 0 metrics


Processing 10-K files:  12%|█▏        | 5606/46863 [01:44<08:51, 77.56it/s] 

Processed 5600/46863 files, inserted 0 metrics


Processing 10-K files:  12%|█▏        | 5701/46863 [01:46<19:36, 34.97it/s]

Processed 5700/46863 files, inserted 0 metrics


Processing 10-K files:  12%|█▏        | 5806/46863 [01:49<13:05, 52.27it/s]

Processed 5800/46863 files, inserted 0 metrics


Processing 10-K files:  13%|█▎        | 5904/46863 [01:51<15:07, 45.15it/s]

Processed 5900/46863 files, inserted 0 metrics


Processing 10-K files:  13%|█▎        | 6004/46863 [01:53<14:35, 46.69it/s]

Processed 6000/46863 files, inserted 0 metrics


Processing 10-K files:  13%|█▎        | 6095/46863 [01:55<12:15, 55.39it/s]

Processed 6100/46863 files, inserted 0 metrics


Processing 10-K files:  13%|█▎        | 6206/46863 [01:57<16:45, 40.45it/s] 

Processed 6200/46863 files, inserted 0 metrics


Processing 10-K files:  13%|█▎        | 6302/46863 [23:39<99:04:27,  8.79s/it] 

Processed 6300/46863 files, inserted 5047 metrics


Processing 10-K files:  14%|█▎        | 6400/46863 [43:54<259:23:22, 23.08s/it]

Processed 6400/46863 files, inserted 12316 metrics


Processing 10-K files:  14%|█▍        | 6501/46863 [55:34<56:20:44,  5.03s/it] 

Processed 6500/46863 files, inserted 16037 metrics


Processing 10-K files:  14%|█▍        | 6587/46863 [1:01:34<80:26:09,  7.19s/it] 

Error reading C:\Users\willi\Documents\GitHub\Stock-Selector\data\edgar\filings\1098009\2013\10-K_2013-10-02.txt: 'utf-8' codec can't decode byte 0x80 in position 1047743: invalid start byte


Processing 10-K files:  14%|█▍        | 6600/46863 [1:04:59<361:49:00, 32.35s/it]

Processed 6600/46863 files, inserted 20158 metrics


Processing 10-K files:  14%|█▍        | 6700/46863 [1:14:23<80:27:57,  7.21s/it] 

Processed 6700/46863 files, inserted 27931 metrics


Processing 10-K files:  14%|█▍        | 6795/46863 [12:39:18<74:37:23,  6.70s/it]     


In [3]:
import pandas as pd
import os
import shutil

# Step 1: Read the CSV file
# Replace the path with your actual CSV file location
csv_path = 'C:/Users/willi/Documents/GitHub/Stock-Selector/data/consolidated_stock_list.csv'
df = pd.read_csv(csv_path, dtype={'cik': str})

# Step 2: Filter rows where ipo_date is "Not Found" and get unique CIKs
ciks_to_delete = df[df['ipo_date'] == "Not Found"]['cik'].unique().tolist()

# Step 3: Define the filings directory
filings_dir = 'C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings'

# Step 4: Process all CIK folders
deleted_cik_count = 0
deleted_year_count = 0

for cik_folder in os.listdir(filings_dir):
    folder_path = os.path.join(filings_dir, cik_folder)
    if not os.path.isdir(folder_path):
        continue

    cik = cik_folder

    # Delete entire CIK folder if it has no IPO date
    if cik in ciks_to_delete:
        try:
            shutil.rmtree(folder_path)
            deleted_cik_count += 1
            print(f"Deleted folder for CIK {cik} (no IPO date)")
        except Exception as e:
            print(f"Error deleting folder for CIK {cik}: {e}")
        continue  # Move to the next CIK folder

    # For CIKs with an IPO date, check year folders
    for year_folder in os.listdir(folder_path):
        year_path = os.path.join(folder_path, year_folder)
        if not os.path.isdir(year_path):
            continue

        try:
            year = int(year_folder)
            if year < 2010:
                shutil.rmtree(year_path)
                deleted_year_count += 1
                print(f"Deleted year folder {year_path} (year < 2010)")
        except ValueError:
            print(f"Skipping non-numeric year folder: {year_folder} in CIK {cik}")
        except Exception as e:
            print(f"Error deleting year folder {year_path}: {e}")

# Step 5: Summary
print(f"Deletion complete. Deleted {deleted_cik_count} CIK folders and {deleted_year_count} year folders.")

Deleted year folder C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings\1000045\2009 (year < 2010)
Deleted year folder C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings\1000180\2009 (year < 2010)
Deleted year folder C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings\1000209\2009 (year < 2010)
Deleted year folder C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings\1000228\2009 (year < 2010)
Deleted year folder C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings\1000229\2009 (year < 2010)
Deleted year folder C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings\1000230\2009 (year < 2010)
Deleted year folder C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings\1000232\2009 (year < 2010)
Deleted year folder C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings\1000234\2009 (year < 2010)
Deleted year folder C:/Users/willi/Documents/GitHub/Stock-Selector/data/edgar/filings\1000278\20