In [None]:
# Cell 1
# !pip install --upgrade polars
# !pip install pyspark
!pip install "git+https://github.com/Erew42/NLP_Thesis.git@main"
# !pip install --upgrade "git+https://github.com/Erew42/NLP_Thesis.git@main"

Collecting git+https://github.com/Erew42/NLP_Thesis.git@main
  Cloning https://github.com/Erew42/NLP_Thesis.git (to revision main) to /tmp/pip-req-build-jsesgxgq
  Running command git clone --filter=blob:none --quiet https://github.com/Erew42/NLP_Thesis.git /tmp/pip-req-build-jsesgxgq
  Resolved https://github.com/Erew42/NLP_Thesis.git to commit 2f7cb1bc107bfe23b0c94918dbf7728bcf2065cf
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: thesis_pkg
  Building wheel for thesis_pkg (pyproject.toml) ... [?25l[?25hdone
  Created wheel for thesis_pkg: filename=thesis_pkg-0.1.0-py3-none-any.whl size=33869 sha256=c4aa67e1f0b2de5dcfb7a1bc0b6b1dc52371d6e15ef8a148a856507b50456f9e
  Stored in directory: /tmp/pip-ephem-wheel-cache-8az2h5iv/wheels/2e/cf/d9/da9e5ac1f7c998d0e8b0344c05c481da8d5177f232d69b1537
Successfully built thesis_pkg
Insta

In [None]:
# Cell 2

import pandas as pd
import numpy as np
from pathlib import Path
import polars as pl
import gc
from pyspark import sql
import datetime as dt
import os
import polars.selectors as cs

from thesis_pkg.filing_text import (
    process_zip_year_raw_text,
    process_zip_year,
    merge_yearly_batches,
    summarize_year_parquets,
    build_light_metadata_dataset,
    concat_parquets_arrow,
    process_year_dir_extract_items,
)

from thesis_pkg.pipelines.sec_pipeline import (
    compute_no_item_diagnostics,
)

In [None]:

# Cell 4

# --- Set Global Display Options for Polars ---

# Cap printed tables at 50 rows. This is a global polars setting, so it affects
# every table printed after this cell has run.
pl.Config.set_tbl_rows(50)

# Likewise cap printed tables at 50 columns.
pl.Config.set_tbl_cols(50)

In [None]:
from pathlib import Path

# Mount Google Drive when running inside Colab.
try:
    from google.colab import drive
    drive.mount("/content/drive", force_remount=True)
    print("Google Drive mounted successfully.")
except ImportError:
    # google.colab cannot be imported outside Colab, so mounting is skipped.
    # NOTE(review): DRIVE_ROOT below still points at the Colab mount path in this
    # case — confirm that a local run really has the data at that location.
    print("Not in Colab environment. Assuming local file access.")

# Root of the project data on Drive; every persistent input/output path hangs off it.
DRIVE_ROOT = Path("/content/drive/My Drive/Data_LM")

ZIP_DIR = DRIVE_ROOT / "Zip_Files"                         # contains 1993.zip, 1994.zip, ...
BATCH_ROOT = DRIVE_ROOT / "parquet_data"                   # will contain per-year folders with batches
# Note: the three paths below all live inside BATCH_ROOT (parquet_data/).
YEAR_MERGED_DIR = DRIVE_ROOT / "parquet_data" / "_year_merged"
LIGHT_OUT = DRIVE_ROOT / "parquet_data" / "filings_metadata_1993_2024_LIGHT.parquet"
Year_Items_OUT = DRIVE_ROOT / "parquet_data" / "year_items"

# local work dirs (VM disk)
# NOTE: the batching cell later rebinds LOCAL_TMP to /content/_tmp_zip, so the value
# seen by downstream cells depends on which of the two cells ran last.
LOCAL_TMP = Path("/tmp/filings_tmp")
LOCAL_MERGE_WORK = Path("/tmp/_merge_work")
LOCAL_TMP.mkdir(parents=True, exist_ok=True)
LOCAL_MERGE_WORK.mkdir(parents=True, exist_ok=True)


Mounted at /content/drive
Google Drive mounted successfully.


In [None]:
import os
# Quick sanity check of the Drive mount point: report what kind of filesystem
# entry it is and, when it is a directory, what it contains.
mp = "/content/drive"
for label, probe in (
    ("exists:", os.path.exists),
    ("isdir:", os.path.isdir),
    ("islink:", os.path.islink),
):
    print(label, probe(mp))
print("contents:", os.listdir(mp) if os.path.isdir(mp) else None)


exists: True
isdir: True
islink: False
contents: ['.shortcut-targets-by-id', 'MyDrive', 'Othercomputers', '.Trash-0', '.Encrypted']


In [None]:
from tqdm import tqdm
from pathlib import Path

YEARS = range(1993, 2025)
MODE = "parsed"   # "raw" or "parsed"

# Scratch directories on the VM disk.
# NOTE: this rebinds LOCAL_TMP, which the path-setup cell bound to /tmp/filings_tmp.
LOCAL_WORK = Path("/content/_batch_work")
LOCAL_TMP  = Path("/content/_tmp_zip")
for scratch_dir in (LOCAL_WORK, LOCAL_TMP):
    scratch_dir.mkdir(parents=True, exist_ok=True)

# Years that are processed with very small (10-row) batches.
SPECIAL_YEARS = {2012, 2021}

# Keyword arguments shared by both extraction modes.
common_kwargs = {
    "tmp_dir": LOCAL_TMP,
    "local_work_dir": LOCAL_WORK,
    "compression": "zstd",
    "copy_retries": 5,           # bump for Drive
    "copy_sleep": 2.0,
    "validate_on_copy": True,
}

for year in tqdm(YEARS):
    zip_path = ZIP_DIR / f"{year}.zip"
    if not zip_path.exists():
        print(f"[skip] missing zip: {zip_path}")
        continue

    year_out_dir = BATCH_ROOT / str(year)
    year_out_dir.mkdir(parents=True, exist_ok=True)

    # Heuristic resume check: a single existing batch file marks the year as done,
    # even if an earlier run was interrupted part-way. A success sentinel would be safer.
    if any(year_out_dir.glob(f"{year}_batch_*.parquet")):
        print(f"[skip] {year} (batches already exist)")
        continue

    # Row-limit policy
    if year in SPECIAL_YEARS:
        batch_max_rows = 10
    elif MODE == "raw":
        batch_max_rows = 1000
    else:
        batch_max_rows = 2000

    print(f"\n[year] {year} -> {year_out_dir} (MODE={MODE}, batch_max_rows={batch_max_rows})")

    # Pick the extractor for the selected mode; only the parsed mode takes
    # the extra header_search_limit argument.
    if MODE == "raw":
        extract_year, mode_kwargs = process_zip_year_raw_text, {}
    else:
        extract_year, mode_kwargs = process_zip_year, {"header_search_limit": 8000}

    written = extract_year(
        zip_path=zip_path,
        out_dir=year_out_dir,
        batch_max_rows=batch_max_rows,
        batch_max_text_bytes=250 * 1024 * 1024,
        encoding="utf-8",
        **mode_kwargs,
        **common_kwargs,
    )

    print(f"[done] {year}: wrote {len(written)} batch files")


 62%|██████▎   | 20/32 [00:00<00:00, 138.28it/s]

[skip] 1993 (batches already exist)
[skip] 1994 (batches already exist)
[skip] 1995 (batches already exist)
[skip] 1996 (batches already exist)
[skip] 1997 (batches already exist)
[skip] 1998 (batches already exist)
[skip] 1999 (batches already exist)
[skip] 2000 (batches already exist)
[skip] 2001 (batches already exist)
[skip] 2002 (batches already exist)
[skip] 2003 (batches already exist)
[skip] 2004 (batches already exist)
[skip] 2005 (batches already exist)
[skip] 2006 (batches already exist)
[skip] 2007 (batches already exist)
[skip] 2008 (batches already exist)
[skip] 2009 (batches already exist)
[skip] 2010 (batches already exist)
[skip] 2011 (batches already exist)
[skip] 2012 (batches already exist)
[skip] 2013 (batches already exist)
[skip] 2014 (batches already exist)
[skip] 2015 (batches already exist)
[skip] 2016 (batches already exist)
[skip] 2017 (batches already exist)
[skip] 2018 (batches already exist)
[skip] 2019 (batches already exist)
[skip] 2020 (batches already

100%|██████████| 32/32 [07:49<00:00, 14.67s/it]

[done] 2021: wrote 2932 batch files
[skip] 2022 (batches already exist)
[skip] 2023 (batches already exist)
[skip] 2024 (batches already exist)





In [None]:
from pathlib import Path
import os
import pyarrow.parquet as pq

# Inspect one specific 2012 batch file on Drive.
p = Path("/content/drive/My Drive/Data_LM/parquet_data/2012/2012_batch_0004.parquet")
print("exists:", p.exists())
# p.stat() raises if the file is missing, so the "exists" line above is the last
# output in that case.
print("size:", p.stat().st_size, "bytes")

# Try reading only metadata/footer (often succeeds even if a row group is corrupted)
pf = pq.ParquetFile(p)
print("num_row_groups:", pf.metadata.num_row_groups)
print("schema:", pf.schema_arrow)

import pyarrow as pa
import pyarrow.parquet as pq

def salvage_parquet_rowgroups(src_path, dst_path):
    """Copy the leading readable row groups of a damaged Parquet file to a new file.

    Row groups are read in order. Reading stops at the first row group that
    raises, on the assumption that everything after it is suspect as well
    (typical for a truncated file). The row groups read up to that point are
    concatenated and written to ``dst_path``.

    Args:
        src_path: Path of the Parquet file to salvage (passed to ``pq.ParquetFile``).
        dst_path: Path of the output file (passed to ``pq.write_table``).

    Returns:
        A list of ``(row_group_index, repr(exception))`` tuples. It holds at most
        one entry (the first unreadable row group) and is empty when every row
        group could be read.

    Raises:
        RuntimeError: If no row group could be read, including the case of a
            file with zero row groups. Nothing is written in that case.
    """
    pf = pq.ParquetFile(src_path)
    total_row_groups = pf.metadata.num_row_groups
    good_tables = []
    bad = []

    for i in range(total_row_groups):
        try:
            good_tables.append(pf.read_row_group(i))
        except Exception as e:
            bad.append((i, repr(e)))
            break  # often everything after is suspect if file is truncated

    print(f"readable row groups: {len(good_tables)} / {total_row_groups}")
    if bad:
        print("first bad row group:", bad[0])

    if not good_tables:
        raise RuntimeError("No readable row groups; salvage failed.")

    # Every table comes from the same file and therefore shares one schema, so no
    # schema promotion is needed. The former `promote=True` keyword is deprecated
    # in recent pyarrow releases (replaced by `promote_options`), so it is dropped
    # here to stay compatible with both old and new versions.
    combined = pa.concat_tables(good_tables)
    pq.write_table(combined, dst_path)
    return bad

# Salvage the damaged 2012 batch into a sibling "_salvaged" file; the source file is not modified.
# NOTE(review): the salvaged file sits in the same folder as the damaged original and its
# name also matches the "{year}_batch_*.parquet" pattern used by the batching cell's skip
# check — confirm how merge_yearly_batches treats the pair (the original may need removing).
src = "/content/drive/My Drive/Data_LM/parquet_data/2012/2012_batch_0004.parquet"
dst = "/content/drive/My Drive/Data_LM/parquet_data/2012/2012_batch_0004_salvaged.parquet"
bad_info = salvage_parquet_rowgroups(src, dst)


In [None]:
# Merge the per-year batch files into one Parquet file per year.
# The checkpoint file lives under LOCAL_MERGE_WORK (/tmp/_merge_work), i.e. on the
# VM disk rather than on Drive.
CHECKPOINT = LOCAL_MERGE_WORK / "done_years.json"

# NOTE(review): out_dir (parquet_data/_year_merged) is itself a sub-folder of
# batch_dir (parquet_data/) — presumably merge_yearly_batches only descends into
# YYYY/ folders; confirm.
merged_paths = merge_yearly_batches(
    batch_dir=BATCH_ROOT,                 # <-- points to parquet_data/ which contains YYYY/ folders
    out_dir=YEAR_MERGED_DIR,
    checkpoint_path=CHECKPOINT,
    local_work_dir=LOCAL_MERGE_WORK,
    batch_size=128_000,
    compression="zstd",
    compression_level=1,
    sleep_between_years=1.0,              # optional
    validate_inputs="full",
)

print(f"[done] merged {len(merged_paths)} year files into {YEAR_MERGED_DIR}")


In [None]:
# Same merge call as the previous cell, but with an explicit list of years (2021-2024, as strings).
years = [str(y) for y in range(2021, 2025)]

# Same checkpoint file as the full merge cell.
# NOTE(review): if the checkpoint already lists these years, whether they are merged
# again depends on how merge_yearly_batches uses the checkpoint — confirm.
CHECKPOINT = LOCAL_MERGE_WORK / "done_years.json"

merged_paths = merge_yearly_batches(
    batch_dir=BATCH_ROOT,
    out_dir=YEAR_MERGED_DIR,
    checkpoint_path=CHECKPOINT,
    local_work_dir=LOCAL_MERGE_WORK,
    batch_size=128_000,
    compression="zstd",
    compression_level=1,
    sleep_between_years=1.0,
    validate_inputs="full",
    years= years,  # presumably limits the merge to the listed years — confirm against merge_yearly_batches
)


In [None]:
import polars as pl

# Per-year summary of the merged Parquet files; each entry is read below via the
# keys "year", "rows", "size_bytes", "status" and "path".
summaries = summarize_year_parquets(YEAR_MERGED_DIR)

# Express the file size in MiB and rename the column accordingly.
df_sum = (
    pl.DataFrame(summaries)
    .with_columns(pl.col("size_bytes") / (1024 * 1024))
    .rename({"size_bytes": "size_mb"})
)

print(df_sum.select(["year", "rows", "size_mb", "status"]).sort("year"))

# If you only want OK files:
ok_files = [entry["path"] for entry in summaries if entry["status"] == "OK"]

# Build the text-free ("light") metadata dataset from the OK files only.
out_path = build_light_metadata_dataset(
    parquet_dir=ok_files,     # pass explicit list (or pass YEAR_MERGED_DIR)
    out_path=LIGHT_OUT,
    drop_columns=("full_text",),
    sort_columns=("file_date_filename", "cik"),
    compression="zstd",
)

print(f"\n[done] wrote light metadata to: {out_path}")
print(pl.read_parquet(out_path, n_rows=5))


shape: (32, 4)
┌──────┬───────┬─────────────┬────────┐
│ year ┆ rows  ┆ size_mb     ┆ status │
│ ---  ┆ ---   ┆ ---         ┆ ---    │
│ str  ┆ i64   ┆ f64         ┆ str    │
╞══════╪═══════╪═════════════╪════════╡
│ 1993 ┆ 13    ┆ 0.341699    ┆ OK     │
│ 1994 ┆ 9752  ┆ 306.949292  ┆ OK     │
│ 1995 ┆ 21880 ┆ 520.234069  ┆ OK     │
│ 1996 ┆ 44531 ┆ 1030.849917 ┆ OK     │
│ 1997 ┆ 55484 ┆ 1546.285963 ┆ OK     │
│ 1998 ┆ 55735 ┆ 1705.164625 ┆ OK     │
│ 1999 ┆ 56330 ┆ 1737.03751  ┆ OK     │
│ 2000 ┆ 59419 ┆ 1773.694546 ┆ OK     │
│ 2001 ┆ 56873 ┆ 1728.944741 ┆ OK     │
│ 2002 ┆ 54220 ┆ 1805.7707   ┆ OK     │
│ 2003 ┆ 50399 ┆ 1898.304109 ┆ OK     │
│ 2004 ┆ 49821 ┆ 1964.382998 ┆ OK     │
│ 2005 ┆ 50717 ┆ 2011.29054  ┆ OK     │
│ 2006 ┆ 49032 ┆ 2051.164263 ┆ OK     │
│ 2007 ┆ 47990 ┆ 2076.952863 ┆ OK     │
│ 2008 ┆ 47216 ┆ 2033.592137 ┆ OK     │
│ 2009 ┆ 42359 ┆ 2007.852787 ┆ OK     │
│ 2010 ┆ 40024 ┆ 1935.650404 ┆ OK     │
│ 2011 ┆ 40087 ┆ 1842.375597 ┆ OK     │
│ 2012 ┆ 37498 ┆ 1774.036

In [None]:
# Preview the first five rows of the Parquet file at `out_path` (bound by the light-metadata cell).
print(pl.read_parquet(out_path, n_rows=5))


shape: (5, 17)
┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┐
│ doc ┆ cik ┆ cik ┆ acc ┆ acc ┆ fil ┆ fil ┆ fil ┆ doc ┆ fil ┆ zip ┆ fil ┆ cik ┆ cik ┆ acc ┆ cik ┆ acc │
│ _id ┆ --- ┆ _10 ┆ ess ┆ ess ┆ ing ┆ ing ┆ e_d ┆ ume ┆ ena ┆ _me ┆ ena ┆ _he ┆ s_h ┆ ess ┆ _co ┆ ess │
│ --- ┆ i64 ┆ --- ┆ ion ┆ ion ┆ _da ┆ _da ┆ ate ┆ nt_ ┆ me  ┆ mbe ┆ me_ ┆ ade ┆ ead ┆ ion ┆ nfl ┆ ion │
│ str ┆     ┆ str ┆ _nu ┆ _no ┆ te  ┆ te_ ┆ _fi ┆ typ ┆ --- ┆ r_p ┆ par ┆ r_p ┆ er_ ┆ _he ┆ ict ┆ _co │
│     ┆     ┆     ┆ mbe ┆ das ┆ --- ┆ hea ┆ len ┆ e_f ┆ str ┆ ath ┆ se_ ┆ rim ┆ sec ┆ ade ┆ --- ┆ nfl │
│     ┆     ┆     ┆ r   ┆ h   ┆ dat ┆ der ┆ ame ┆ ile ┆     ┆ --- ┆ ok  ┆ ary ┆ ond ┆ r   ┆ boo ┆ ict │
│     ┆     ┆     ┆ --- ┆ --- ┆ e   ┆ --- ┆ --- ┆ nam ┆     ┆ str ┆ --- ┆ --- ┆ ary ┆ --- ┆ l   ┆ --- │
│     ┆     ┆     ┆ str ┆ str ┆     ┆ str ┆ dat ┆ e   ┆     ┆     ┆ boo ┆ str ┆ --- ┆ str ┆     ┆ boo │
│     ┆     ┆     ┆     ┆     ┆     ┆     ┆ e   ┆

In [None]:
# Years to process, as strings: 1993 through 2024.
years = [str(y) for y in range(1993, 2025)]


# Run item extraction over the merged per-year files and write results to Year_Items_OUT.
# NOTE: LOCAL_TMP is bound in two different cells (/tmp/filings_tmp and /content/_tmp_zip);
# the directory used here depends on which of them ran last.
process_year_dir_extract_items(
    year_dir= YEAR_MERGED_DIR,
    out_dir = Year_Items_OUT,
    years = years,
    tmp_dir = LOCAL_TMP,
    local_work_dir = LOCAL_MERGE_WORK
    )

[items] 1993 filings=13 items=87 items_per_filing=6.69 no_item_filings=0 empty_items=0 -> /content/drive/My Drive/Data_LM/parquet_data/year_items/1993.parquet
[items] 1994 filings=9752 items=55170 items_per_filing=5.66 no_item_filings=505 empty_items=88 -> /content/drive/My Drive/Data_LM/parquet_data/year_items/1994.parquet
[items] 1995 filings=21880 items=117561 items_per_filing=5.37 no_item_filings=1187 empty_items=300 -> /content/drive/My Drive/Data_LM/parquet_data/year_items/1995.parquet
[items] 1996 filings=44531 items=268733 items_per_filing=6.03 no_item_filings=1632 empty_items=543 -> /content/drive/My Drive/Data_LM/parquet_data/year_items/1996.parquet
[items] 1997 filings=55484 items=374818 items_per_filing=6.76 no_item_filings=1571 empty_items=577 -> /content/drive/My Drive/Data_LM/parquet_data/year_items/1997.parquet
[items] 1998 filings=55735 items=395733 items_per_filing=7.10 no_item_filings=1185 empty_items=612 -> /content/drive/My Drive/Data_LM/parquet_data/year_items/199

[PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/1993.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/1994.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/1995.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/1996.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/1997.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/1998.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/1999.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/2000.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/2001.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/2002.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/2003.parquet'),
 PosixPath('/content/drive/My Drive/Data_LM/parquet_data/year_items/2004.par

In [None]:
# Show the item output folder and the "No_Item_Filings" sub-folder derived from it.
# Only the path object is built here; no directory is created by this cell.
print(Year_Items_OUT)
Year_Items_OUT_Stats = Year_Items_OUT / "No_Item_Filings"
print(Year_Items_OUT_Stats)



/content/drive/My Drive/Data_LM/parquet_data/year_items
/content/drive/My Drive/Data_LM/parquet_data/year_items/No_Item_Filings


In [None]:
Year_Items_OUT_Stats = Year_Items_OUT / "No_Item_Filings"

years = [str(y) for y in range(1994, 2025)]

# For every year, build the four per-year file paths, echo them, and hand them
# positionally to compute_no_item_diagnostics.
# NOTE(review): the meaning of the positional arguments (including the trailing True)
# and whether the No_Item_Filings folder must already exist are defined by
# compute_no_item_diagnostics — confirm against its signature.
for year in years:
    parquet_name = f"{year}.parquet"
    csv_name = f"{year}.csv"

    merged_path = YEAR_MERGED_DIR / parquet_name
    items_path = Year_Items_OUT / parquet_name
    stats_parquet_path = Year_Items_OUT_Stats / parquet_name
    stats_csv_path = Year_Items_OUT_Stats / csv_name

    for shown_path in (merged_path, items_path, stats_parquet_path, stats_csv_path):
        print(shown_path)

    compute_no_item_diagnostics(merged_path, items_path, stats_parquet_path, stats_csv_path, True)

/content/drive/My Drive/Data_LM/parquet_data/_year_merged/1994.parquet
/content/drive/My Drive/Data_LM/parquet_data/year_items/1994.parquet
/content/drive/My Drive/Data_LM/parquet_data/year_items/No_Item_Filings/1994.parquet
/content/drive/My Drive/Data_LM/parquet_data/year_items/No_Item_Filings/1994.csv
/content/drive/My Drive/Data_LM/parquet_data/_year_merged/1995.parquet
/content/drive/My Drive/Data_LM/parquet_data/year_items/1995.parquet
/content/drive/My Drive/Data_LM/parquet_data/year_items/No_Item_Filings/1995.parquet
/content/drive/My Drive/Data_LM/parquet_data/year_items/No_Item_Filings/1995.csv
/content/drive/My Drive/Data_LM/parquet_data/_year_merged/1996.parquet
/content/drive/My Drive/Data_LM/parquet_data/year_items/1996.parquet
/content/drive/My Drive/Data_LM/parquet_data/year_items/No_Item_Filings/1996.parquet
/content/drive/My Drive/Data_LM/parquet_data/year_items/No_Item_Filings/1996.csv
/content/drive/My Drive/Data_LM/parquet_data/_year_merged/1997.parquet
/content/dr

In [None]:
# Years to process, as strings: 1994 through 2024 (1993 is left out here).
years = [str(y) for y in range(1994, 2025)]


# Same item-extraction call as the earlier 1993-2024 cell, only with the shorter year list.
# NOTE: LOCAL_TMP is bound in two different cells (/tmp/filings_tmp and /content/_tmp_zip);
# the directory used here depends on which of them ran last.
process_year_dir_extract_items(
    year_dir= YEAR_MERGED_DIR,
    out_dir = Year_Items_OUT,
    years = years,
    tmp_dir = LOCAL_TMP,
    local_work_dir = LOCAL_MERGE_WORK
    )

In [None]:
import os
import re
import zipfile
import shutil
import random
from pathlib import Path

# ----------------------------
# Config
# ----------------------------
drive_zip_folder = "/content/drive/My Drive/Data_LM/Zip_Files/"
sample_root = os.path.join(drive_zip_folder, "sample_data")

years = [*range(1995, 2025)]      # exclude 1993/1994 as requested
seed = 123                        # reproducibility
pct_0p5 = 0.005                   # fraction of CIKs in the 0.5% sample
pct_1p0 = 0.01                    # fraction of CIKs in the 1.0% sample

# Output folders (one per sample size), created up front
out_0p5, out_1p0 = (os.path.join(sample_root, sub) for sub in ("pct_0p5", "pct_1p0"))
for out_folder in (out_0p5, out_1p0):
    os.makedirs(out_folder, exist_ok=True)

# ----------------------------
# Robust CIK extraction from filename (NOT file contents)
# Handles:
#  - QTR1/123456_0000-00-0000.txt
#  - QTR1/19950103_10-K_edgar_data_123456_0000-00-0000.txt
#  - other variants as long as they include a CIK token
# ----------------------------
re_edgar = re.compile(r"edgar_data_(\d+)_", re.IGNORECASE)  # ...edgar_data_{CIK}_...
re_prefix_digits = re.compile(r"^(\d{1,12})_")              # {CIK}_... OR {DATE}_...
re_any_digits = re.compile(r"(\d{1,12})")                  # fallback: any digit run

def cik_from_member_name(member_name: str) -> int | None:
    base = os.path.basename(member_name)
    if not base.lower().endswith(".txt"):
        return None

    m = re_edgar.search(base)
    if m:
        return int(m.group(1))

    # If format is cik_acc.txt, the first token before "_" is the CIK.
    # But beware: some formats start with a date (YYYYMMDD). We'll filter those.
    m = re_prefix_digits.match(base)
    if m:
        tok = m.group(1)
        # Heuristic: treat 8-digit tokens in plausible date range as a date, not a CIK
        if len(tok) == 8 and 19900101 <= int(tok) <= 20351231:
            # Try to find a later CIK-like token by looking for the next meaningful digit run
            m2 = re_edgar.search(base)
            if m2:
                return int(m2.group(1))
            # last resort: look for another digit run that is NOT 8-digit date-like
            nums = re_any_digits.findall(base)
            for n in nums:
                if not (len(n) == 8 and 19900101 <= int(n) <= 20351231):
                    # a mild guard: CIKs are typically <= 10 digits, but keep <=12 to be safe
                    return int(n)
            return None
        return int(tok)

    # Fallback: find any digit run that isn't a date-like 8-digit token
    nums = re_any_digits.findall(base)
    for n in nums:
        if not (len(n) == 8 and 19900101 <= int(n) <= 20351231):
            return int(n)

    return None


# ----------------------------
# 1) Build universe of CIKs across all years (1995-2024)
# ----------------------------
all_ciks = set()

for y in years:
    zip_path = os.path.join(drive_zip_folder, f"{y}.zip")
    if not os.path.exists(zip_path):
        print(f"SKIP (missing): {zip_path}")
        continue

    with zipfile.ZipFile(zip_path, "r") as zin:
        # Only regular .txt members are considered; directories and other files are ignored.
        txt_member_names = (
            member.filename
            for member in zin.infolist()
            if not member.is_dir() and member.filename.lower().endswith(".txt")
        )
        parsed_ciks = (cik_from_member_name(name) for name in txt_member_names)
        # Names from which no CIK could be parsed are dropped.
        all_ciks.update(parsed for parsed in parsed_ciks if parsed is not None)

# From here on all_ciks is a sorted list (not a set), which makes sampling reproducible.
all_ciks = sorted(all_ciks)
n = len(all_ciks)
print(f"Unique companies (CIKs) found across {years[0]}-{years[-1]}: {n:,}")

if not n:
    raise RuntimeError("No CIKs found. Check your ZIP contents / filename conventions.")

# ----------------------------
# 2) Draw samples (0.5% and 1%)
#    - 0.5% is a subset of 1% to make your testing incremental.
# ----------------------------
rng = random.Random(seed)
# Sample sizes: rounded share of the universe, but never fewer than one CIK.
k1, k05 = (max(1, int(round(share * n))) for share in (pct_1p0, pct_0p5))

# Draw the 1% sample first, then the 0.5% sample from within it (nested subset).
# The 1% sample is sorted before the second draw so the result does not depend on set order.
sample_1p0 = set(rng.sample(all_ciks, k1))
sample_0p5 = set(rng.sample(sorted(sample_1p0), k05))

print(f"Sample sizes: 1.0% -> {len(sample_1p0):,} CIKs, 0.5% -> {len(sample_0p5):,} CIKs (seed={seed})")

# Persist the sampled CIK lists for reproducibility/auditing: one CIK per line, ascending.
Path(sample_root).mkdir(parents=True, exist_ok=True)
for list_filename, sampled_ciks in (
    (f"sample_ciks_1p0_seed{seed}.txt", sample_1p0),
    (f"sample_ciks_0p5_seed{seed}.txt", sample_0p5),
):
    with open(os.path.join(sample_root, list_filename), "w") as f:
        f.writelines(f"{cik}\n" for cik in sorted(sampled_ciks))


# ----------------------------
# 3) Create year-by-year sample ZIPs, preserving internal structure (e.g., QTR1/...)
#    - No extraction to disk; streaming copy member-by-member.
# ----------------------------
def write_year_sample_zip(year: int, sample_set: set[int], out_dir: str) -> tuple[int, int]:
    """Write ``{out_dir}/{year}.zip`` holding only the sampled companies' filings.

    The source archive is ``{drive_zip_folder}/{year}.zip``. Each ``.txt`` member
    whose file name yields a CIK contained in ``sample_set`` is copied under its
    original member name, so the internal folder layout is kept. Members are
    streamed one at a time; nothing is extracted to disk.

    Args:
        year: Year whose archive is sampled.
        sample_set: CIKs to keep.
        out_dir: Existing folder that receives the sample archive.

    Returns:
        ``(total_txt, kept)``: the number of ``.txt`` members seen in the source
        archive and the number copied. ``(0, 0)`` if the source archive does not
        exist, in which case nothing is written.
    """
    source_zip = os.path.join(drive_zip_folder, f"{year}.zip")
    target_zip = os.path.join(out_dir, f"{year}.zip")

    if not os.path.exists(source_zip):
        return (0, 0)

    # Overwrite if exists (explicit)
    if os.path.exists(target_zip):
        os.remove(target_zip)

    n_txt = 0
    n_kept = 0

    with zipfile.ZipFile(source_zip, "r") as reader:
        with zipfile.ZipFile(target_zip, "w", compression=zipfile.ZIP_DEFLATED) as writer:
            for member in reader.infolist():
                name = member.filename
                is_txt_file = (not member.is_dir()) and name.lower().endswith(".txt")
                if not is_txt_file:
                    continue

                n_txt += 1
                member_cik = cik_from_member_name(name)
                if member_cik is not None and member_cik in sample_set:
                    # Stream-copy in 1 MiB chunks to avoid holding the full file in RAM
                    with reader.open(member, "r") as fin, writer.open(name, "w") as fout:
                        shutil.copyfileobj(fin, fout, length=1024 * 1024)
                    n_kept += 1

    return n_txt, n_kept


# Write the per-year sample archives, first for the 1.0% sample and then for the 0.5% sample.
for banner, sampled_ciks, target_dir in (
    ("\n--- Writing 1.0% sample ZIPs ---", sample_1p0, out_1p0),
    ("\n--- Writing 0.5% sample ZIPs ---", sample_0p5, out_0p5),
):
    print(banner)
    for y in years:
        total_txt, kept = write_year_sample_zip(y, sampled_ciks, target_dir)
        # (0, 0) means the source archive was missing or held no .txt members.
        if total_txt == 0 and kept == 0:
            print(f"{y}: SKIP (missing or empty)")
        else:
            print(f"{y}: kept {kept:,} / {total_txt:,} txt files")

print("\nDONE.")
print(f"1.0% sample ZIPs: {out_1p0}")
print(f"0.5% sample ZIPs: {out_0p5}")
print(f"Sample CIK lists saved in: {sample_root}")

Unique companies (CIKs) found across 1995-2024: 45,697
Sample sizes: 1.0% -> 457 CIKs, 0.5% -> 228 CIKs (seed=123)

--- Writing 1.0% sample ZIPs ---
1995: kept 245 / 21,880 txt files
1996: kept 452 / 44,531 txt files
1997: kept 514 / 55,484 txt files
1998: kept 515 / 55,735 txt files
1999: kept 510 / 56,330 txt files
2000: kept 546 / 59,419 txt files
2001: kept 541 / 56,873 txt files
2002: kept 506 / 54,220 txt files
2003: kept 509 / 50,399 txt files
2004: kept 538 / 49,821 txt files
2005: kept 485 / 50,717 txt files
2006: kept 476 / 49,032 txt files
2007: kept 507 / 47,990 txt files
2008: kept 524 / 47,216 txt files
2009: kept 531 / 42,359 txt files
2010: kept 469 / 40,024 txt files
2011: kept 454 / 40,087 txt files
2012: kept 420 / 37,498 txt files
2013: kept 349 / 34,998 txt files
2014: kept 346 / 33,993 txt files
2015: kept 337 / 32,539 txt files
2016: kept 306 / 30,286 txt files
2017: kept 315 / 28,755 txt files
2018: kept 301 / 28,017 txt files
2019: kept 278 / 27,014 txt files
2