In [1]:
import pandas as pd
import glob
import math
import os

# Parameters
MAX_ROWS_PER_FILE = 1_000_000
OUTPUT_DIR = "../../raw_data/scrape_data"
FILE_PREFIX = "patents_foster_"
CENTURY_PREFIX = "19"

# Get all 19th-century files
csv_files_19th_century = glob.glob(os.path.join(OUTPUT_DIR, f"{FILE_PREFIX}{CENTURY_PREFIX}*.csv"))

# Define expected dtypes
dtypes = {
    "year": int,
    "country": str,
    "doc_number": str
}

# Load data
dfs = [pd.read_csv(file, dtype=dtypes) for file in csv_files_19th_century]
combined_df = pd.concat(dfs, ignore_index=True)

total_rows = len(combined_df)
print(f"Total rows: {total_rows}")

if total_rows <= MAX_ROWS_PER_FILE:
    # Save as single file
    out_name = os.path.join(OUTPUT_DIR, f"{FILE_PREFIX}1800-1899.csv")
    combined_df.to_csv(out_name, index=False)
    print(f"Combined file created: {out_name}")
else:
    # Split into equal chunks
    num_parts = math.ceil(total_rows / MAX_ROWS_PER_FILE)
    chunk_size = math.ceil(total_rows / num_parts)
    print(f"Splitting into {num_parts} files of ~{chunk_size} rows each")

    years = sorted(combined_df["year"].unique())
    min_year = min(years)
    max_year = max(years)
    years_per_chunk = math.ceil((max_year - min_year + 1) / num_parts)

    for i in range(num_parts):
        start_year = min_year + i * years_per_chunk
        end_year = min(start_year + years_per_chunk - 1, max_year)

        chunk_df = combined_df[(combined_df["year"] >= start_year) & (combined_df["year"] <= end_year)]

        out_name = os.path.join(OUTPUT_DIR, f"{FILE_PREFIX}{start_year}-{end_year}.csv")
        chunk_df.to_csv(out_name, index=False)
        print(f"Chunk {i+1}: {len(chunk_df)} rows → {out_name}")


Total rows: 2574380
Splitting into 3 files of ~858127 rows each
Chunk 1: 733014 rows → ../../raw_data/scrape_data\patents_foster_1900-1909.csv
Chunk 2: 824009 rows → ../../raw_data/scrape_data\patents_foster_1910-1919.csv
Chunk 3: 1017357 rows → ../../raw_data/scrape_data\patents_foster_1920-1928.csv
