In [1]:
import csv 
import os 
import re 
import hashlib

In [2]:
def safe_filename(name: str, max_length: int = 255) -> str:
    """Turn an arbitrary string into a name that is safe to use as a file name.

    Spaces become underscores, ``&`` becomes ``and``, and every remaining
    character outside ``[A-Za-z0-9_.-]`` is replaced by an underscore.  If the
    sanitized result is longer than ``max_length`` it is cut down and the
    first 8 hex digits of the MD5 of the full sanitized name are appended, so
    long names that share a prefix still map to different results.

    Args:
        name: Raw name to sanitize.
        max_length: Maximum length of the returned string (255 by default).
            Pass a smaller value to leave room for an extension that the
            caller appends afterwards.

    Returns:
        The sanitized name, never longer than ``max_length`` characters.

    Raises:
        ValueError: If ``max_length`` leaves no room for the hash suffix.
    """
    # "_" separator plus 8 hex digits of the digest.
    suffix_len = 9
    if max_length <= suffix_len:
        raise ValueError(
            f"max_length must be greater than {suffix_len}, got {max_length}"
        )

    # Replace spaces with underscores
    name = name.replace(" ", "_")
    # Replace & with 'and'
    name = name.replace("&", "and")
    # Replace unsafe characters (anything not alnum, underscore, dash, or dot)
    name = re.sub(r'[^A-Za-z0-9_.\-]', '_', name)

    # If the name is too long, truncate and add a hash suffix.  The hash is
    # used only to disambiguate names, not for any security purpose.
    if len(name) > max_length:
        hash_suffix = hashlib.md5(name.encode("utf-8")).hexdigest()[:8]
        truncated = name[:max_length - suffix_len]
        name = f"{truncated}_{hash_suffix}"

    return name

In [7]:
def chunk_csv_by_pdf(input_file: str, output_dir: str, progress_interval: int = 10000):
    """Split a CSV into one CSV file per distinct ``pdf_name`` value.

    Each row of ``input_file`` is appended to ``<output_dir>/<safe name>.csv``,
    where the safe name is ``safe_filename(row["pdf_name"])``.  Every output
    file starts with the header row of the input file.

    Note: one output handle per distinct file stays open until the whole
    input has been read, so an input with very many distinct ``pdf_name``
    values may run into the process's open-file limit.

    Args:
        input_file: Path of the CSV to split; read as UTF-8 (a leading BOM is
            tolerated) and must contain a ``pdf_name`` column.
        output_dir: Directory that receives the per-PDF files; created if
            it does not exist.
        progress_interval: Print a progress line every this many rows.
            A falsy value (``0`` or ``None``) disables progress output.

    Raises:
        KeyError: If a row has no ``pdf_name`` column.
    """
    os.makedirs(output_dir, exist_ok=True)

    writers = {}
    files = {}
    row_count = 0

    try:
        # newline="" hands raw line endings to the csv module, so newlines
        # embedded in quoted fields are preserved instead of translated.
        with open(input_file, "r", newline="", encoding="utf-8-sig") as infile:
            reader = csv.DictReader(infile)
            headers = reader.fieldnames

            for row in reader:
                row_count += 1
                safe_name = safe_filename(row["pdf_name"])
                output_file = os.path.join(output_dir, f"{safe_name}.csv")

                # Key on the normalized path: on case-insensitive platforms
                # two names differing only in case are the same file, and
                # reopening it with "w" would truncate rows already written.
                file_key = os.path.normcase(output_file)

                # First row for this output file: create it and write the header
                if file_key not in writers:
                    f = open(output_file, "w", newline="", encoding="utf-8")
                    # Register the handle right away so it is closed even if
                    # writing the header fails.
                    files[file_key] = f
                    writer = csv.DictWriter(f, fieldnames=headers)
                    writer.writeheader()
                    writers[file_key] = writer

                # Write the row to the appropriate file
                writers[file_key].writerow(row)

                # Progress indicator (skipped when the interval is 0/None)
                if progress_interval and row_count % progress_interval == 0:
                    print(f"Processed {row_count} rows...")
    finally:
        # Close all output files, also when reading or writing failed midway
        for f in files.values():
            f.close()

    print(f"Finished splitting {input_file} into {len(files)} files in {output_dir}")
    print(f"Total rows processed: {row_count}")

In [8]:
# Split the publications export into one CSV per source PDF.
input_file = r"C:\Users\bhargav\Downloads\deep-past-initiative-machine-translation\publications.csv"
chunk_csv_by_pdf(input_file=input_file, output_dir=r"C:\chunks_by_pdf")


Processed 10000 rows...
Processed 20000 rows...
Processed 30000 rows...
Processed 40000 rows...
Processed 50000 rows...
Processed 60000 rows...
Processed 70000 rows...
Processed 80000 rows...
Processed 90000 rows...
Processed 100000 rows...
Processed 110000 rows...
Processed 120000 rows...
Processed 130000 rows...
Processed 140000 rows...
Processed 150000 rows...
Processed 160000 rows...
Processed 170000 rows...
Processed 180000 rows...
Processed 190000 rows...
Processed 200000 rows...
Processed 210000 rows...
Finished splitting C:\Users\bhargav\Downloads\deep-past-initiative-machine-translation\publications.csv into 952 files in C:\chunks_by_pdf
Total rows processed: 216602


In [9]:
import os
import shutil

def split_into_folders(base_dir: str, max_size_mb: int = 80):
    """Move the CSV files in ``base_dir`` into ``batch_N`` subfolders by size.

    Files are processed in sorted name order and packed greedily: a new
    ``batch_<N>`` folder (numbered from 1) is started whenever adding the next
    file would push the current folder past ``max_size_mb``.  A single file
    that is larger than the limit gets a folder of its own rather than
    leaving an empty folder behind.  No folder is created when there are no
    CSV files to move.

    Args:
        base_dir: Directory containing the ``.csv`` files (matched
            case-insensitively); batch folders are created inside it.
        max_size_mb: Size budget per folder, in units of 1024 * 1024 bytes.
    """
    max_size_bytes = max_size_mb * 1024 * 1024

    # Regular CSV files only, in sorted order so that repeated runs on the
    # same input produce the same batches (os.listdir order is arbitrary).
    csv_names = sorted(
        entry for entry in os.listdir(base_dir)
        if entry.lower().endswith(".csv")
        and os.path.isfile(os.path.join(base_dir, entry))
    )

    folder_index = 0
    current_folder = None
    current_size = 0

    for filename in csv_names:
        file_path = os.path.join(base_dir, filename)
        file_size = os.path.getsize(file_path)

        # Start a new folder for the very first file, or when this file would
        # overflow a folder that already holds data.  The current_size > 0
        # guard keeps an oversized file from rolling over an empty folder.
        needs_new_folder = current_folder is None or (
            current_size > 0 and current_size + file_size > max_size_bytes
        )
        if needs_new_folder:
            folder_index += 1
            current_folder = os.path.join(base_dir, f"batch_{folder_index}")
            os.makedirs(current_folder, exist_ok=True)
            current_size = 0

        # Move the file into the current folder
        shutil.move(file_path, os.path.join(current_folder, filename))
        current_size += file_size

    print(f"Finished splitting into {folder_index} folders (max {max_size_mb} MB each).")

In [10]:
# Group the per-PDF chunks into batch folders of at most 80 MB each.
split_into_folders(base_dir=r"C:\chunks_by_pdf", max_size_mb=80)

Finished splitting into 8 folders (max 80 MB each).


In [11]:
import os
import csv

def merge_csvs_in_batches(base_dir: str):
    """Merge the CSV files of every ``batch_*`` folder into one CSV per batch.

    For each subdirectory of ``base_dir`` whose name starts with ``batch_``,
    the ``.csv`` files inside it are concatenated (in sorted name order) into
    ``<base_dir>/<batch>_merged.csv``.  The header row is taken from the first
    non-empty file; the header rows of the remaining files are dropped.
    Folders without CSV files are skipped and completely empty CSV files are
    ignored.  A file whose header differs from the first one is still
    appended as-is, but a warning line is printed.

    Args:
        base_dir: Directory that contains the ``batch_*`` folders and that
            receives the merged files.
    """
    # Find all batch folders inside base_dir (sorted for a stable order)
    batch_folders = sorted(
        entry for entry in os.listdir(base_dir)
        if entry.startswith("batch_")
        and os.path.isdir(os.path.join(base_dir, entry))
    )

    for batch in batch_folders:
        batch_path = os.path.join(base_dir, batch)
        output_file = os.path.join(base_dir, f"{batch}_merged.csv")

        # Collect all CSV files in this batch folder; sorting makes the row
        # order of the merged file reproducible.
        csv_files = sorted(
            f for f in os.listdir(batch_path) if f.lower().endswith(".csv")
        )

        if not csv_files:
            continue

        print(f"Merging {len(csv_files)} files in {batch} -> {output_file}")

        # Open the output file once
        with open(output_file, "w", newline="", encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            merged_headers = None

            for filename in csv_files:
                file_path = os.path.join(batch_path, filename)
                # newline="" keeps newlines embedded in quoted fields intact.
                with open(file_path, "r", newline="", encoding="utf-8") as infile:
                    reader = csv.reader(infile)
                    headers = next(reader, None)

                    # A zero-row file has no header to consume; skip it
                    # instead of failing with StopIteration.
                    if headers is None:
                        continue

                    # Write the header once, from the first non-empty file
                    if merged_headers is None:
                        merged_headers = headers
                        writer.writerow(headers)
                    elif headers != merged_headers:
                        print(
                            f"Warning: {filename} has different headers than "
                            f"the first file in {batch}; rows appended as-is"
                        )

                    # The header row was consumed above; copy the data rows
                    writer.writerows(reader)

        print(f"Finished {batch}: merged into {output_file}")

In [12]:
# Merge the CSVs of every batch folder into one file per batch.
merge_csvs_in_batches(base_dir=r"C:\chunks_by_pdf")

Merging 119 files in batch_1 -> C:\chunks_by_pdf\batch_1_merged.csv
Finished batch_1: merged into C:\chunks_by_pdf\batch_1_merged.csv
Merging 80 files in batch_2 -> C:\chunks_by_pdf\batch_2_merged.csv
Finished batch_2: merged into C:\chunks_by_pdf\batch_2_merged.csv
Merging 46 files in batch_3 -> C:\chunks_by_pdf\batch_3_merged.csv
Finished batch_3: merged into C:\chunks_by_pdf\batch_3_merged.csv
Merging 128 files in batch_4 -> C:\chunks_by_pdf\batch_4_merged.csv
Finished batch_4: merged into C:\chunks_by_pdf\batch_4_merged.csv
Merging 226 files in batch_5 -> C:\chunks_by_pdf\batch_5_merged.csv
Finished batch_5: merged into C:\chunks_by_pdf\batch_5_merged.csv
Merging 161 files in batch_6 -> C:\chunks_by_pdf\batch_6_merged.csv
Finished batch_6: merged into C:\chunks_by_pdf\batch_6_merged.csv
Merging 77 files in batch_7 -> C:\chunks_by_pdf\batch_7_merged.csv
Finished batch_7: merged into C:\chunks_by_pdf\batch_7_merged.csv
Merging 115 files in batch_8 -> C:\chunks_by_pdf\batch_8_merged.c