In [3]:
from pathlib import Path
import os
import yaml
import shutil
import uuid
import concurrent.futures
import zipfile
import random
import string

## Script collects the PDFs listed in the split YAML (`p`) that exist under `p_data_root_src`, packs them into ZIPs of `block size=batch size` files each, and stores them in `p_data_root_dst` - tagged by split (`train`/`test`/`val`)

### Goals:
- enables experiments to tune throughput (initially, for `PyMuPDF`)
- generated datasets all contain the $N=23{,}395$ PDFs but split across different block sizes/batch sizes ($k$)
    - $k=100 \rightarrow m=235$ zips
    - $k=256 \rightarrow m=93$ zips
    - $k=512 \rightarrow m=47$ zips
    - $k=1000 \rightarrow m=24$ zips 

In [4]:
# YAML file describing the dataset split; it is loaded below and indexed by split name (e.g. 'test')
p = Path('/home/siebenschuh/Projects/dataprep/code/DPO/meta_split/pymupdf.yaml')
# root directory against which the entries listed in the YAML are resolved
p_data_root_src = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint')
# directory the generated zip archives are written to
# NOTE(review): the trailing `block_100` presumably encodes the batch size - keep it in sync with `batch_size` when changing k
p_data_root_dst = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/benchmark_data/block_100')

In [9]:
# function to generate a unique zip file name using uuid
def generate_unique_name(suffix=''):
    """Return a random zip file name of the form ``<uuid-prefix>-<suffix>.zip``.

    The UUID string is shortened by ``len(suffix) + 1`` characters so that,
    for short suffixes, the stem keeps the 36-character length of a full
    UUID string (e.g. ``ae4a1878-9b9d-49ee-8350-f16d466-test.zip``).

    Args:
        suffix: Non-empty label placed at the end of the stem.

    Returns:
        The file name only (no directory part).

    Raises:
        ValueError: If ``suffix`` is the empty string.
    """
    # Explicit raise instead of `assert`: assertions are removed under
    # `python -O`, which would let an empty suffix through and produce
    # names ending in `-.zip`.
    if suffix == '':
        raise ValueError("cannot be empty string: ``")

    random_part = str(uuid.uuid4())

    # Always keep a few random characters. With a very long suffix the plain
    # truncation would drop the whole UUID, every call would return the same
    # name, and later archives would overwrite earlier ones.
    min_random_chars = 8
    keep = max(len(random_part) - (len(suffix) + 1), min_random_chars)

    return f"{random_part[:keep]}-{suffix}.zip"

# function to zip a batch of files
def zip_files(batch, batch_index,suffix):
    """Pack one batch of files into a uniquely named zip inside `p_data_root_dst`.

    Members are stored flat, under the base name of each input file. If two
    inputs of the same batch share a base name, the later one gets a numeric
    tag (``name_1.pdf``) so that no member is shadowed on extraction.

    Args:
        batch: Iterable of `pathlib.Path` objects to add to the archive.
        batch_index: Position of the batch; only used in the failure message.
        suffix: Label forwarded to `generate_unique_name` for the file name.

    Returns:
        A status string: a "Created zip file ..." message on success or a
        "Failed to create zip file ..." message on any error. Errors are
        reported through the return value and never raised.
    """
    # Bound before the `try` so the error path can tell whether an archive
    # path was ever chosen.
    zip_path = None
    try:
        # Generate a unique zip file name
        zip_name = generate_unique_name(suffix)
        zip_path = p_data_root_dst / zip_name

        # A missing destination would otherwise make every batch fail.
        zip_path.parent.mkdir(parents=True, exist_ok=True)

        # Create a zip file and add the files to it
        used_names = set()
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for file_path in batch:
                # Flattening to the base name can collide for files coming from
                # different directories; zipfile would store both entries under
                # one name and extraction would keep only one of them.
                arcname = file_path.name
                counter = 1
                while arcname in used_names:
                    arcname = f"{file_path.stem}_{counter}{file_path.suffix}"
                    counter += 1
                used_names.add(arcname)
                zipf.write(file_path, arcname=arcname)

        return f"Created zip file: {zip_path} with {len(batch)} files"
    except Exception as e:
        # Do not leave a truncated archive behind: its random name makes it
        # indistinguishable from a complete batch for anything scanning the
        # destination directory.
        if zip_path is not None:
            try:
                zip_path.unlink()
            except OSError:
                # Nothing was written, or it cannot be removed; the original
                # error below is the one worth reporting.
                pass
        return f"Failed to create zip file for batch {batch_index}: {e}"

# function to copy and zip files in parallel
def copy_and_zip_files(train_paths, batch_size, num_processes=4, suffix=''):
    """Split `train_paths` into batches and zip each batch in a worker process.

    Despite the name, nothing is copied: every batch is handed to `zip_files`,
    and the status string it returns is printed once all batches are done.

    Args:
        train_paths: Sequence of file paths to archive (any split, not only
            training data).
        batch_size: Number of files per archive; must be positive. The last
            archive may hold fewer files.
        num_processes: Number of worker processes in the pool.
        suffix: Label forwarded to `zip_files` for naming the archives.

    Returns:
        None. Per-batch status messages are printed in batch order.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    # Zero would fail inside `range()` with an unrelated-looking message and a
    # negative value would silently produce no archives at all.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Create batches of files
    batches = [train_paths[i:i + batch_size] for i in range(0, len(train_paths), batch_size)]

    # Nothing to archive: avoid starting a process pool for no work.
    if not batches:
        return

    # Use ProcessPoolExecutor for parallel processing
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_processes) as executor:
        # Map the list of batches to the zip function; results come back in batch order
        results = list(executor.map(zip_files, batches, range(len(batches)), [suffix]*len(batches)))

    # Output results
    for result in results:
        print(result)

In [10]:
%%time

# split to process; also used as the label in the zip file names
suffix = 'test'

# read the split listing
with open(p, 'r') as fh:
    d = yaml.safe_load(fh)

# resolve every listed entry against the source root and keep only those
# that exist as regular files
file_paths = [
    candidate
    for candidate in (p_data_root_src / rel_path for rel_path in d[suffix])
    if os.path.isfile(candidate)
]

CPU times: user 925 ms, sys: 256 ms, total: 1.18 s
Wall time: 2.64 s


In [11]:
# number of listed files for this split that were found under the source root
len(file_paths)

2882

In [12]:
# zip the collected files of this split into archives of 100 files each, using 4 worker processes
copy_and_zip_files(file_paths, batch_size=100, num_processes=4, suffix=suffix)

Created zip file: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/benchmark_data/block_100/ae4a1878-9b9d-49ee-8350-f16d466-test.zip with 100 files
Created zip file: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/benchmark_data/block_100/e0635156-4e4e-45f7-bdcd-d0060ca-test.zip with 100 files
Created zip file: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/benchmark_data/block_100/72023ac9-f673-4c3f-a498-151dc22-test.zip with 100 files
Created zip file: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/benchmark_data/block_100/0b3aeefa-9340-49ef-92d9-a56f4bd-test.zip with 100 files
Created zip file: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/benchmark_data/block_100/35ae1924-aac9-4abc-adc5-6143b8c-test.zip with 100 files
Created zip file: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/benchmark_data/block_100/f22b87fe-2607-4bde-abda-d21c3e2-test.zip with 100 files
Created zip file: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/ben