In [30]:
import os
from pathlib import Path
import pandas as pd
import shutil
import time
import uuid
import pymupdf
import zipfile
from concurrent.futures import ThreadPoolExecutor

from utils import remove_text_layer, simulated_scanned_effect, zip_files, text_scrambler

In [31]:
# --- Directories ---------------------------------------------------------
# Root/source directory that holds the original PDFs
p_root = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint')
assert p_root.is_dir(), "Root/source dir of pdfs does not exist"

# Destination directory that receives the copied PDFs
p_dst = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/strong_scaling/data/pdf')
assert p_dst.is_dir(), "Destination dir for pdfs does not exist"

# --- Metadata frame ------------------------------------------------------
# Pipe-separated table, read once; the `manipulated` column flags each row (0 / 1)
p_df = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/scaling_data/frames/df_mod_10240.csv')
df_10240 = pd.read_csv(p_df, sep='|')

# Rows flagged as manipulated
df_manip = df_10240[df_10240['manipulated'] == 1]

# Rows flagged as un-manipulated
df_unmanipulated = df_10240[df_10240['manipulated'] == 0]

# --- Sub-sampling --------------------------------------------------------
# EDIT: keep only a subset of the un-manipulated PDFs -> 30% manipulation rate
# (15% no text, 15% OCR-ed text)
# Target size of the subset: 35% of the 10,240 rows
n_unmanip_target = 3584

# Draw 75% of the rows in random (but seeded, hence reproducible) order and
# keep the first `n_unmanip_target` of them
df_unmanip_sampled = (
    df_unmanipulated
    .sample(frac=0.75, random_state=42)
    .head(n_unmanip_target)
)

# `head` silently returns fewer rows when the sample is too small, so check the size explicitly
assert len(df_unmanip_sampled) == n_unmanip_target, (
    f"Expected {n_unmanip_target} sampled rows, got {len(df_unmanip_sampled)}"
)

In [32]:
# Sanity check: number of sampled rows shown next to 35% of 10,240
len(df_unmanip_sampled), 10_240 * 0.35

(3584, 3584.0)

In [33]:
# From here on `df_unmanipulated` refers to the sampled subset, so the copy
# cell below only transfers the sampled PDFs
df_unmanipulated = df_unmanip_sampled

## Copy PDFs

#### 1. 3584 un-manipulated (sampled subset; originally 8704)
Transfer in parallel "as is"

In [34]:
copy_UNmanipulated = True
copy_ToBeManipulated = False


def copy_file(src, dst_dir=None):
    """Copy a single file into a directory, keeping its basename and metadata.

    Parameters
    ----------
    src : Path
        File to copy.
    dst_dir : Path, optional
        Directory to copy into. Defaults to the global `p_dst`.
    """
    target_dir = p_dst if dst_dir is None else Path(dst_dir)
    shutil.copy2(src, target_dir / src.name)  # copy2 also copies file metadata


def copy_files_parallel(sources, dst_dir):
    """Copy every path in `sources` into `dst_dir` using a thread pool.

    Raises whatever exception a failed copy raised in its worker thread.
    """
    with ThreadPoolExecutor() as executor:
        # The results must be consumed: exceptions raised inside the workers are
        # only re-raised when the iterator returned by `map` is iterated
        list(executor.map(lambda src: copy_file(src, dst_dir), sources))


# copy all not-to-be manipulated files directly to destination
if copy_UNmanipulated:
    # Only keep entries whose file actually exists below `p_root`
    candidate_paths = (p_root / f for f in df_unmanipulated['path'])
    all_UNmanipulated_pdf_file_paths = [p for p in candidate_paths if p.is_file()]

    copy_files_parallel(all_UNmanipulated_pdf_file_paths, p_dst)

    # msg
    print(f'Done. {len(os.listdir(p_dst))} PDFs in dst path')


# to be manipulated: copy from `joint` path to local directory (before further post-processing)
if copy_ToBeManipulated:
    # Local staging directory; created on demand so the copy cannot fail on a missing folder
    p_staging = Path('./1536_original')
    p_staging.mkdir(parents=True, exist_ok=True)

    # Only keep entries whose file actually exists below `p_root`
    candidate_paths = (p_root / f for f in df_manip['path'])
    to_be_manipulated_pdf_file_paths = [p for p in candidate_paths if p.is_file()]

    copy_files_parallel(to_be_manipulated_pdf_file_paths, p_staging)

    # msg: report the directory that was actually written to
    print(f'Done. {len(os.listdir(p_staging))} PDFs in dst path {p_staging}')

Done. 3584 PDFs in dst path


#### 2. 1536 manipulated 
Copy files one-by-one (post manipulation procedure)

# 3. Create ZIPs

In [7]:
# Directory containing the PDFs (source) and directory for the ZIP archives (destination)
p_pdf = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/strong_scaling/data/pdf/')
p_zip = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/strong_scaling/data/zip/')

# Log container, starts out empty
zip_dict = dict()

def zip_files(file_paths, output_zip_path):
    """Write the given files into one DEFLATE-compressed ZIP archive.

    Parameters
    ----------
    file_paths : iterable of str or Path
        Files to add to the archive.
    output_zip_path : str or Path
        Location of the archive; it is opened in write mode.

    Every file is stored under its basename only, i.e. without its directory.
    """
    archive = zipfile.ZipFile(output_zip_path, mode='w', compression=zipfile.ZIP_DEFLATED)
    with archive:
        for source in map(Path, file_paths):
            # Flatten the directory structure: only the file name goes into the archive
            archive.write(source, arcname=source.name)

In [10]:
%%time

# TODO: make this a program parameterised by `batch_size: int`

# Supported batch sizes and the one processed by this run.
# Both are defined here so the cell does not depend on variables left over
# from earlier (possibly deleted) cells.
batch_sizes = [16, 64, 128]
batch_size = 16
assert batch_size in batch_sizes, "One of those batch sizes"

# source & destination path
p_pdf = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/strong_scaling/data/pdf/')
p_zip = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/strong_scaling/data/zip/')

# for logging: maps each written ZIP path to the list of PDFs it contains
zip_dict = {}

def zip_files(file_paths, output_zip_path):
    """Write the given files into one DEFLATE-compressed ZIP archive.

    Parameters
    ----------
    file_paths : iterable of str or Path
        Files to add to the archive.
    output_zip_path : str or Path
        Location of the archive; it is opened in write mode.

    Every file is stored under its basename only, i.e. without its directory.
    """
    archive = zipfile.ZipFile(output_zip_path, mode='w', compression=zipfile.ZIP_DEFLATED)
    with archive:
        for source in map(Path, file_paths):
            # Flatten the directory structure: only the file name goes into the archive
            archive.write(source, arcname=source.name)

# Make sure the base directory for all ZIP archives exists
p_zip.mkdir(parents=True, exist_ok=True)

# All PDFs to archive. Sorted because the order in which `glob` yields entries
# depends on the file system; without sorting the composition of the batches
# could differ between runs.
pdf_file_paths = sorted(p_pdf.glob('*.pdf'))

# Destination directory for this batch size
# NOTE: this rebinds `p_dst`, which the PDF copy cell uses as its destination
# directory; re-run the path setup cell before copying PDFs again.
p_dst = p_zip / f'b{batch_size}'
p_dst.mkdir(parents=True, exist_ok=True)

# Split list into list of lists where each list has length `batch_size`
# If the total number of PDFs is not a multiple of batch_size, the last list may be shorter
list_of_lists = [
    pdf_file_paths[i:i + batch_size]
    for i in range(0, len(pdf_file_paths), batch_size)
]

# Index of the duplicate set encoded in the file name (no duplicates yet, so
# only set 0 is written). Defined here so the loop does not rely on a variable
# left over from another cell.
k = 0

for i, batch_file_paths in enumerate(list_of_lists):
    # Construct the ZIP file name: zero-padded batch size, batch index and duplicate index
    zip_filename = (
        f"bs{batch_size:04d}"
        f"id{i:03d}-{k:02d}.zip"
    )
    # Output ZIP path
    output_zip_path = p_dst / zip_filename
    # Call zip_files function to create the ZIP file
    print('Writing...')
    zip_files(batch_file_paths, output_zip_path)
    print('Done w...')
    # Update the dictionary: ZIP path -> PDFs contained in that ZIP
    zip_dict[str(output_zip_path)] = batch_file_paths

# TODO: store `zip_dict` as f'zip_dict_{batch_size}.json'
# (its values are lists of Path objects and need converting to str before JSON serialisation)


Writing...
Done w...
Writing...
Done w...
Writing...


KeyboardInterrupt: 

In [9]:
# Full path built from the current `p_dst` and `zip_filename`
# (both are set by the ZIP batching cell, which must have been run first)
p_dst / zip_filename

PosixPath('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/strong_scaling/data/zip/b16/bs0016id005-00.zip')