In [None]:
import pandas as pd
import zipfile
import os
import shutil
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

from utils import zip_files, remove_text_layer, simulated_scanned_effect

In [1]:
# Copy one sample PDF to test.pdf in the working directory.
# NOTE(review): neither path names a remote host, so scp presumably acts as a
# plain local copy here — confirm.
!scp 1536_compressed/10.1101_2023.04.07.536049.pdf test.pdf

# Create dataset of clean and "dirty" PDFs

## 1. Transfer clean PDFs

In [None]:
%%time

# Source root: the entries of each table's 'path' column are joined onto this
# directory further down in this cell.
p_joint = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint/')

# load 1,536 (small); the CSV is pipe-delimited
df_1536 = pd.read_csv('./testset_1536/df_1536.csv', sep='|')

# load 10,240 (big); the CSV is pipe-delimited
df_10240 = pd.read_csv('./testset_10240/df_10240.csv', sep='|')

# Destination root: one sub-directory per test set is created beneath it.
# NOTE(review): the zipping cell later in this notebook reads the PDFs from
# '<this path>/pdf/<subDir>', whereas the copies made in this cell land in
# '<this path>/<subDir>' — confirm whether '/pdf' is missing here or the files
# are moved by hand between the two steps.
p_pdf_dsets = Path('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/parsing_datasets')

def copy_file(src, dest_dir):
    """Copy ``src`` into ``dest_dir`` under its original file name.

    The copy is made with ``shutil.copy2``, which also attempts to preserve
    the file's metadata.

    Args:
        src: ``Path`` of the file to copy.
        dest_dir: ``Path`` of the directory that receives the copy.

    Returns:
        ``Path`` of the copied file inside ``dest_dir``.
    """
    target = dest_dir.joinpath(src.name)
    shutil.copy2(src, target)
    return target

# Copy the PDFs of each test set into its own sub-directory of p_pdf_dsets
for df_, subDir in ((df_1536, 'n_1536'), (df_10240, 'n_10_240')):
    # Full source path of every PDF listed in this table
    pdf_files = [p_joint / rel_path for rel_path in df_['path']]

    # Target directory for this test set (created if it does not exist yet)
    dest_dir = p_pdf_dsets / subDir
    dest_dir.mkdir(parents=True, exist_ok=True)

    # executor.map pairs each source file with one destination argument,
    # so the target directory is repeated once per file
    dest_dirs = [dest_dir for _ in pdf_files]

    # Spread the copies over 8 worker processes and collect the returned
    # destination paths in input order
    with ProcessPoolExecutor(max_workers=8) as executor:
        results = list(executor.map(copy_file, pdf_files, dest_dirs))

    # Report how many copies were made for this test set
    print(f"Copied {len(results)} files to {dest_dir}")

In [None]:
# Row counts of the small and the large test-set tables
len(df_1536), len(df_10240)

## Put (clean) PDFs into ZIPs

In [None]:
# Source root of the clean PDFs and destination root of the ZIP archives
p_pdf_dsets = Path('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/parsing_datasets/pdf')
p_zip_destination = Path('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/parsing_datasets/zip')

# Number of PDFs per archive (loop-invariant, so defined once up front)
batch_size = 256

# Process subdirectories and PDFs
for subDir in ['n_1536', 'n_10_240']:
    # Create the destination subdirectory in the zip destination path
    dest_dir = p_zip_destination / subDir
    dest_dir.mkdir(parents=True, exist_ok=True)

    # All paths to the PDFs in the current subdirectory.
    # os.listdir returns entries in arbitrary order, so the paths are sorted
    # to keep the assignment of PDFs to batches reproducible across runs.
    src_dir = p_pdf_dsets / subDir
    pdf_file_paths = sorted(src_dir / f for f in os.listdir(src_dir) if f.endswith('.pdf'))

    # Group PDFs into consecutive batches of `batch_size`
    for i in range(0, len(pdf_file_paths), batch_size):
        batch_files = pdf_file_paths[i:i + batch_size]

        # One archive per batch, numbered from 1, e.g. b000001.zip
        zip_filename = f'b{i//batch_size + 1:06}.zip'
        zip_path = dest_dir / zip_filename

        # Store each PDF under its bare file name (no directory components)
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for pdf_file in batch_files:
                zipf.write(pdf_file, arcname=pdf_file.name)

        print(f"Created zip: {zip_path} with {len(batch_files)} files")

# Manipulate PDFs

Source (original PDFs):
```
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/parsing_datasets/pdf_orig/n_1536
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/parsing_datasets/pdf_orig/n_10_240
```

Keep track of the manipulations applied to each PDF in a table.

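Destination (manipulated PDFs; **TODO:** the paths below are identical to the source paths — replace them with the intended output directory):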
```
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/parsing_datasets/pdf_orig/n_1536
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/parsing_datasets/pdf_orig/n_10_240
```



## 2. Put (manipulated) PDFs into ZIPs

In [None]:
# source
# NOTE(review): this cell repeats the path definition and the two CSV loads
# from the transfer cell near the top of the notebook; re-running it reloads
# fresh copies of both tables from disk.
p_joint = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint/')

# load 1,536 (small); the CSV is pipe-delimited
df_1536 = pd.read_csv('./testset_1536/df_1536.csv', sep='|')

# load 10,240 (big); the CSV is pipe-delimited
df_10240 = pd.read_csv('./testset_10240/df_10240.csv', sep='|')


In [None]:
# Display the 'transformed' column of the small test-set table.
# NOTE(review): assumes df_1536.csv contains a 'transformed' column — confirm.
df_1536['transformed']

In [None]:
# p_destination
p_pdf_dsets = Path('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/parsing_datasets/pdf')