In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import os, json
from matplotlib import pyplot as plt

## Crucial script to manage dataframes
- including parser output (and computed BLEU scores)
- merging inferred categories
- subset and store `parser_output` and `meta_and_metrics_only` dataframes

In [2]:
# Define the path to the CSV file
csv_path = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/db_pypdf_last_run.csv')

# Load the '|'-separated CSV file
df = pd.read_csv(csv_path, sep='|')

# Define the number of splits
num_splits = 5

# Split the dataframe into `num_splits` contiguous pieces.
# The row *positions* are split and then selected with `iloc`: calling
# np.array_split directly on a DataFrame goes through the deprecated
# DataFrame.swapaxes and emits a FutureWarning. Chunk sizes are the same
# (the first `len(df) % num_splits` chunks get one extra row).
row_chunks = np.array_split(np.arange(len(df)), num_splits)
split_dfs = [df.iloc[rows] for rows in row_chunks]

# Save each split to a separate CSV file with headers
output_base = '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/db_pypdf_last_run'
for i, split_df in enumerate(split_dfs):
    split_df.to_csv(f"{output_base}{i}.csv", sep='|', index=False)  # Save with the split number in the file name

  return bound(*args, **kwds)


# 2. Merge the processed sub-DataFrames (with `text`, `car`, etc.) back into one

In [2]:
# Read the processed split files back in, one DataFrame per split
# (original author's note: stupid error: ended up in pagewise)
output_base = '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/db_pypdf_last_run_processed_'
num_splits = 5

# The file name carries the split number; frames are collected in split order
df_list = [
    pd.read_csv(f"{output_base}{split_idx}.csv", sep='|')
    for split_idx in range(num_splits)
]

In [3]:
# Stack the per-split frames row-wise (concat's default axis) and renumber the index
df_merged = pd.concat(df_list, ignore_index=True)

In [4]:
len(df_merged)

24968

In [5]:
# Store the merged frame as one '|'-separated CSV, without the index column
merged_csv = '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/db_pypdf_last_run_processed.csv'
df_merged.to_csv(merged_csv, sep='|', index=False)

## 3. Merge this df onto the previous one

In [6]:
%%time

# Location of the previously built parser-output-with-metrics table.
# NOTE(review): this path starts with /eagle/ while most other paths in this
# notebook start with /lus/eagle/ — presumably the same filesystem; confirm.
p_output_and_metrics = Path('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/parser_output_with_metrics.csv')

# Load the '|'-separated table; the recorded run below took about two minutes
df_om = pd.read_csv(p_output_and_metrics, sep='|')

CPU times: user 1min 53s, sys: 19.8 s, total: 2min 13s
Wall time: 2min 14s


In [7]:
len(df_om)

23398

## 4. `FINAL_parser_output_and_metrics.csv`
- corrected `pypdf` (that may be false) and includes new addition `tesseract`

In [2]:
# load tesseract/pypdf proc frame
# drop columns : ['Unnamed: 0', 'html'] from df_merged

# UNCOMMENT if `df_merged` was not computed a few cells up
#df_merged = pd.read_csv('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/tesseract_and_pypdf_processed.csv', sep='|')
#df_merged = df_merged.drop(columns=['Unnamed: 0', 'html'])

In [6]:
# UNCOMMENT if df_om not loaded a few cells up

# load
#p_output_and_metrics = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/parser_output_with_metrics.csv')
#df_om = pd.read_csv(p_output_and_metrics, sep='|')
#df_om = df_om.drop(columns=[c for c in df_om.columns if ('pypdf' in c) or ('html' in c)])

In [31]:
df_merged.columns

Index(['Unnamed: 0', 'path', 'html', 'pypdf', 'html_norm', 'pypdf_norm',
       'bleu_pypdf', 'rouge_pypdf', 'car_pypdf', 'bleu_pypdf_norm',
       'rouge_pypdf_norm', 'car_pypdf_norm'],
      dtype='object')

In [None]:
# may drop everything `pypdf`

In [32]:
# merge the two: left side is df_om without any column whose name contains 'pypdf',
# right side is df_merged without any 'html*' / 'Unnamed*' column
om_cols = [c for c in df_om.columns if 'pypdf' not in c]
merged_cols = [c for c in df_merged.columns if 'html' not in c and 'Unnamed' not in c]

# left join on 'path'
df_combined = df_om[om_cols].merge(df_merged[merged_cols], on='path', how='left')

In [33]:
df_combined.columns

Index(['path', 'html', 'nougat', 'pymupdf', 'marker', 'grobid', 'tesseract',
       'html_norm', 'nougat_norm', 'pymupdf_norm', 'grobid_norm',
       'marker_norm', 'tesseract_norm', 'bleu_nougat', 'rouge_nougat',
       'car_nougat', 'bleu_nougat_norm', 'rouge_nougat_norm',
       'car_nougat_norm', 'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf',
       'bleu_pymupdf_norm', 'rouge_pymupdf_norm', 'car_pymupdf_norm',
       'bleu_grobid', 'rouge_grobid', 'car_grobid', 'bleu_grobid_norm',
       'rouge_grobid_norm', 'car_grobid_norm', 'bleu_marker', 'rouge_marker',
       'car_marker', 'bleu_marker_norm', 'rouge_marker_norm',
       'car_marker_norm', 'bleu_tesseract', 'rouge_tesseract', 'car_tesseract',
       'bleu_tesseract_norm', 'rouge_tesseract_norm', 'car_tesseract_norm',
       'category', 'subcategory', 'pypdf', 'pypdf_norm', 'bleu_pypdf',
       'rouge_pypdf', 'car_pypdf', 'bleu_pypdf_norm', 'rouge_pypdf_norm',
       'car_pypdf_norm'],
      dtype='object')

In [36]:
# sort columns by order: path, raw parser texts, normalized texts,
# per-parser metrics, then category / subcategory
raw_text_cols = ['html', 'nougat', 'pymupdf', 'pypdf', 'marker', 'grobid', 'tesseract']
norm_text_cols = ['html_norm', 'nougat_norm', 'pymupdf_norm', 'pypdf_norm',
                  'grobid_norm', 'marker_norm', 'tesseract_norm']

# For each parser: bleu/rouge/car on the raw text, then the same three on the normalized text
metric_cols = [
    f"{metric}_{parser}{suffix}"
    for parser in ('nougat', 'pymupdf', 'grobid', 'marker', 'tesseract', 'pypdf')
    for suffix in ('', '_norm')
    for metric in ('bleu', 'rouge', 'car')
]

ordered_cols = ['path', *raw_text_cols, *norm_text_cols, *metric_cols, 'category', 'subcategory']
df_combined = df_combined[ordered_cols]

In [37]:
df_combined.columns

Index(['path', 'html', 'nougat', 'pymupdf', 'pypdf', 'marker', 'grobid',
       'tesseract', 'html_norm', 'nougat_norm', 'pymupdf_norm', 'pypdf_norm',
       'grobid_norm', 'marker_norm', 'tesseract_norm', 'bleu_nougat',
       'rouge_nougat', 'car_nougat', 'bleu_nougat_norm', 'rouge_nougat_norm',
       'car_nougat_norm', 'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf',
       'bleu_pymupdf_norm', 'rouge_pymupdf_norm', 'car_pymupdf_norm',
       'bleu_grobid', 'rouge_grobid', 'car_grobid', 'bleu_grobid_norm',
       'rouge_grobid_norm', 'car_grobid_norm', 'bleu_marker', 'rouge_marker',
       'car_marker', 'bleu_marker_norm', 'rouge_marker_norm',
       'car_marker_norm', 'bleu_tesseract', 'rouge_tesseract', 'car_tesseract',
       'bleu_tesseract_norm', 'rouge_tesseract_norm', 'car_tesseract_norm',
       'bleu_pypdf', 'rouge_pypdf', 'car_pypdf', 'bleu_pypdf_norm',
       'rouge_pypdf_norm', 'car_pypdf_norm', 'category', 'subcategory'],
      dtype='object')

In [39]:
len(df_combined)

23398

In [40]:
%%time

# store this new DF: Will take 8min 19s (and produce a dataframe with the size of )
df_combined.to_csv('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_FINAL_parser_output_and_metrics.csv', sep='|', index=None)


CPU times: user 5min 42s, sys: 38.7 s, total: 6min 20s
Wall time: 6min 25s


In [44]:
# Metrics and Meta only: 'path', the six metric columns of every parser
# (bleu/rouge/car, raw then *_norm), and the category columns — no parsed text
metrics_only_cols = [
    f"{metric}_{parser}{suffix}"
    for parser in ('nougat', 'pymupdf', 'grobid', 'marker', 'tesseract', 'pypdf')
    for suffix in ('', '_norm')
    for metric in ('bleu', 'rouge', 'car')
]
df_metrics_n_meta = df_combined[['path', *metrics_only_cols, 'category', 'subcategory']]

In [45]:
# store the metrics-and-meta table ('|'-separated, no index column)
metrics_csv = '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_FINAL_meta_and_metrics_only.csv'
df_metrics_n_meta.to_csv(metrics_csv, sep='|', index=False)

## 4. Load inferred categories, merge them onto `metric_only` table

In [None]:
import sys

# Add the directory containing utils.py to the system path
sys.path.append(str(Path('../data_assembly/category_inference').resolve()))

# Import the methods from utils.py
from utils import get_HF_frame, get_GPT4_frame, get_frame

In [None]:
# get_frame is imported from category_inference/utils.py (not shown here).
# NOTE(review): presumably returns the inferred-category table with a 'path'
# column, since the result is merged on 'path' further down — confirm.
df_meta = get_frame()

In [None]:
# show columns
df_meta.columns

In [None]:
# merge meta onto df_combined (left join on 'path')
df_m = df_combined.merge(df_meta, on='path', how='left')

In [None]:
df_m.columns

In [28]:
# store the merged table ('|'-separated, no index column)
final_table_csv = '/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/FINAL_FINAL_parser_output_with_metrics.csv'
df_m.to_csv(final_table_csv, sep='|', index=False)

In [29]:
# store meta & metrics only table: 'path', six metric columns per parser
# (bleu/rouge/car, raw then *_norm), and the category columns.
# NOTE(review): unlike the df_metrics_n_meta selection earlier in this notebook,
# this list contains no *_pypdf metrics — confirm that is intended.
meta_metric_cols = [
    f"{metric}_{parser}{suffix}"
    for parser in ('nougat', 'pymupdf', 'grobid', 'marker', 'tesseract')
    for suffix in ('', '_norm')
    for metric in ('bleu', 'rouge', 'car')
]
df_meta_metrics_only = df_m[['path', *meta_metric_cols, 'category', 'subcategory']]

# store
meta_metrics_csv = '/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/FINAL_FINAL_meta_and_metrics_only.csv'
df_meta_metrics_only.to_csv(meta_metrics_csv, sep='|', index=False)

## 5. Load `df_10240` and `df_1536`

Create a modified version of `df_1536`, called `df_mod_1536`, in which the text is replaced with either "", the tesseract-parsed text, or the grobid-parsed text.

Also, store the respective BLEU score (of `0`, `bleu_tesseract`, or `bleu_grobid`) if needed.

In [49]:
# paths to dfs
testset_root = Path('/home/siebenschuh/Projects/dataprep/code/data_assembly')
p_1536 = testset_root / 'testset_1536' / 'df_1536.csv'
p_10240 = testset_root / 'testset_10240' / 'df_10240.csv'

# load both '|'-separated frames
df_1536 = pd.read_csv(p_1536, sep='|')
df_10240 = pd.read_csv(p_10240, sep='|')

# store unmodified copies in the new location (no index column)
frames_dir = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/scaling_data/frames')
df_1536.to_csv(frames_dir / 'df_orig_1536.csv', sep='|', index=False)
df_10240.to_csv(frames_dir / 'df_orig_10240.csv', sep='|', index=False)

In [13]:
# Fix the global NumPy seed so both random draws below are reproducible
np.random.seed(42)

# Relies on 'df_1536' (and, further down, 'df_combined') from earlier cells.
# By definition, every entry in df_1536 is "manipulated".
df_1536['manipulated'] = 1
n_rows = len(df_1536)

# Step 1: 'text_layer' — a fair coin flip for manipulated rows, 1 otherwise
layer_draw = np.random.choice([0, 1], size=n_rows, p=[0.5, 0.5])
is_manipulated = df_1536['manipulated'] == 1
df_1536['text_layer'] = np.where(is_manipulated, layer_draw, 1)

# Step 2: 'emb_txt_src' — tesseract or grobid with equal probability where
# text_layer is 1, "-" (no source) elsewhere
source_draw = np.random.choice(['tesseract', 'grobid'], size=n_rows, p=[0.5, 0.5])
has_text_layer = df_1536['text_layer'] == 1
df_1536['emb_txt_src'] = np.where(has_text_layer, source_draw, "-")

# Step 3: Create 'text' and 'text_bleu' columns with conditional assignments
def assign_text_and_bleu(row, parsed=None):
    """Pick the text and BLEU score for one row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'manipulated'. Also read, depending on the branch taken:
        'html' (manipulated == 0), 'emb_txt_src' (manipulated == 1) and
        'path' (only when a parser lookup is needed).
    parsed : DataFrame, optional
        Lookup table with a 'path' column plus '<parser>' and 'bleu_<parser>'
        columns for 'tesseract' and 'grobid'. Defaults to the notebook-level
        `df_combined`, so `df.apply(assign_text_and_bleu, axis=1, ...)` keeps
        working unchanged.

    Returns
    -------
    tuple
        (text, bleu):
        * manipulated == 0 -> (row['html'], 0.0)
        * manipulated == 1 and emb_txt_src is 'tesseract' or 'grobid' and
          `parsed` has a row with the same path -> that parser's text and
          BLEU score, taken from the first matching row
        * anything else -> ("-", 0.0)
    """
    # Unmanipulated rows keep the html text; BLEU defaults to 0
    if row['manipulated'] == 0:
        return row['html'], 0.0

    if row['manipulated'] == 1:
        source = row['emb_txt_src']
        # Only scan the lookup table when a parser was actually selected;
        # rows with any other source (e.g. "-") fall straight through.
        if source in ('tesseract', 'grobid'):
            if parsed is None:
                parsed = df_combined
            matches = parsed[parsed['path'] == row['path']]
            if not matches.empty:
                first_match = matches.iloc[0]
                return first_match[source], first_match[f'bleu_{source}']

    # Default return if no conditions match
    return "-", 0.0

# Run the row-wise lookup; result_type="expand" turns each returned
# (text, bleu) pair into two columns
text_and_bleu = df_1536.apply(assign_text_and_bleu, axis=1, result_type="expand")
df_1536[['text', 'text_bleu']] = text_and_bleu

# Show the first rows of the updated frame
df_1536.head()


Unnamed: 0,path,publisher,class,subclass,class_src,subclass_src,subset,manipulated,text_layer,emb_txt_src,text,text_bleu
0,nature/pdf/n_a_t_u_r_e_2_5_5_1_1.pdf,Nature,Nature,-,-,-,test,1,0,-,-,0.0
1,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_7_0_2_-_8...,Nature,Nature,-,-,-,test,1,1,grobid,Groundwater is the most ubiquitous source of l...,0.183942
2,nature/pdf/n_a_t_u_r_e_2_6_0_0_2.pdf,Nature,Nature,-,-,-,test,1,1,tesseract,ARTICLE\n\n1doi:10.1038/nature26002\n\nPlacent...,0.434241
3,nature/pdf/n_a_t_u_r_e_1_3_8_1_2.pdf,Nature,Nature,-,-,-,test,1,1,grobid,"The monarch butterfly, Danaus plexippus, is fa...",0.041296
4,nature/pdf/n_a_t_u_r_e_1_3_5_6_8.pdf,Nature,Nature,-,-,-,test,1,0,-,-,0.0


In [48]:
# store the modified 1536 frame ('|'-separated, no index column)
mod_1536_csv = '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/scaling_data/frames/df_mod_1536.csv'
df_1536.to_csv(mod_1536_csv, sep='|', index=False)

In [17]:
np.unique(df_1536['publisher'], return_counts=True)

(array(['Nature', 'arxiv', 'biorxiv', 'bmc', 'mdpi', 'medrxiv'],
       dtype=object),
 array([256, 256, 256, 256, 256, 256]))

In [20]:
df_10240.columns

Index(['path', 'publisher', 'class', 'subclass', 'class_src', 'subclass_src',
       'subset'],
      dtype='object')

In [23]:
# never manipulated (except the rows that will be replaced by 1536)
df_10240['manipulated'] = 0
df_10240['text_layer'] = 1
df_10240['emb_txt_src'] = 'pymupdf'

# Make the cell safe to re-run: drop 'text' / 'text_bleu' left over from a
# previous execution, otherwise the merge + rename would create duplicate
# column names. On a first run neither column exists and this is a no-op.
df_10240 = df_10240.drop(columns=['text', 'text_bleu'], errors='ignore')

# Take the pymupdf text and BLEU score from df_om, already renamed to the
# target column names, and left-join them onto df_10240 by 'path'
pymupdf_lookup = df_om[['path', 'pymupdf', 'bleu_pymupdf']].rename(
    columns={'pymupdf': 'text', 'bleu_pymupdf': 'text_bleu'}
)
df_10240 = df_10240.merge(pymupdf_lookup, on='path', how='left')

# Display the updated df_10240
df_10240.head()

Unnamed: 0,path,publisher,class,subclass,class_src,subclass_src,subset,manipulated,text_layer,emb_txt_src,text,text_bleu
0,nature/pdf/n_a_t_u_r_e_0_9_4_2_8.pdf,Nature,Nature,-,-,-,val,0,1,pymupdf,ARTICLE\ndoi:10.1038/nature09428\nAn unprecede...,0.492119
1,nature/pdf/s_4_1_5_8_6_-_0_2_2_-_0_5_3_1_3_-_9...,Nature,Nature,-,-,-,test,0,1,pymupdf,260 Nature Vol 611 10 November 2022\n...,0.490592
2,nature/pdf/s_4_1_5_8_6_-_0_2_2_-_0_5_3_9_1_-_9...,Nature,Nature,-,-,-,val,0,1,pymupdf,Nature Vol 611 17 November 2022 507\n...,0.61682
3,nature/pdf/s_4_1_5_8_6_-_0_2_2_-_0_5_4_8_2_-_7...,Nature,Nature,-,-,-,test,0,1,pymupdf,540 Nature Vol 612 15 December 2022\n...,0.550893
4,nature/pdf/n_a_t_u_r_e_0_4_1_7_7.pdf,Nature,Nature,-,-,-,test,0,1,pymupdf,© 2006 Nature Publishing Group \nA quantitativ...,0.637462


In [41]:
# Boolean mask: rows of df_10240 whose path also occurs in df_1536
common_paths = df_10240['path'].isin(df_1536['path'])

# Keep only the non-overlapping df_10240 rows ...
kept_10240 = df_10240.loc[~common_paths]

# ... and append df_1536 in place of the removed ones, with a fresh index
df_mod_10240 = pd.concat([kept_10240, df_1536], ignore_index=True)

# Show the first rows of the result
df_mod_10240.head()


Unnamed: 0,path,publisher,class,subclass,class_src,subclass_src,subset,manipulated,text_layer,emb_txt_src,text,text_bleu
0,nature/pdf/s_4_1_5_8_6_-_0_2_2_-_0_5_3_9_1_-_9...,Nature,Nature,-,-,-,val,0,1,pymupdf,Nature Vol 611 17 November 2022 507\n...,0.61682
1,nature/pdf/s_4_1_5_8_6_-_0_2_2_-_0_5_4_8_2_-_7...,Nature,Nature,-,-,-,test,0,1,pymupdf,540 Nature Vol 612 15 December 2022\n...,0.550893
2,nature/pdf/n_a_t_u_r_e_0_4_1_7_7.pdf,Nature,Nature,-,-,-,test,0,1,pymupdf,© 2006 Nature Publishing Group \nA quantitativ...,0.637462
3,nature/pdf/s_4_1_5_8_6_-_0_2_2_-_0_5_5_3_4_-_y...,Nature,Nature,-,-,-,test,0,1,pymupdf,120 Nature Vol 613 5 January 2023\nAr...,0.588498
4,nature/pdf/s_4_1_5_8_6_-_0_1_8_-_0_3_8_2_-_x.pdf,Nature,Nature,-,-,-,val,0,1,pymupdf,Article\nhttps://doi.org/10.1038/s41586-018-03...,0.530235


In [47]:
# store the modified 10240 frame ('|'-separated, no index column)
mod_10240_csv = '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/scaling_data/frames/df_mod_10240.csv'
df_mod_10240.to_csv(mod_10240_csv, sep='|', index=False)

In [42]:
# Sanity check: replacing the overlapping rows must not change the row count
# (the original compared df_mod_10240 with itself)
len(df_10240), len(df_mod_10240)

(10240, 10240)

In [46]:
df_mod_10240['text_bleu'].mean()

np.float64(0.3901893397203056)

In [45]:
# Number of manipulated rows in the merged frame.
# NOTE(review): the original cell referenced the undefined name `df_mod_`;
# this expression is a reconstruction that matches the recorded scalar output — confirm.
df_mod_10240['manipulated'].sum()

np.int64(1536)

In [28]:
df_10240['text_bleu'].mean()

np.float64(0.43186784416856616)

In [29]:
df_1536.columns

Index(['path', 'publisher', 'class', 'subclass', 'class_src', 'subclass_src',
       'subset', 'manipulated', 'text_layer', 'emb_txt_src', 'text',
       'text_bleu'],
      dtype='object')

In [30]:
df_om.columns

Index(['path', 'nougat', 'pymupdf', 'marker', 'grobid', 'nougat_norm',
       'pymupdf_norm', 'grobid_norm', 'marker_norm', 'bleu_nougat',
       'rouge_nougat', 'car_nougat', 'bleu_nougat_norm', 'rouge_nougat_norm',
       'car_nougat_norm', 'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf',
       'bleu_pymupdf_norm', 'rouge_pymupdf_norm', 'car_pymupdf_norm',
       'bleu_grobid', 'rouge_grobid', 'car_grobid', 'bleu_grobid_norm',
       'rouge_grobid_norm', 'car_grobid_norm', 'bleu_marker', 'rouge_marker',
       'car_marker', 'bleu_marker_norm', 'rouge_marker_norm',
       'car_marker_norm'],
      dtype='object')