In [1]:
import pandas as pd
from pathlib import Path
import os

# Script merges the processed data tables (i.e. `raw` data table + inferred metrics) and assigns the predicted scientific category

## 1. `meta_raw` Raw Meta table
The string-comparison metrics (BLEU, ROUGE, CAR and, in particular, METEOR) are compute-intensive, so their computation is split across machines. This section merges the resulting chunks back into a single DataFrame.

```
p / parser_output_with_metrics.csv
```
This file stores the merged table (`p` is the database directory).

In [2]:
# Directory that holds the raw parser output and the per-machine metric chunks.
p = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database')

# Chunk files: name starts with 'parser_output_proc_' and ends with '4.csv'.
# Sorting gives a stable (lexicographic) concatenation order.
# NOTE(review): the '4.csv' suffix looks like a run/version filter - confirm.
tables = sorted(
    p / name
    for name in os.listdir(p)
    if name.startswith('parser_output_proc_') and name.endswith('4.csv')
)

# Read every chunk (pipe-separated) and discard the 'Unnamed: 0' column
# whenever a chunk carries one.
df_list = [
    pd.read_csv(chunk, sep='|').drop(columns=['Unnamed: 0'], errors='ignore')
    for chunk in tables
]

# Stack all chunks into one frame with a fresh 0..n-1 row index.
df_proc_merged = pd.concat(df_list, ignore_index=True)

# Reference table; its `path` column defines the target row order.
df_raw_ref = pd.read_csv(p / 'parser_output_raw.csv', sep='|')

# Reorder the merged chunks to follow `df_raw_ref['path']`: index by `path`,
# pick the rows label-by-label in reference order, then turn `path` back into
# a regular column.
df_proc_merged_sorted = (
    df_proc_merged
    .set_index('path')
    .loc[df_raw_ref['path']]
    .reset_index()
)


In [3]:
# Inspect the last rows of the merged, reference-ordered metrics frame.
# The call is left disabled.
# NOTE(review): the output shown below presumably stems from an earlier run
# with this line enabled - confirm.
#df_proc_merged_sorted.tail()

Unnamed: 0,path,html,nougat,pymupdf,pypdf,marker,grobid,html_norm,nougat_norm,pymupdf_norm,...,car_pypdf,bleu_pypdf_norm,rouge_pypdf_norm,car_pypdf_norm,bleu_marker,rouge_marker,car_marker,bleu_marker_norm,rouge_marker_norm,car_marker_norm
23393,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_8_2_9_-_8...,This file contains Supplementary Tables 1-11.,[MISSING_PAGE_FAIL:1]\n\n[MISSING_PAGE_FAIL:2]...,Methyl-reducing methanogenesis by a \nthermoph...,,Accelerated Article Preview\n\n# Methyl-Reduci...,This is a PDF file of a peer-reviewed paper th...,this file contains supplementary tables 111,missingpagefail1 missingpagefail2 and the coms...,methylreducing methanogenesis by a thermophili...,...,0.0,0.0,0.0,0.0,1.6e-05,0.000301,0.001023,2.2e-05,0.0,0.00108
23394,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_8_3_7_-_8...,,"## References\n\n* [1] E. C. Matthews, A. L. C...",A temperate super-Jupiter imaged with JWST \ni...,,,This is a PDF file of a peer-reviewed paper th...,,references 1 e c matthews a l carter p pathak ...,a temperate superjupiter imaged with jwst in t...,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
23395,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_8_4_9_-_4...,This file contains Supplementary discussion th...,## Parture\n\n### Accelerated ArticlePreview\n...,Spillover of highly pathogenic avian \ninfluen...,,# Accelerated Article Preview Spillover Of Hig...,This is a PDF file of a peer-reviewed paper th...,this file contains supplementary discussion th...,parture accelerated articlepreview spillover o...,spillover of highly pathogenic avian influenza...,...,0.0,0.0,0.0,0.0,0.002704,0.011327,0.021765,0.002623,0.011543,0.029316
23396,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_8_6_0_-_9...,,[MISSING_PAGE_EMPTY:52]\n\nSpectroscopic confi...,Spectroscopic confirmation of two luminous \ng...,,# Accelerated Article Preview Spectroscopic Co...,This is a PDF file of a peer-reviewed paper th...,,missingpageempty52 spectroscopic confirmation ...,spectroscopic confirmation of two luminous gal...,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
23397,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_8_7_4_-_3...,This file contains Supplementary Tables 1-3 an...,"## References\n\n* [1] Nathaniet Burman, Svett...",A virally-encoded tRNA neutralizes the \nPARIS...,,# Accelerated Article Preview A Virally-Encode...,This is a PDF file of a peer-reviewed paper th...,this file contains supplementary tables 13 and...,references 1 nathaniet burman svettana belukhi...,a virallyencoded trna neutralizes the paris an...,...,0.0,0.0,0.0,0.0,0.002182,0.00866,0.014244,0.00215,0.008421,0.016164


In [4]:
# Persist the merged metrics table as a pipe-separated CSV without the row
# index (currently disabled). Section 3 of this notebook reads
# `parser_output_with_metrics.csv` back in, so that file must already exist
# on disk while this line stays commented out.
#df_proc_merged_sorted.to_csv(p / 'parser_output_with_metrics.csv', sep='|', index=False)

## 2. `proc_meta`: Merge predicted scientific categories
Infer the scientific domain to allow more fine-grained statements on parser quality.

```
p / meta_proc_table.csv
```

contains all the information `meta_raw_table.csv` has but additionally the inferred scientific (sub-)category.

In [8]:
# Raw meta table (pipe-separated); `path` is the key used for the merge with
# the predicted categories.
df_meta_raw = pd.read_csv(
    '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/meta_raw_table.csv',
    sep='|',
)

In [16]:
# Directory with the per-chunk category predictions.
# NOTE: this rebinds `p`, which pointed at the database directory before.
p = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/predicted_categories/')

# Prediction chunks: 'df_pred_*.csv', in lexicographic order.
tables = sorted(
    p / name
    for name in os.listdir(p)
    if name.startswith('df_pred_') and name.endswith('.csv')
)

# Read every chunk (comma-separated) and discard the 'Unnamed: 0' column
# whenever a chunk carries one.
df_list = [
    pd.read_csv(chunk, sep=',').drop(columns=['Unnamed: 0'], errors='ignore')
    for chunk in tables
]

# Stack all chunks into one frame with a fresh 0..n-1 row index.
df_pred_merged = pd.concat(df_list, ignore_index=True)

# Drop the `predicted_` prefix from the two category columns.
df_pred_merged = df_pred_merged.rename(
    columns={
        'predicted_category': 'category',
        'predicted_subcategory': 'subcategory',
    }
)

# Left join on `path`: every row of the raw meta table is kept; rows without
# a prediction get missing values in `category` / `subcategory`.
df_merged = df_meta_raw.merge(
    df_pred_merged[['path', 'category', 'subcategory']],
    on='path',
    how='left',
)

# Persist the processed meta table (currently disabled).
#df_merged.to_csv('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/meta_proc_table.csv', sep='|', index=None)

## 3. Assemble Analytics Table
Table that allows relating PDF meta data and inferred metrics (BLEU, etc.).
Merge `meta` and `metrics` table into a joint performance table.

Merges `meta_proc_table.csv` (for detailed meta information incl. scientific category) and analytics from `parser_output_with_metrics.csv` (ROUGE, etc.)

In [22]:
# Processed meta table (meta information plus the inferred category).
df_meta = pd.read_csv(
    '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/meta_proc_table.csv',
    sep='|',
)

# Parser output together with the computed metrics.
df_metrics = pd.read_csv(
    '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/parser_output_with_metrics.csv',
    sep='|',
)

In [39]:
# Inner join on `path`: only documents present in BOTH the meta table and the
# metrics table end up in the analytics table.
df_analytics = df_meta.merge(df_metrics, on='path', how='inner')

In [42]:
# Persist the joint analytics table as a pipe-separated CSV (currently disabled).
# NOTE(review): unlike the other (disabled) store calls in this notebook, this
# one passes neither `index=False` nor `index=None`, so re-enabling it as-is
# would also write the row index as an extra leading column - consider adding
# `index=False`.
#df_analytics.to_csv('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/parser_meta_and_metrics.csv', sep='|')