In [1]:
import pandas as pd
import os
import numpy as np
from pathlib import Path

from category_inference.utils import get_HF_frame, get_GPT4_frame, get_frame

## 1. Combine Legacy Frames (HF-inferred, ensemble)

Parser output with text (and metrics incl. inferred category)
```
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/parser_output_with_metrics.csv
```

Parser output __without__ text (and metrics incl. inferred category)
```
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/parser_metrics_only.csv
```


In [39]:
df_meta = get_frame()

Modes of GPT4 predictions
Modes of HF predictions


## 2. Add `tesseract` and `pypdf` back on

#### 2.1 Parser text + BLEU/ROUGE/CAR
```
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/parser_output.csv
```

#### 2.2 Parser text + BLEU/ROUGE/CAR + inferred categories
```
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/parser_output_with_metrics.csv
```

#### 2.3 ~Parser text +~ BLEU/ROUGE/CAR + inferred categories
```
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/parser_metrics_only.csv
```

In [16]:
# tesseract & pypdf
df_tr_n_pp = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/tesseract_and_pypdf_processed.csv', sep='|')

In [18]:
len(df_tr_n_pp), df_tr_n_pp.columns

(24845,
 Index(['Unnamed: 0', 'path', 'html', 'tesseract', 'pypdf', 'html_norm',
        'tesseract_norm', 'bleu_tesseract', 'rouge_tesseract', 'car_tesseract',
        'bleu_tesseract_norm', 'rouge_tesseract_norm', 'car_tesseract_norm'],
       dtype='object'))

In [19]:
# all others (incl. LEGACY pypdf)
df_parser_txt_n_metrics = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/parser_output_with_metrics.csv', sep='|')

In [20]:
len(df_parser_txt_n_metrics), df_parser_txt_n_metrics.columns

(23398,
 Index(['path', 'html', 'nougat', 'pymupdf', 'pypdf', 'marker', 'grobid',
        'html_norm', 'nougat_norm', 'pymupdf_norm', 'grobid_norm', 'pypdf_norm',
        'marker_norm', 'bleu_nougat', 'rouge_nougat', 'car_nougat',
        'bleu_nougat_norm', 'rouge_nougat_norm', 'car_nougat_norm',
        'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf', 'bleu_pymupdf_norm',
        'rouge_pymupdf_norm', 'car_pymupdf_norm', 'bleu_grobid', 'rouge_grobid',
        'car_grobid', 'bleu_grobid_norm', 'rouge_grobid_norm',
        'car_grobid_norm', 'bleu_pypdf', 'rouge_pypdf', 'car_pypdf',
        'bleu_pypdf_norm', 'rouge_pypdf_norm', 'car_pypdf_norm', 'bleu_marker',
        'rouge_marker', 'car_marker', 'bleu_marker_norm', 'rouge_marker_norm',
        'car_marker_norm'],
       dtype='object'))

In [21]:
# load frame of inferred categories (again)
df_categories = get_frame()

Modes of GPT4 predictions
Modes of HF predictions


In [22]:
len(df_categories), df_categories.columns

(22596, Index(['path', 'category', 'subcategory'], dtype='object'))

In [30]:
# Parser output
new_parser_output_left = df_parser_txt_n_metrics[['path', 'html', 'nougat', 'pymupdf', 'pypdf', 'marker', 'grobid',
        'html_norm', 'nougat_norm', 'pymupdf_norm', 'grobid_norm', 'pypdf_norm',
        'marker_norm', 'bleu_nougat', 'rouge_nougat', 'car_nougat',
        'bleu_nougat_norm', 'rouge_nougat_norm', 'car_nougat_norm',
        'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf', 'bleu_pymupdf_norm',
        'rouge_pymupdf_norm', 'car_pymupdf_norm', 'bleu_grobid', 'rouge_grobid',
        'car_grobid', 'bleu_grobid_norm', 'rouge_grobid_norm',
        'car_grobid_norm', 'bleu_pypdf', 'rouge_pypdf', 'car_pypdf',
        'bleu_pypdf_norm', 'rouge_pypdf_norm', 'car_pypdf_norm', 'bleu_marker',
        'rouge_marker', 'car_marker', 'bleu_marker_norm', 'rouge_marker_norm',
        'car_marker_norm']]

In [31]:
len(new_parser_output_left), len(set(new_parser_output_left['path']))

(23398, 23397)

In [28]:
new_parser_output_right = df_tr_n_pp[['path', 'tesseract', 
        'tesseract_norm', 'bleu_tesseract', 'rouge_tesseract', 'car_tesseract',
        'bleu_tesseract_norm', 'rouge_tesseract_norm', 'car_tesseract_norm']]

In [32]:
len(new_parser_output_right), len(set(new_parser_output_right['path']))

(24845, 24845)

In [33]:
df = pd.merge(left=new_parser_output_left,
         right=new_parser_output_right,
         on='path',
         how='inner')

In [36]:
df.columns

Index(['path', 'html', 'nougat', 'pymupdf', 'pypdf', 'marker', 'grobid',
       'html_norm', 'nougat_norm', 'pymupdf_norm', 'grobid_norm', 'pypdf_norm',
       'marker_norm', 'bleu_nougat', 'rouge_nougat', 'car_nougat',
       'bleu_nougat_norm', 'rouge_nougat_norm', 'car_nougat_norm',
       'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf', 'bleu_pymupdf_norm',
       'rouge_pymupdf_norm', 'car_pymupdf_norm', 'bleu_grobid', 'rouge_grobid',
       'car_grobid', 'bleu_grobid_norm', 'rouge_grobid_norm',
       'car_grobid_norm', 'bleu_pypdf', 'rouge_pypdf', 'car_pypdf',
       'bleu_pypdf_norm', 'rouge_pypdf_norm', 'car_pypdf_norm', 'bleu_marker',
       'rouge_marker', 'car_marker', 'bleu_marker_norm', 'rouge_marker_norm',
       'car_marker_norm', 'tesseract', 'tesseract_norm', 'bleu_tesseract',
       'rouge_tesseract', 'car_tesseract', 'bleu_tesseract_norm',
       'rouge_tesseract_norm', 'car_tesseract_norm'],
      dtype='object')

In [38]:
df = df[['path', 'html', 'nougat', 'pymupdf', 'pypdf', 'marker', 'grobid', 'tesseract',
       'html_norm', 'nougat_norm', 'pymupdf_norm', 'grobid_norm', 'pypdf_norm',
       'marker_norm', 'tesseract_norm', 'bleu_nougat', 'rouge_nougat', 'car_nougat',
       'bleu_nougat_norm', 'rouge_nougat_norm', 'car_nougat_norm',
       'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf', 'bleu_pymupdf_norm',
       'rouge_pymupdf_norm', 'car_pymupdf_norm', 'bleu_grobid', 'rouge_grobid',
       'car_grobid', 'bleu_grobid_norm', 'rouge_grobid_norm',
       'car_grobid_norm', 'bleu_pypdf', 'rouge_pypdf', 'car_pypdf',
       'bleu_pypdf_norm', 'rouge_pypdf_norm', 'car_pypdf_norm', 'bleu_marker',
       'rouge_marker', 'car_marker', 'bleu_marker_norm', 'rouge_marker_norm',
       'car_marker_norm', 'bleu_tesseract',
       'rouge_tesseract', 'car_tesseract', 'bleu_tesseract_norm',
       'rouge_tesseract_norm', 'car_tesseract_norm']]

In [40]:
df_meta.columns

Index(['path', 'category', 'subcategory'], dtype='object')

In [41]:
df_m = pd.merge(left=df, right=df_meta, on='path', how='left')

In [44]:
df_m.columns, len(df_m)

(Index(['path', 'html', 'nougat', 'pymupdf', 'pypdf', 'marker', 'grobid',
        'tesseract', 'html_norm', 'nougat_norm', 'pymupdf_norm', 'grobid_norm',
        'pypdf_norm', 'marker_norm', 'tesseract_norm', 'bleu_nougat',
        'rouge_nougat', 'car_nougat', 'bleu_nougat_norm', 'rouge_nougat_norm',
        'car_nougat_norm', 'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf',
        'bleu_pymupdf_norm', 'rouge_pymupdf_norm', 'car_pymupdf_norm',
        'bleu_grobid', 'rouge_grobid', 'car_grobid', 'bleu_grobid_norm',
        'rouge_grobid_norm', 'car_grobid_norm', 'bleu_pypdf', 'rouge_pypdf',
        'car_pypdf', 'bleu_pypdf_norm', 'rouge_pypdf_norm', 'car_pypdf_norm',
        'bleu_marker', 'rouge_marker', 'car_marker', 'bleu_marker_norm',
        'rouge_marker_norm', 'car_marker_norm', 'bleu_tesseract',
        'rouge_tesseract', 'car_tesseract', 'bleu_tesseract_norm',
        'rouge_tesseract_norm', 'car_tesseract_norm', 'category',
        'subcategory'],
       dtype='object'),
 23

In [45]:
df_m.to_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/parser_output_with_metrics.csv', sep='|', index=None)

In [46]:
df_reduced = df_m[['path', 'bleu_nougat',
        'rouge_nougat', 'car_nougat', 'bleu_nougat_norm', 'rouge_nougat_norm',
        'car_nougat_norm', 'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf',
        'bleu_pymupdf_norm', 'rouge_pymupdf_norm', 'car_pymupdf_norm',
        'bleu_grobid', 'rouge_grobid', 'car_grobid', 'bleu_grobid_norm',
        'rouge_grobid_norm', 'car_grobid_norm', 'bleu_pypdf', 'rouge_pypdf',
        'car_pypdf', 'bleu_pypdf_norm', 'rouge_pypdf_norm', 'car_pypdf_norm',
        'bleu_marker', 'rouge_marker', 'car_marker', 'bleu_marker_norm',
        'rouge_marker_norm', 'car_marker_norm', 'bleu_tesseract',
        'rouge_tesseract', 'car_tesseract', 'bleu_tesseract_norm',
        'rouge_tesseract_norm', 'car_tesseract_norm', 'category',
        'subcategory']]

In [47]:
%%time
df_reduced.to_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_TABLES/parser_metrics_only.csv', sep='|', index=None)

CPU times: user 1.1 s, sys: 28.9 ms, total: 1.13 s
Wall time: 1.14 s


In [48]:
df_reduced

Unnamed: 0,path,bleu_nougat,rouge_nougat,car_nougat,bleu_nougat_norm,rouge_nougat_norm,car_nougat_norm,bleu_pymupdf,rouge_pymupdf,car_pymupdf,...,rouge_marker_norm,car_marker_norm,bleu_tesseract,rouge_tesseract,car_tesseract,bleu_tesseract_norm,rouge_tesseract_norm,car_tesseract_norm,category,subcategory
0,arxiv/pdf/2207.11282v4.pdf,0.636972,0.854319,0.832541,0.809642,0.844739,0.862099,0.664585,0.825875,0.808554,...,0.844431,0.855809,0.690081,0.815461,0.824984,0.736467,0.813274,0.846667,Biology,Neuroscience
1,arxiv/pdf/2303.02697v2.pdf,0.615916,0.837914,0.770100,0.765567,0.864219,0.805766,0.668787,0.823477,0.832109,...,0.871955,0.859662,0.587919,0.758316,0.803632,0.633538,0.752983,0.836068,Biology,Cell Biology
2,arxiv/pdf/2306.11599v2.pdf,0.227068,0.689448,0.629003,0.253761,0.565642,0.834706,0.047291,0.392906,0.466585,...,0.662088,0.884416,0.373645,0.605593,0.757298,0.321643,0.579352,0.811447,Economics,Econometrics
3,arxiv/pdf/2306.11872v2.pdf,0.469480,0.696923,0.706822,0.520154,0.722590,0.784791,0.310934,0.597425,0.622906,...,0.745756,0.788628,0.542368,0.686400,0.730570,0.588421,0.679326,0.771728,Economics,Environmental Engineering
4,arxiv/pdf/2307.00277v1.pdf,0.456127,0.709995,0.697306,0.616335,0.691617,0.762646,0.147958,0.472825,0.517323,...,0.765158,0.831693,0.600944,0.700503,0.774227,0.641645,0.699796,0.810549,Engineering,Chemical Engineering
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23393,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_8_2_9_-_8...,0.000017,0.000395,0.001336,0.000025,0.000000,0.001408,0.000016,0.000298,0.001025,...,0.000000,0.001080,0.000017,0.000309,0.001069,0.000022,0.000164,0.001085,,
23394,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_8_3_7_-_8...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,
23395,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_8_4_9_-_4...,0.003677,0.014717,0.036544,0.004048,0.015212,0.039450,0.003165,0.011316,0.027598,...,0.011543,0.029316,0.002714,0.010468,0.026787,0.002511,0.010559,0.027892,,
23396,nature/pdf/s_4_1_5_8_6_-_0_2_4_-_0_7_8_6_0_-_9...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,


In [50]:
df_reduced.iloc[21396]

path                    medrxiv/pdf/10.1101_2024.01.18.24301434.pdf
bleu_nougat                                                0.472665
rouge_nougat                                               0.661425
car_nougat                                                 0.734049
bleu_nougat_norm                                           0.606234
rouge_nougat_norm                                          0.661212
car_nougat_norm                                            0.766668
bleu_pymupdf                                               0.487486
rouge_pymupdf                                              0.684468
car_pymupdf                                                0.677448
bleu_pymupdf_norm                                          0.516943
rouge_pymupdf_norm                                         0.681184
car_pymupdf_norm                                           0.700492
bleu_grobid                                                0.113127
rouge_grobid                                    