In [None]:
from PIL import Image
import pytesseract
import pymupdf
from PIL import Image
import io
import os
import sys
from pathlib import Path
import random
import pandas as pd
from matplotlib import pyplot as plt

# add path
sys.path.append(str(Path('..').resolve()))

from table_utils import get_frames_of_choices_raw
from score_utils import calculate_bleu, calculate_rouge, calculate_car, calculate_meteor

In [None]:
# get test
# The helper returns three frames; only the second one is kept as `df_test`.
_, df_test, _ = get_frames_of_choices_raw()

# Root directory against which the relative PDF paths in df_test['path'] are resolved.
# It can be overridden through the AURORA_GPT_ROOT environment variable so the notebook
# is not tied to one cluster; the default is the original hard-coded location.
p_root = Path(os.environ.get('AURORA_GPT_ROOT',
                             '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint'))
# Directory that receives the rendered page images.
p_img_dst = Path('./images')

In [None]:
%%time 

# Render one page per row of `df_test` to a PNG at a randomly chosen DPI and collect,
# for every page that could be rendered, the image path, ground-truth text, page index
# and the DPI that was used.

# PDFs
pdf_path_list = list(df_test['path'])
pdf_page_list = list(df_test['page'])
html_groundtruth_list = list(df_test['html'])

# tesseract OCR
effective_html_text_list = []
effective_page_list = []
image_file_name_list = []
tesseract_list = []  # stays empty in this cell: in-loop OCR was disabled (original note: "does not work on Unix")
dpi_sizes = []

# resolutions a page may be rendered at; the drawn value is stored per row in `dpi`
dpi_choices = [70, 90, 110, 130, 150]

# `image.save` does not create missing directories, so make sure the target exists
p_img_dst.mkdir(parents=True, exist_ok=True)

# loop elements
for pdf_file_name, page_idx, html_text in zip(pdf_path_list, pdf_page_list, html_groundtruth_list):
    # path
    pdf_path = p_root / pdf_file_name
    if not pdf_path.is_file():
        # rows whose PDF is not on disk are skipped, which is why the
        # "effective_*" lists can be shorter than df_test
        continue

    rnd_dpi = random.choice(dpi_choices)

    # the context manager closes the document again; previously every opened
    # PDF stayed open for the rest of the loop
    with pymupdf.open(pdf_path) as doc:
        pixmap = doc[page_idx].get_pixmap(dpi=rnd_dpi)
        page_img = pixmap.tobytes()

    # image name: <pdf stem>_<page index>.png
    img_path = str(p_img_dst / f"{Path(pdf_file_name).stem}_{page_idx}.png")

    # store
    with Image.open(io.BytesIO(page_img)) as image:
        image.save(img_path)

    # append lists
    image_file_name_list.append(img_path)
    effective_html_text_list.append(html_text)
    effective_page_list.append(page_idx)
    dpi_sizes.append(rnd_dpi)

# store
df = pd.DataFrame({'image' : image_file_name_list,
                   'html' : effective_html_text_list,
                   'page' : effective_page_list,
                   'dpi' : dpi_sizes})

In [None]:
# Load the pipe-separated input table of the DPI 70-150 run (going by the file name).
# Note that the following cell assigns `df_in` again from a different CSV.
dpi_sweep_csv = 'input_single_images_dpi_70_to_150.csv'
df_in = pd.read_csv(dpi_sweep_csv, sep='|')

In [None]:
# Run Tesseract OCR on every image listed in the input table and attach the
# recognised text as column `tesseract`.
df_in = pd.read_csv('input_single_images.csv', sep='|')

outputtext_list = []
failed_images = []  # (image path, error) for every image that could not be processed
for image_path in df_in['image']:
    try:
        # the context manager releases the file handle after OCR
        with Image.open(image_path) as image:
            outputtext = pytesseract.image_to_string(image)
    except Exception as err:
        # best effort: keep the '-' placeholder so rows stay aligned, but remember
        # the failure instead of hiding it (and let KeyboardInterrupt propagate)
        outputtext = '-'
        failed_images.append((image_path, repr(err)))
    # append
    outputtext_list.append(outputtext)

if failed_images:
    print(f"OCR failed for {len(failed_images)} of {len(df_in)} images; "
          f"first failure: {failed_images[0]}")

df_in['tesseract'] = outputtext_list
df_out = df_in.copy()
# Persisting is left disabled as in the original; the next section reads
# 'output_single_images.csv' from disk.
#df_out.to_csv('output_single_images.csv')

## Load the table with `tesseract` text and process it

Load scoring methods similar to `../statistical_tasks/datatable.py` for an apples-to-apples comparison with the BLEU score later on.

Compute `bleu` (together with the METEOR, ROUGE, and CAR scores) and append the results to the table, which is later merged back onto `df_test`.

In [22]:
# Read the stored OCR results and keep only the columns used downstream.
ocr_columns = ['image', 'html', 'page', 'dpi', 'tesseract']
df_out = pd.read_csv('output_single_images.csv').loc[:, ocr_columns]

In [23]:
# tesseract output: preview the first rows of the OCR result table
df_out.head()

Unnamed: 0,image,html,page,dpi,tesseract
0,images/10.1186_s13705-019-0220-5_1.png,and consider the involvement of multiple stak...,1,60,Sy ial nage cn gurl ack eel fe eon\nrid (he te...
1,images/10.1186_s13326-016-0054-4_1.png,ipher from detailed model annotations. For bot...,1,60,"reyone), need be cnpuaonaly exe fom\n‘onleraia..."
2,images/10.1186_s13326-016-0062-4_1.png,accination informed consent forms before vacci...,1,60,Dieses ad as\n\nto ccatt re tg» Sn\n‘ecaten mn...
3,images/10.1186_s13036-019-0211-2_1.png,", increasing the information capacity over the...",1,60,og al elagelnney me\n\nsoca [7-0 promoting te ...
4,images/10.1186_s13705-018-0146-3_0.png,This article reviews the use of carbon capture...,0,60,"anna oy eset BH Energy, Sustainability\n‘and S..."


In [24]:
%%time 

# Score the Tesseract text against the ground-truth `html` text with BLEU, METEOR,
# ROUGE and CAR; each metric lands in its own `<metric>_tesseract` column.
# Both inputs are passed through str() before being handed to the scorer.
scorers = {
    'bleu_tesseract': calculate_bleu,
    'meteor_tesseract': calculate_meteor,
    'rouge_tesseract': calculate_rouge,
    'car_tesseract': calculate_car,
}
for score_column, scorer in scorers.items():
    df_out[score_column] = df_out.apply(
        # bind the scorer as a default argument so each pass uses its own metric
        lambda row, score=scorer: score(str(row['html']), str(row['tesseract'])),
        axis=1,
    )


CPU times: user 59 s, sys: 227 ms, total: 59.2 s
Wall time: 59.6 s


In [25]:
# Preview the table including the `*_tesseract` score columns
df_out.head()

Unnamed: 0,image,html,page,dpi,tesseract,bleu_tesseract,meteor_tesseract,rouge_tesseract,car_tesseract
0,images/10.1186_s13705-019-0220-5_1.png,and consider the involvement of multiple stak...,1,60,Sy ial nage cn gurl ack eel fe eon\nrid (he te...,0.003425,0.065668,0.01092,0.34342
1,images/10.1186_s13326-016-0054-4_1.png,ipher from detailed model annotations. For bot...,1,60,"reyone), need be cnpuaonaly exe fom\n‘onleraia...",0.000994,0.055675,0.004773,0.29497
2,images/10.1186_s13326-016-0062-4_1.png,accination informed consent forms before vacci...,1,60,Dieses ad as\n\nto ccatt re tg» Sn\n‘ecaten mn...,0.000902,0.037552,0.0,0.246326
3,images/10.1186_s13036-019-0211-2_1.png,", increasing the information capacity over the...",1,60,og al elagelnney me\n\nsoca [7-0 promoting te ...,0.00414,0.056885,0.024427,0.351302
4,images/10.1186_s13705-018-0146-3_0.png,This article reviews the use of carbon capture...,0,60,"anna oy eset BH Energy, Sustainability\n‘and S...",0.002973,0.065323,0.016032,0.36889


In [27]:
# reload `df_test` and merge onto it
# (the helper returns three objects; only the second one is kept)
_, df_test, _ = get_frames_of_choices_raw()

  meta_split = yaml.safe_load(file)


In [29]:
# NOTE(review): `set.intersection()` called without an argument only returns a copy of
# the set, so this cell yields the unique `path` values of `df_test`. The second operand
# (presumably the paths derived from `df_out`) appears to be missing — confirm the intent.
set(df_test['path']).intersection()

177038     bmc/pdf/10.1186_s13705-019-0220-5.pdf
173657     bmc/pdf/10.1186_s13326-016-0054-4.pdf
173695     bmc/pdf/10.1186_s13326-016-0062-4.pdf
168519     bmc/pdf/10.1186_s13036-019-0211-2.pdf
176520     bmc/pdf/10.1186_s13705-018-0146-3.pdf
                           ...                  
174208     bmc/pdf/10.1186_s13326-017-0133-1.pdf
32143                 arxiv/pdf/2407.21187v1.pdf
178223    bmc/pdf/10.1186_s13705-022-00357-1.pdf
176915     bmc/pdf/10.1186_s13705-019-0198-z.pdf
176326     bmc/pdf/10.1186_s13705-017-0113-4.pdf
Name: path, Length: 1000, dtype: object

In [91]:
# homogenize paths (vary slightly as df_out entails page)
# Image files are named "<pdf stem>_<page>.png" and PDFs "<pdf stem>.pdf"; reduce both
# to the bare PDF stem so that (pdf_file_name, page) can serve as the merge key.
image_stem = (
    df_out['image']
    .str.split('/').str[-1]
    # anchored, escaped pattern: removes only a trailing ".png" regardless of the
    # pandas default for `regex`
    .str.replace(r'\.png$', '', regex=True)
)
# Cut at the LAST underscore instead of dropping two characters, so that page
# indices with more than one digit (e.g. "_12") are stripped completely.
df_out['pdf_file_name'] = image_stem.str.rsplit('_', n=1).str[0]
df_test['pdf_file_name'] = (
    df_test['path']
    .str.split('/').str[-1]
    .str.replace(r'\.pdf$', '', regex=True)
)

# Left merge keeps every row of df_test; rows without an OCR result get NaN scores.
# `html` and `image` are dropped from df_out so they are not carried into the result.
df_new_test = df_test.merge(df_out.drop(columns=['html', 'image']), how='left', on=['pdf_file_name', 'page'])

# store merged frame
#df_new_test.to_csv('./tables/df_test_w_tesseract.csv', sep='|', index=None)

In [99]:
# Pairwise correlation between the rendering resolution (`dpi`) and the BLEU score
# of the Tesseract output
df_new_test[['dpi', 'bleu_tesseract']].corr()

Unnamed: 0,dpi,bleu_tesseract
dpi,1.0,0.950727
bleu_tesseract,0.950727,1.0
