In [1]:
from pathlib import Path
import pandas as pd

from pagewise_utils import get_list_of_pdf_page_lists, read_out_parser_output, end_of_path, assemble_dataframe
from pagewise_utils import get_unique_pdf_paths_from_data_list_dict, get_text_by_path, partition_fulltext_by_pagelist

## This script consumes parsed PDFs from parsers (Marker/Nougat) that don't split content by page, splits the text into pages manually, and stores the result
#### Goal
- provide pagewise dataset for statistical tasks (e.g. given a single page, predict accuracy)
- "pumps up" dataset from $N=23,395$ documents to $n=284,470$ (well-defined) pages
- the script splits the text page-wise and stores it in `.../database/pagewise` in the same format as documents are stored in `.../database`, so that subsequent scripts (BLEU computation etc.) can be applied!

In [2]:
def assemble_pagewise_raw_parser_output(i:int,
                                        parsers:list[str]=['html','nougat', 'pymupdf', 'pypdf', 'marker', 'grobid'],
                                        store_dir:Path=Path('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise')):
    """
    Generates the page-wise variant of `parser_output_raw.csv` for one chunk and stores it as CSV.

    For every PDF path that has a PyMuPDF page list, each parser's full text is split into
    pages with `partition_fulltext_by_pagelist`; the `pymupdf` pages are taken directly from the
    page list (not from a secondary split). The resulting long-format rows
    (`path`, `page`, `text`, `parser`) are handed to `assemble_dataframe`, and the returned frame
    is written to `store_dir / pagewise_parser_output_raw_{i}_5.csv` with `|` as separator.

    Args:
    - i: core/chunk identifier; must be one of -1, 0, 1, 2, 3, 4. Forwarded to
         `get_list_of_pdf_page_lists` and `assemble_dataframe`.
    - parsers: parser names to include. The default list is only read, never mutated.
    - store_dir: existing directory the CSV is written into.

    Returns:
    - None (the result is only written to disk).

    Raises:
    - AssertionError: if `i` is not a valid chunk identifier or `store_dir` is not a directory.
    """

    valid_chunks = {-1,0,1,2,3,4}
    assert i in valid_chunks, f"Invalid chunk identifier i={i}. Expected one of {sorted(valid_chunks)}"
    store_dir = Path(store_dir)
    assert store_dir.is_dir(), f"Store_dir does not exist. Invalid path: {store_dir}"

    # load PDF pagelist chunk (PyMuPDF)
    dict_of_page_lists = get_list_of_pdf_page_lists(i=i)

    # read every parser's output for the PDFs of this chunk
    data_list_dict = {
        parser: read_out_parser_output(paths=dict_of_page_lists.keys(), parser=parser)
        for parser in parsers
    }

    # paths in the style of `arxiv/pdf/2207.11282v4.pdf`
    all_paths = get_unique_pdf_paths_from_data_list_dict(data_list_dict)
    # re-key the page lists with `end_of_path` so they can be looked up by the paths in `all_paths`
    page_lists_by_path = {end_of_path(k): v for k, v in dict_of_page_lists.items()}

    all_rows = []
    for p in all_paths:
        # the PyMuPDF reference is required for every parser, so check it once per path
        if p not in page_lists_by_path:
            continue
        page_list = page_lists_by_path[p]

        for parser in parsers:
            if parser == 'pymupdf':
                # grab from source (not the secondary parsing); no lookup/split needed
                page_items = enumerate(page_list)
            else:
                # load parser's full text (missing text is treated as empty)
                parser_fulltext = get_text_by_path(p, data_list_dict[parser])
                if parser_fulltext is None:
                    parser_fulltext = ''

                if len(parser_fulltext) > 0:
                    page_items = partition_fulltext_by_pagelist(parser_fulltext, page_list).items()
                else:
                    # no text: emit one empty page per PyMuPDF page
                    page_items = ((page_idx, '') for page_idx in range(len(page_list)))

            all_rows.extend(
                {'path' : p, 'page' : page_idx, 'text' : page_text, 'parser' : parser}
                for page_idx, page_text in page_items
            )

    df = assemble_dataframe(all_rows, i, store_dir)

    # store
    df.to_csv(store_dir / f'pagewise_parser_output_raw_{i}_5.csv', sep='|', index=False)

In [3]:
%%time
# Build and store the page-wise CSV for chunk 0 only.
# NOTE(review): later cells read files for chunks 0-4, so the other chunks are presumably produced by separate runs — TODO confirm.
assemble_pagewise_raw_parser_output(i=0)

MuPDF error: syntax error: could not parse color space (254 0 R)

MuPDF error: syntax error: could not parse color space (525 0 R)

MuPDF error: syntax error: could not parse color space (215 0 R)

MuPDF error: syntax error: cannot find ExtGState resource 'GS10'

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: syntax error: could

### Goal
Create table (in parallel) with columns 
```
path, html, pymupdf, pypdf, nougat, grobid, marker
```

Do so in parallel (be inspired by `get_tables.py`, merge afterwards). Subsequently, run `get_tables.py` on that merged table (also in parallel). 

Note that `path` is (from now on) `arxiv/pdf/2207.11282v4.pdf` etc., not the entire (actual) path - we call the latter `abs_path`

In [17]:
import os

# Show which per-chunk CSVs have been written to the page-wise store so far.
pagewise_dir = '/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/'
os.listdir(pagewise_dir)

['pagewise_parser_output_raw_0_5.csv']

In [27]:
%%time
df0 = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw_0_5.csv', sep='|')
df1 = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw_1_5.csv', sep='|')
df2 = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw_2_5.csv', sep='|')
df3 = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw_3_5.csv', sep='|')
df4 = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw_4_5.csv', sep='|')



CPU times: user 47.5 s, sys: 3.71 s, total: 51.2 s
Wall time: 53.7 s


In [54]:
# Stack the per-chunk tables into a single frame with a fresh running index, then persist it.
df_list = [df0, df1, df2, df3, df4]
merged_df = pd.concat(objs=df_list, axis=0, ignore_index=True)
merged_csv_path = '/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw.csv'
merged_df.to_csv(merged_csv_path, sep='|', index=False)

In [55]:
# Keep only the pages for which PyMuPDF, Nougat and HTML text are all present.
merged_subset = merged_df.dropna(subset=['pymupdf', 'nougat', 'html'], how='any')

In [56]:
# Write the filtered subset with remaining missing values replaced by '-'.
# NOTE: the target is the same file the unfiltered `merged_df` was written to, so that file is overwritten here.
merged_subset.fillna('-').to_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw.csv', index=False, sep='|')

In [41]:
# Read the merged page-wise table back from disk.
# NOTE(review): the captured output of this cell looks like the tail of a pandas warning whose header was lost — presumably a DtypeWarning about mixed column types; TODO confirm and, if so, pass an explicit `dtype`.
loaded_merged_df = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw.csv', sep='|')



  loaded_merged_df = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw.csv', sep='|')


In [53]:
# Write the first 100 rows as a small MINI file; missing values are stored as '-'.
loaded_merged_df.iloc[:100].fillna('-').to_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/MINI_pagewise_parser_output_raw.csv', index=False, sep='|')

In [50]:
# Preview the first 100 rows with missing values shown as empty strings (display only; nothing is assigned or written).
loaded_merged_df.iloc[:100].fillna('')

Unnamed: 0,path,page,grobid,html,marker,nougat,pymupdf,pypdf
0,arxiv/pdf/1009.1049v1.pdf,0,,,# Understanding How Students Use Physical Idea...,# Understanding How Students Use Physical Idea...,Understanding How Students Use Physical Ideas ...,Understanding How Students Use Ph...
1,arxiv/pdf/1009.1049v1.pdf,1,,,nted that student views about physics knowledg...,kinds of knowledge and learning their courses...,"In physics education, researchers have documen...","iplines, (2) kinds of knowl..."
2,arxiv/pdf/1009.1049v1.pdf,2,,,to the biological system. When the students we...,o remember and call out that \(J\) is the diff...,sion when you make a given change to the biolo...,s and facilitate quantitative reasoning for in...
3,arxiv/pdf/1009.1049v1.pdf,3,,,"en 'this is t,' I can't do it. Like, it's just...","ngible, perceivable, and to put that in terms ...","thing through it, the thicker it is, obviously...","oving, underwater. Given these constraints, st..."
4,arxiv/pdf/1011.0510v3.pdf,0,,,# Theory Of Cooperation In A Micro-Organismal ...,# Theory of cooperation in a micro-organism sn...,Theory of cooperation in a micro-organismal sn...,
...,...,...,...,...,...,...,...,...
95,arxiv/pdf/1103.1791v2.pdf,15,,,and that have the capacity to observe *their o...,still be elevated even when navigating by mem...,"For example, in Fig. 4, the opening-direction ...",
96,arxiv/pdf/1103.1791v2.pdf,16,,,are encoded within the genes that specify the ...,bit in memory and integrating this informatio...,where g(i)(m) is the ith stochastic realizatio...,
97,arxiv/pdf/1103.1791v2.pdf,17,,,erated for each run so that the animats would ...,{i=1}^{10}\frac{g^{(i)}(m)}{g_{\rm opt}(m)}\ri...,Motor variables that are not read from are una...,
98,arxiv/pdf/1103.1791v2.pdf,18,,,ards towards the randomly constructed ancestra...,variables to test their effect on the Markov a...,4. Adami C (2009) Biological complexity and bi...,


In [None]:
%%time 

# df_filled = df.fillna('')
/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/MINI_pagewise_parser_output_raw.csv

# Nougat, Marker etc.
# - open jsonl (if exists)
# - fuzzy-match PyMuPDF-piece over text
# - split

In [None]:
# NOTE(review): `parser_data_list` and `paths` are not defined in the visible part of this notebook — this cell presumably relies on leftover kernel state; TODO define them first or remove the cell.
len(parser_data_list), len(paths)

In [None]:
# Function to find and copy the text efficiently using binary search


## Experimental code starts here ...

In [None]:
# Take the 'path' field of the entry at index 14 as the document to experiment with.
# NOTE(review): `parser_data_list` is not defined in the visible cells — TODO confirm where it comes from.
pdf_abs_path = parser_data_list[14]['path']

In [None]:
# search resp. fulltext
parser_fulltext = get_text_by_path(pdf_abs_path, parser_data_list)

# split
P = partition_fulltext_by_pagelist(parser_fulltext, page_list)

In [None]:
# Inspect the keys of the split result (used as page indices when `.items()` of such a result is iterated in `assemble_pagewise_raw_parser_output`).
P.keys()

In [None]:
# Show the text stored under key 15 of the split result.
P[15]

In [None]:
# Shell command: copy one source PDF into the current working directory (presumably to inspect it by hand).
!cp /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint/arxiv/pdf/1411.0940v1.pdf 1411.0940v1.pdf

In [None]:
# Peek at the first 50 characters of the 'text' field of the entry at index 5.
parser_data_list[5]['text'][:50]

In [2]:
# Reload the stored merged page-wise table from disk.
# NOTE(review): the low execution count suggests this ran in a restarted kernel, which would explain the repeated pandas import — TODO confirm.
import pandas as pd
dfff = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/pagewise/pagewise_parser_output_raw.csv', sep='|')

In [3]:
# Number of rows in the reloaded table.
len(dfff)

284470