In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os 
import json
import re
from collections import defaultdict
import processing_pdf
from gpt_summary import get_summaries

Define the input and output paths.

In [3]:
# path to the folder containing PDF files
dataset_path = "../../PDF-Datasets-01/"
# output folder and file name
output_path = "dataset"
output_file_name = "dataset.json"
# Hand the output folder to processing_pdf.clear_processed_folder before any
# processing starts. NOTE(review): presumably this empties results from an
# earlier run; confirm against processing_pdf.
processing_pdf.clear_processed_folder(output_path)

Step 0: Remove duplicate files, keeping only the newest version of each paper.

In [55]:
# Function to extract the base filename without the version suffix
def get_base_filename(filename):
    """Return ``filename`` with a trailing ``v<N>.pdf`` version suffix removed.

    Args:
        filename: A bare file name such as ``"1901.00009v2.pdf"``.

    Returns:
        The part of the name before the version suffix (``"1901.00009"`` for
        the example above), or ``filename`` unchanged when it does not end in
        ``v<digits>.pdf``.
    """
    # One or more digits, so multi-digit versions (v10, v11, ...) share a base
    # name with their single-digit siblings; the extension must be exactly ".pdf".
    match = re.match(r'^(.*?)v\d+\.pdf$', filename)
    return match.group(1) if match else filename

# Function to find the newest version of each file
def find_newest_versions(folder):
    """Pick the highest-versioned file name for every base name in ``folder``.

    Directory entries are grouped by ``get_base_filename``; within each group
    the entry with the largest ``v<N>.pdf`` version number wins.

    Args:
        folder: Path of the directory to scan (non-recursive).

    Returns:
        A list with one file name (not a full path) per group. Entries that
        carry no ``v<N>.pdf`` suffix are returned as-is when they are alone in
        their group.
    """
    version_pattern = re.compile(r'v(\d+)\.pdf$')

    def version_of(filename):
        # Names without a version suffix (e.g. "paper.pdf", ".DS_Store") rank
        # below every real version instead of raising on a failed match.
        found = version_pattern.search(filename)
        return int(found.group(1)) if found else -1

    # Group files by base filename
    files_by_base_name = defaultdict(list)
    for filename in os.listdir(folder):
        files_by_base_name[get_base_filename(filename)].append(filename)

    # Keep the newest version from each group
    return [max(filenames, key=version_of) for filenames in files_by_base_name.values()]

# Function to remove duplicated files
def remove_duplicates(folder):
    """Delete every regular file in ``folder`` that is not a newest version.

    The set of files to keep comes from ``find_newest_versions``. This is
    destructive: files are removed with ``os.remove``.

    Args:
        folder: Path of the directory to clean up (non-recursive).
    """
    # A set keeps each membership test O(1), so the scan stays linear in the
    # number of directory entries.
    newest_versions = set(find_newest_versions(folder))

    for filename in os.listdir(folder):
        if filename in newest_versions:
            continue
        file_path = os.path.join(folder, filename)
        # Sub-directories and other non-file entries are left untouched.
        if os.path.isfile(file_path):
            os.remove(file_path)

# NOTE: destructive step. remove_duplicates deletes files from dataset_path
# with os.remove, so older versions of a paper are permanently removed.
remove_duplicates(dataset_path)

Step 1: Break down PDF content into sections and subsections.

In [4]:
def process_one_pdf(file_path):
    """Split one PDF into sections driven by its table of contents.

    Opens the file through ``processing_pdf.open_file``, reads the document's
    table of contents with ``doc.get_toc()`` (falling back to
    ``processing_pdf.auto_find_toc`` when that is empty), displays it, and
    passes it together with the extracted text to
    ``processing_pdf.separate_content``.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The second value returned by ``processing_pdf.separate_content``;
        callers use it as a dict of sections.
    """
    doc, total_text, _ = processing_pdf.open_file(file_path)
    toc = doc.get_toc()

    if len(toc) > 0:
        print("Auto generated table of content:")
    else:
        # Some papers have no table of contents; derive one from the document.
        print("The paper has not table of content. Need to use regular expression to map table of content.")
        toc = processing_pdf.auto_find_toc(doc)

    display(toc)

    # Separate the content into sections; only the second result is needed.
    _, sections = processing_pdf.separate_content(total_text, toc)
    return sections

In [5]:
# number of PDFs used for training data preparation
dataset_limit = 100

file_count = 0
all_json_dicts = []
# Loop through files in the folder. os.listdir returns entries in arbitrary
# order, so sort them to pick the same PDFs on every run when the folder holds
# more than dataset_limit files.
for file_name in sorted(os.listdir(dataset_path)):
    file_path = os.path.join(dataset_path, file_name)
    # Skip sub-folders and anything without a ".pdf" extension
    if not (os.path.isfile(file_path) and file_name.endswith('.pdf')):
        continue

    # Process the PDF file
    print("Processing PDF file:", file_name)
    json_dict = process_one_pdf(file_path)
    all_json_dicts.extend(json_dict.values())
    print("Done with PDF file:", file_name)
    print("# of sections:", len(json_dict))
    print("Total # of sections:", len(all_json_dicts))
    print(60*"=")

    # Terminate when reaching dataset limit
    file_count += 1
    if file_count >= dataset_limit:
        break

# The return value of get_summaries is not used and the same list is
# serialised below. NOTE(review): presumably it adds the summaries to the
# section dicts in place; confirm against gpt_summary.
get_summaries(all_json_dicts)

# Write to the output location configured earlier in the notebook
# (output_path / output_file_name) so the two cannot drift apart.
full_name = os.path.join(output_path, output_file_name)
with open(full_name, "w") as jsonfile:
    json.dump(all_json_dicts, jsonfile)

Processing PDF file: 1901.00009v2.pdf
Auto generated table of content:


[[1, '1 Introduction', 1],
 [1, '2 Observations and Data reduction', 2],
 [1, '3 Variability Analysis', 3],
 [2, '3.1 Cross-matches to external catalogs', 3],
 [2, '3.2 Variability cutoffs', 4],
 [2, '3.3 Variability Classification', 4],
 [2, '3.4 Blending Corrections', 4],
 [1, '4 Results', 5],
 [1, '5 Conclusions', 7]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00009v2.pdf
# of sections: 6
Total # of sections: 6


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

Step 2: Generate a GPT summary for each section and subsection to use as ground truth.