In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to local modules such as
# processing_pdf take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os 
import json
import re
from collections import defaultdict
import processing_pdf

Define the input and output paths.

In [3]:
# path to the folder containing PDF files (1512, removed researchpaper1.pdf)
# Raw string: "\D" is not a valid escape sequence, so the non-raw literal only
# worked by accident and triggers a SyntaxWarning on recent Python versions.
# The resulting value is unchanged.
dataset_path = r"F:\Datasets/"
# output folder and file name
# output_path = "dataset"
# processing_pdf.clear_processed_folder(output_path)

Step 0: Remove duplicate files, keeping only the newest version of each paper (941 papers).

In [7]:
# Function to extract the base filename without the version suffix
def get_base_filename(filename):
    """Return ``filename`` with its trailing version suffix stripped.

    A version suffix is the letter ``v`` followed by one or more digits,
    placed immediately before the ``.pdf`` extension. For example,
    ``1901.00100v12.pdf`` yields ``1901.00100``.

    Args:
        filename: File name (not a full path) to inspect.

    Returns:
        The part of the name before the version suffix, or ``filename``
        unchanged when it does not end in ``v<digits>.pdf``.
    """
    # `\d+` accepts multi-digit versions (v10, v11, ...), and `\.pdf$` requires
    # the exact extension. This mirrors the `v(\d+)\.pdf$` pattern used when
    # comparing versions, so both functions agree on what a version is.
    match = re.match(r'^(.*?)v\d+\.pdf$', filename)
    return match.group(1) if match else filename

# Function to find the newest version of each file
def find_newest_versions(folder):
    """Pick the highest-versioned file for every paper found in ``folder``.

    Directory entries are grouped by ``get_base_filename``; within each group
    the entry with the largest ``v<digits>.pdf`` version number is selected.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        A list with one file name per group. Names without a version suffix
        rank below every versioned name, so they are only selected when their
        group contains no versioned file.
    """
    def _version_of(name):
        # Entries lacking a `v<digits>.pdf` suffix used to crash on
        # `None.group(1)`; rank them lowest instead.
        found = re.search(r'v(\d+)\.pdf$', name)
        return int(found.group(1)) if found else -1

    # Group files by base filename
    files_by_base_name = defaultdict(list)
    for filename in os.listdir(folder):
        files_by_base_name[get_base_filename(filename)].append(filename)

    # Keep the newest version from each group
    return [max(filenames, key=_version_of) for filenames in files_by_base_name.values()]

# Function to remove duplicated files
def remove_duplicates(folder):
    """Delete every regular file in ``folder`` that is not a newest version.

    The set of files to keep comes from ``find_newest_versions``. Anything
    else in the directory that is a regular file is removed with
    ``os.remove``; sub-directories are left alone.

    Args:
        folder: Path of the directory to clean up (not recursive).

    Returns:
        None. The deletion is permanent, so run this on a copy of the data
        if the older versions may still be needed.
    """
    # A set gives constant-time membership tests; the previous list lookup
    # rescanned all kept names for every directory entry.
    keep = set(find_newest_versions(folder))

    for filename in os.listdir(folder):
        if filename in keep:
            continue
        file_path = os.path.join(folder, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)

# WARNING: destructive. This permanently deletes files from dataset_path via
# os.remove, keeping only the newest version of each paper.
remove_duplicates(dataset_path)

Step 1: Break down PDF content by sections and subsections.

In [4]:
def process_one_pdf(file_path):
    """Split one PDF into sections based on its table of contents.

    The file is opened through ``processing_pdf.open_file`` and the table of
    contents is read with ``doc.get_toc()``.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        The dictionary returned as the second element of
        ``processing_pdf.separate_content``, or an empty dict when the
        document reports no table of contents.
    """
    doc, total_text, _ = processing_pdf.open_file(file_path)
    table_of_content = doc.get_toc()

    # Some papers have no table of contents; they are skipped.
    # TODO: a regex-based fallback (processing_pdf.auto_find_toc(doc)) was
    # sketched here but is currently disabled.
    if len(table_of_content) == 0:
        return {}

    print("Auto generated table of content:")
    display(table_of_content)
    # separate content into sections; only the resulting dict is needed
    _, json_dict = processing_pdf.separate_content(total_text, table_of_content)
    return json_dict

In [5]:
# number of PDFs used for training data preparation
dataset_limit = 20

file_count = 0
all_json_dicts = []
# Loop through files in the folder. os.listdir returns entries in arbitrary
# order, so sort them to select the same `dataset_limit` papers on every run.
for file_name in sorted(os.listdir(dataset_path)):
    file_path = os.path.join(dataset_path, file_name)
    # Skip anything that is not a regular file with a ".pdf" extension
    if not (os.path.isfile(file_path) and file_name.endswith('.pdf')):
        continue

    # Process the PDF file
    print("Processing PDF file:", file_name)
    json_dict = process_one_pdf(file_path)
    # An empty dict means no sections were produced; such files are not counted
    if len(json_dict) > 0:
        all_json_dicts.extend(json_dict.values())
        file_count += 1
        print("Done with PDF file:", file_name)
        print("# of sections:", len(json_dict))
        print("Total # of sections:", len(all_json_dicts), "Total # of files:", file_count)
        print(60*"=")

    # Terminate when reaching dataset limit
    if file_count >= dataset_limit:
        break

# get_summaries(all_json_dicts)
json_list = json.dumps(all_json_dicts)
full_name = "dataset/dataset_eval.json"
# open() does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs(os.path.dirname(full_name), exist_ok=True)
# Explicit encoding so the file is read back identically by the utf-8 reader
# in the next step, regardless of the platform default.
with open(full_name, "w", encoding="utf-8") as jsonfile:
    jsonfile.write(json_list)

Processing PDF file: 1901.00100v1.pdf
Processing PDF file: 1901.00101v1.pdf
Auto generated table of content:


[[1, 'I Introduction', 1],
 [1, 'II Related Work', 2],
 [1, 'III Safety-Guided RRT  via Probabilistically Safe Corridors', 2],
 [2, 'III-A Gaussian Mixture Modeling of Configuration Spaces', 2],
 [3, 'III-A.1 Learning Gaussian Mixtures', 3],
 [3, 'III-A.2 Confidence Regions of Gaussian Mixtures', 3],
 [2, 'III-B Probabilistically Safe Corridors', 4],
 [2, 'III-C Guided Steering via Safe Corridors', 5],
 [3, 'III-C.1 Tree Extension in the Configuration Space', 5],
 [3, 'III-C.2 Tree Extension in the Task Space', 6],
 [3, 'III-C.3 GMM-based Biased Sampling', 6],
 [1, 'IV Results', 6],
 [2, 'IV-A Learning Gaussian Mixture Models', 6],
 [2, 'IV-B 2DoF Planar Manipulator', 6],
 [2, 'IV-C 7DoF Manipulator in 3D Space', 8],
 [2, 'IV-D Physical Robot Experiments', 8],
 [1, 'V Discussion', 9],
 [1, 'References', 9]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00101v1.pdf
# of sections: 7
Total # of sections: 7 Total # of files: 1
Processing PDF file: 1901.00102v1.pdf
Auto generated table of content:


[[1, '1 Introduction', 2],
 [1, '2 The bremsstrahlung cross section with screening potential', 4],
 [1, '3 Comparing with the Bethe-Heitler formula', 14],
 [1, '4 Summary', 21]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00102v1.pdf
# of sections: 5
Total # of sections: 12 Total # of files: 2
Processing PDF file: 1901.00103v1.pdf
Processing PDF file: 1901.00104v1.pdf
Auto generated table of content:


[[1, '1. Introduction', 1],
 [1, '2. Background', 2],
 [2, 'notation', 2],
 [2, '2.1. Some properties of characters of KR modules', 3],
 [2, '2.2. fermionic formula', 5],
 [2, '2.3. polyhedral formula', 5],
 [1, '3. framework for proving polyhedral formula', 6],
 [1, '4. Proof of Theorem ??', 7],
 [2, '4.1. Step ??', 8],
 [2, '4.2. Step ??', 10],
 [1, 'References', 13]]

MuPDF error: syntax error: unknown keyword: 'pagesize'
MuPDF error: syntax error: unknown keyword: 'width'
MuPDF error: syntax error: unknown keyword: '614.295pt'
MuPDF error: syntax error: unknown keyword: 'height'
MuPDF error: syntax error: unknown keyword: '794.96999pt'


starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00104v1.pdf
# of sections: 6
Total # of sections: 18 Total # of files: 3
Processing PDF file: 1901.00105v2.pdf
Auto generated table of content:


[[1, 'Top-Assisted Di-Higgs boson Production Motivated by Baryogenesis', 1],
 [2, 'Abstract', 1],
 [2, 'I Introduction', 1],
 [2, 'II Formalism', 1],
 [2, 'III Collider signature', 2],
 [2, 'IV Discussion and summary', 5],
 [2, ' Acknowledgments', 6],
 [2, ' References', 6]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00105v2.pdf
# of sections: 1
Total # of sections: 19 Total # of files: 4
Processing PDF file: 1901.00106v2.pdf
Processing PDF file: 1901.00107v2.pdf
Auto generated table of content:


[[1, '1 Introduction', 1],
 [1, '2 Continuous-stage RKN method and its order theory', 3],
 [2, '2.1 Continuous-stage RKN method', 3],
 [2, '2.2 Order theory for RKN-type method', 4],
 [1, '3 Conditions for the symmetry of csRKN methods', 7],
 [1, '4 Symmetric RKN method', 9],
 [1, '5 Numerical experiments', 12],
 [1, '6 Concluding remarks', 13]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00107v2.pdf
# of sections: 7
Total # of sections: 26 Total # of files: 5
Processing PDF file: 1901.00108v2.pdf
Processing PDF file: 1901.00109v4.pdf
Processing PDF file: 1901.00110v1.pdf
Auto generated table of content:


[[1, '1 Introduction', 1],
 [1, '2 Triadic time series motifs', 2],
 [1, '3 Triadic time series motif analysis of chaotic maps', 3],
 [2, '3.1 Chaotic maps', 3],
 [2, '3.2 Occurrence frequency distributions of triadic motifs', 3],
 [2, '3.3 Classification of time series', 5],
 [1,
  '4 Triadic time series motif analysis of the UCR Time Series Classification Archive',
  7],
 [1, '5 Conclusions', 8]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00110v1.pdf
# of sections: 6
Total # of sections: 32 Total # of files: 6
Processing PDF file: 1901.00111v1.pdf
Auto generated table of content:


[[1, '1 Introduction', 1],
 [1, '2 Preliminaries', 3],
 [1, '3 Model formulation', 4],
 [1, '4 Stability analysis with respect to equilibria', 7],
 [2, '4.1 Local stability', 7],
 [2, '4.2 Global attractivity', 9],
 [1, '5 Numerical simulations', 12],
 [2, '5.1 Three typical network models', 12],
 [2, '5.2 Impact of system parameters', 14],
 [1, '6 Conclusions and discussions', 15]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00111v1.pdf
# of sections: 7
Total # of sections: 39 Total # of files: 7
Processing PDF file: 1901.00112v1.pdf
Processing PDF file: 1901.00113v1.pdf
Processing PDF file: 1901.00114v2.pdf
Processing PDF file: 1901.00115v2.pdf
Auto generated table of content:


[[1, '1 Introduction', 1],
 [1, '2 Morse index and bifurcation', 3],
 [2, '2.1 Morse index and eigenvalue problem', 3],
 [2, '2.2 Bifurcation and eigenvalue problem', 4],
 [2, '2.3 Morse index and bifurcation', 5],
 [1, '3 Morse index and bifurcation for homogeneous system', 6],
 [2, '3.1 Bifurcation at a=0.9966', 6],
 [2, '3.2 Bifurcation at a=1.3424', 9],
 [1, '4 Morse index and bifurcation for LJ system', 11],
 [2, '4.1 Bifurcation yielding Dx y, Dx and D2 solutions', 11],
 [2, '4.2 Choreographic bifurcation', 12],
 [1, '5 Summary and discussions', 15],
 [1, 'Appendix A Conditions for solutions', 17],
 [2, 'Appendix A.1 Dx y solution', 18],
 [2, 'Appendix A.2 Dx solution', 18],
 [2, 'Appendix A.3 Cx solution', 18],
 [2, 'Appendix A.4 D2 solution', 19],
 [2, 'Appendix A.5 C2 solution', 19],
 [2, 'Appendix A.6 Cy solution', 19]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00115v2.pdf
# of sections: 7
Total # of sections: 46 Total # of files: 8
Processing PDF file: 1901.00116v1.pdf
Processing PDF file: 1901.00117v2.pdf
Auto generated table of content:


[[1, 'Abstract', 1],
 [1, '1 Introduction', 1],
 [1, '2 Related Work', 1],
 [1, '3 Background', 2],
 [2, '3.1 RL on an Ensemble of Models', 2],
 [2, '3.2 Robust policy learning via CVaR optimization', 2],
 [2, '3.3 Linear Stochastic Bandits', 2],
 [1, '4 Active Learning for Efficient Trajectory Sampling', 2],
 [2, '4.1 Active Learning and the EffAcTS framework', 3],
 [2, '4.2 Applying EffAcTS', 3],
 [2, '4.3 Sample Efficiency', 4],
 [1, '5 Connections to Multi-Task Learning', 4],
 [1, '6 Experiments', 4],
 [2, '6.1 Implementation Details and Hyperparameters', 4],
 [2, '6.2 Bandit Algorithm', 6],
 [2, '6.3 (RQ1) Performance and Robustness', 6],
 [2, '6.4 (RQ1) Performance on a 2-D Model Ensemble', 6],
 [2, '6.5 Visualizing the Bandit Active Learner', 7],
 [2, '6.6 (RQ2) Analysis of the Bandit Active Learner', 7],
 [2, '6.7 (RQ3) Non-stationary Bandits for Data Reuse', 7],
 [2, '6.8 Other Remarks', 8],
 [1, '7 Conclusions and Further Possibilities', 8],
 [1, '8 Acknowledgments', 8],
 [1,

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00117v2.pdf
# of sections: 11
Total # of sections: 57 Total # of files: 9
Processing PDF file: 1901.00118v2.pdf
Auto generated table of content:


[[1, '1. Introduction', 1],
 [1, '2. Énoncé précis du résultat', 2],
 [1, '3. Démonstrations du énoncé', 4],
 [1, 'Références', 7]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00118v2.pdf
# of sections: 5
Total # of sections: 62 Total # of files: 10
Processing PDF file: 1901.00120v2.pdf
Processing PDF file: 1901.00121v1.pdf
Auto generated table of content:


[[1, 'I Introduction', 1],
 [2, 'I-A Applications of Deep Learning Networks', 2],
 [2, 'I-B Emergence of Deep Learning Networks', 2],
 [2, 'I-C Hardware Acceleration of Deep Learning Networks', 2],
 [1, 'II Background and Terminology', 3],
 [2, 'II-A Convolutional Neural Networks (CNNs)', 3],
 [3, 'II-A1 Convolution (CONV)', 3],
 [3, 'II-A2 Activation Functions (AFs)', 4],
 [3, 'II-A3 Normalization', 4],
 [3, 'II-A4 Pooling', 4],
 [3, 'II-A5 Fully Connected Layer (FC)', 5],
 [2, 'II-B Examples of Deep Learning Networks', 5],
 [2, 'II-C Field Programmable Gate Arrays (FPGAs)', 5],
 [2,
  'II-D Challenges of FPGA-Based Implementation of Deep Learning Networks',
  6],
 [1, 'III Acceleration of Deep Learning Networks: Current Status', 7],
 [2, 'III-A CNNs Compression', 7],
 [2, 'III-B ASIC-based Accelerators', 7],
 [2, 'III-C FPGA-based Accelerators', 9],
 [1, 'IV Metaheuristics in the Design of Convolutional Neural Networks', 28],
 [2, 'IV-A CNN Structure Optimization', 31],
 [2, 'IV-B CN

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00121v1.pdf
# of sections: 8
Total # of sections: 70 Total # of files: 11
Processing PDF file: 1901.00122v1.pdf
Processing PDF file: 1901.00123v2.pdf
Auto generated table of content:


[[1, '1. Introduction', 1],
 [2, '1.1. Definitions and main result', 2],
 [2, '1.2. Discussion', 3],
 [2, '1.3. Acknowledgments', 4],
 [2, '1.4. Notation', 5],
 [1, '2. Outline of proof', 5],
 [1, '3. Preliminaries', 8],
 [2, '3.1. Entropy', 8],
 [2, '3.2. The mass-transport principle', 8],
 [2, '3.3. Simulating distributions from random bits', 9],
 [1, '4. The cell process', 9],
 [1, '5. Random total orders', 15],
 [1, '6. The finitary coding', 17],
 [2, '6.1. Choosing the parameters', 18],
 [2, '6.2. The construction of the finitary coding', 18],
 [2, '6.3. Concluding Theorem 1.2', 21],
 [2, '6.4. The output has the correct distribution', 22],
 [1, '7. Remarks and open problems', 25],
 [1, 'References', 27]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00123v2.pdf
# of sections: 9
Total # of sections: 79 Total # of files: 12
Processing PDF file: 1901.00124v1.pdf
Auto generated table of content:


[[1, '1 Introduction', 1],
 [1, '2 Background', 3],
 [2, '2.1 Local Bifurcation Theory', 3],
 [2, '2.2 Piecewise Deterministic Markov Processes', 5],
 [1, '3 Two Nontrivial Trapping Regions', 8],
 [2, '3.1 Supercritical Pitchfork Bifurcation', 8],
 [2, '3.2 Supercritical Hopf Bifurcation', 10],
 [2, '3.3 Transcritical Bifurcation', 12],
 [1, '4 One Nontrivial Trapping Region', 16],
 [2, '4.1 Subcritical Pitchfork Bifurcation', 16],
 [2, '4.2 Subcritical Hopf Bifurcation', 17],
 [2, '4.3 Fold Bifurcation', 18],
 [1, '5 Applications', 19],
 [2, '5.1 The Paradox of Enrichment', 19],
 [2, '5.2 Relaxation Oscillations', 20],
 [2, '5.3 Adaptive Swarming', 20]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00124v1.pdf
# of sections: 6
Total # of sections: 85 Total # of files: 13
Processing PDF file: 1901.00125v3.pdf
Auto generated table of content:


[[1, ' Acknowledgments', 4],
 [1, ' References', 4],
 [1, ' S1: Exact results on complete graphs', 6],
 [1, ' S2: The random-graph Potts model in the canonical ensemble', 7],
 [1,
  ' S3: The entropy kink at umic and the microcanonical inverse temperature',
  8],
 [1, ' S4: The Potts model on RR graphs of large degree K', 10],
 [1,
  ' S5: The Potts model with large Q values on RR graphs of fixed degree K',
  11],
 [1, ' S6: Potts model with kinetic energies', 12],
 [1,
  ' S7: Bond-diluted lattice systems and short-range interaction range l',
  13],
 [1, ' S8: Droplet nucleation and phase separation', 14]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00125v3.pdf
# of sections: 11
Total # of sections: 96 Total # of files: 14
Processing PDF file: 1901.00126v1.pdf
Auto generated table of content:


[[1, ' References', 8]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00126v1.pdf
# of sections: 1
Total # of sections: 97 Total # of files: 15
Processing PDF file: 1901.00127v1.pdf
Processing PDF file: 1901.00128v1.pdf
Auto generated table of content:


[[1, 'I Introduction', 1],
 [1, 'II Materials and Method', 2],
 [2, 'II-A Spiking Neuron', 2],
 [2, 'II-B Crossbar Array of Synapses', 2],
 [1, 'III MaD Framework', 3],
 [2, 'III-A Mapping Function', 3],
 [2, 'III-B Core Utilization', 3],
 [2, 'III-C MaD Framework Optimizations', 4],
 [3, 'III-C1 Core Utilization', 4],
 [3, 'III-C2 Padding', 5],
 [1, 'IV Results', 5],
 [2, 'IV-1 Keeping architecture constant', 5],
 [3, 'IV-2 Keeping architecture different', 6],
 [1, 'V Discussion and Conclusion', 6],
 [1, 'References', 7]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00128v1.pdf
# of sections: 7
Total # of sections: 104 Total # of files: 16
Processing PDF file: 1901.00129v1.pdf
Auto generated table of content:


[[1, 'Introduction', 1],
 [1, '1. Background material', 3],
 [1, '2. Construction of the maximal surface', 10],
 [1, '3. Description of the boundary at infinity', 13],
 [1, '4. Parameterisation of wild anti-de Sitter structures', 23],
 [1, 'References', 26]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00129v1.pdf
# of sections: 7
Total # of sections: 111 Total # of files: 17
Processing PDF file: 1901.00130v1.pdf
Auto generated table of content:


[[1, 'I Introduction', 1],
 [1, 'II Advantages of Deep Nets in Realizing Feature', 2],
 [2, 'II-A Deep nets with fixed structures', 2],
 [2, 'II-B A fast review for realizing data features by deep nets', 3],
 [2, 'II-C Covering number estimates', 3],
 [1, 'III Necessity of the Depth', 4],
 [2, 'III-A Limitations of deep nets approximation', 4],
 [2, 'III-B Remarks and discussions', 5],
 [1, 'IV Conclusion', 5],
 [1, 'References', 11]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00130v1.pdf
# of sections: 6
Total # of sections: 117 Total # of files: 18
Processing PDF file: 1901.00131v4.pdf
Auto generated table of content:


[[1, '1 Introduction', 1],
 [1, '2 Martingale approximations', 4],
 [1, '3 Main abstract theorem', 8],
 [2, '3.1 Verifying condition (b) in Theorem ??', 10],
 [2, '3.2 Verifying condition (a) in Theorem ??', 11],
 [1, '4 Application to Lorentz gases', 12],
 [2, '4.1 Setting and main result for Lorentz gases', 12],
 [2, '4.2 Proof of Theorem ??', 13]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00131v4.pdf
# of sections: 5
Total # of sections: 122 Total # of files: 19
Processing PDF file: 1901.00132v1.pdf
Auto generated table of content:


[[1, 'I Introduction and related work', 1],
 [1, 'II A real-world dataset', 1],
 [1, 'III Forecasting technique', 2],
 [1, 'IV Numerical results', 2],
 [2, 'IV-A Performance metrics', 2],
 [2, 'IV-B Results', 3],
 [1, 'V Conclusion and Future Work', 4],
 [1, 'References', 4]]

starting looking for all the sections according to the provided section title info...
Done with PDF file: 1901.00132v1.pdf
# of sections: 7
Total # of sections: 129 Total # of files: 20


Step 2: Generate a GPT summary for each section and subsection to serve as ground truth.

In [6]:
from gpt_summary import get_summaries

# Alternative input for quick experiments:
# file = "dataset/test.json"
# Plain string literals: the previous f-strings had no placeholders.
file = "dataset/dataset_eval.json"
with open(file, encoding='utf-8') as f:
    data = json.load(f)

# NOTE(review): the return value is ignored and `data` is what gets written
# below, so get_summaries presumably adds the summaries to `data` in place.
# Confirm against gpt_summary.
get_summaries(data)

# ensure_ascii=False keeps non-ASCII characters readable in the output file
with open("dataset/dataset_eval_ground_truth.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)