In [53]:
import arxiv
import pandas as pd
from langdetect import detect
import datetime
import pdfreader
from io import BytesIO
from pdfreader import PDFDocument, SimplePDFViewer
import re
import json
from pdfminer.high_level import extract_text
from collections import defaultdict

In [None]:
paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"])))
paper.download_pdf('.\paper_storage', filename='article_001_labeled(1).pdf')

'.\\paper_storage\\article_001_labeled(1).pdf'

In [66]:
SUMMARY = paper.summary
TITLE = paper.title
LANGUAGE = detect(SUMMARY)
PUBLISHED_DATE = paper.published.strftime("%Y-%m-%d %H:%M:%S %Z")
AUTHORS = [author.name for author in paper.authors]


In [None]:
file_name = r'paper_storage\article_001_labeled(1).pdf'

with open(file_name, "rb") as f:
    stream = BytesIO(f.read())
doc2 = PDFDocument(stream)

In [69]:
import re
import json
from pdfminer.high_level import extract_text

SECTION_TITLES = [
    "Abstract", "Introduction", "Related Work", "Background",
    "Method", "Approach", "Experiments", "Results",
    "Conclusion", "Discussion"
]

def extract_pdf_text(stream):
    return extract_text(stream)

def build_section_regex(titles):
    pattern = '|'.join([fr'\b{re.escape(title)}\b' for title in titles])
    return re.compile(fr'(?P<header>{pattern})', re.IGNORECASE)

def split_sections(text, section_regex):
    sections = []
    matches = list(section_regex.finditer(text))
    
    for i, match in enumerate(matches):
        title = match.group('header').strip().title()
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        content = text[start:end].strip()
        sections.append({
            "name": title,
            "text": content
        })
    return sections

def build_ldjson_from_pdf(stream, title, published_date, language, summary, authors):
    """
    Build ld+json structured metadata from a PDF stream and metadata.
    """
    raw_text = extract_pdf_text(stream)
    section_regex = build_section_regex(SECTION_TITLES)
    sections = split_sections(raw_text, section_regex)

    # Prefer the passed summary but fall back to Abstract section if summary is empty
    abstract = summary.strip() if summary else ""
    if not abstract:
        for section in sections:
            if section["name"].lower() == "abstract":
                abstract = section["text"]
                sections.remove(section)
                break

    ld_json = {
        "name": title,
        "datePublished": published_date,
        "inLanguage": language,
        "author": [{"@type": "Person", "name": name} for name in authors],
        "abstract": abstract,
        "articleBody": sections
    }

    return json.dumps(ld_json, indent=2)

with open(r"paper_storage\article_001_labeled(1).pdf", "rb") as f:
    result_json = build_ldjson_from_pdf(f, TITLE, PUBLISHED_DATE, LANGUAGE, SUMMARY, AUTHORS)
    print(result_json)

{
  "name": "Heat-bath random walks with Markov bases",
  "datePublished": "2016-05-26 17:59:46 UTC",
  "inLanguage": "en",
  "author": [
    {
      "@type": "Person",
      "name": "Caprice Stanley"
    },
    {
      "@type": "Person",
      "name": "Tobias Windisch"
    }
  ],
  "abstract": "Graphs on lattice points are studied whose edges come from a finite set of\nallowed moves of arbitrary length. We show that the diameter of these graphs on\nfibers of a fixed integer matrix can be bounded from above by a constant. We\nthen study the mixing behaviour of heat-bath random walks on these graphs. We\nalso state explicit conditions on the set of moves so that the heat-bath random\nwalk, a generalization of the Glauber dynamics, is an expander in fixed\ndimension.",
  "articleBody": [
    {
      "name": "Abstract",
      "text": "Abstract. Graphs on lattice points are studied whose edges come from a \ufb01nite set of\nallowed moves of arbitrary length. We show that the diameter of th