In [2]:
import openai
from openai import OpenAI
import fitz

In [3]:
import pathlib
from pathlib import Path

In [4]:
# Identifier of the paper to process; it doubles as the PDF's base filename.
paper_name = "blms.70013"
# Relative path of the source PDF, derived from the paper identifier.
fullpath = "../data/BLMC/" + paper_name + ".pdf"

In [5]:
from dotenv import load_dotenv
import os
# python-dotenv: load variables from a .env file (if one is found) into the
# process environment before the API key is looked up.
load_dotenv()

# API client; the key is read from the OPENAI_API_KEY environment variable
# (None is passed through when the variable is not set).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


In [6]:
# Wrap the string path in a Path object for the extraction step below.
filepath = Path(fullpath)
def extract_text_from_pdf(path):
    """Return the text of every page of a PDF, pages separated by a blank line.

    Args:
        path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        str: The ``get_text()`` result of each page, joined with two newlines.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) as soon as the text has been collected; previously the
    # document was opened and never closed.
    with fitz.open(path) as doc:
        return "\n\n".join(page.get_text() for page in doc)

# Full text of the paper, one chunk per page, used as the model input.
pdf_text = extract_text_from_pdf(path=filepath)

In [7]:
import json
# Load the JSON schema that the model output is later validated against.
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which can differ between machines.
schema = json.loads(
    Path("./prompts/PaperStructure.json").read_text(encoding="utf-8")
)

# Embed the schema as real JSON text. str(schema) would insert a Python dict
# repr (single quotes, True/False/None), which is not valid JSON and therefore
# a misleading description of the format the model is asked to follow.
prompt = (
    "Write the attached mathematics paper as a structured JSON file in the given JSON schema format. "
    "Use LaTeX for formulas but otherwise use markdown. The schema is as follows:\n\n"
    + json.dumps(schema, indent=2, ensure_ascii=False)
)

In [15]:
# Single chat-completion request: a system instruction, then the task prompt
# (which embeds the schema), then the extracted paper text as a second user turn.
response = client.chat.completions.create(
    model="o1",
    messages=[
        dict(
            role="system",
            content="You are an assistant that converts academic papers into structured JSON. Strictly follow the JSON schema.",
        ),
        dict(role="user", content=prompt),
        dict(role="user", content=pdf_text),
    ],
)

In [16]:
# Text content of the first completion choice; later cells parse it as JSON.
paper_json_out = response.choices[0].message.content
# Echo the raw model output for manual inspection.
print(paper_json_out)

{
  "document": [
    {
      "title": "Distinguishing internally club and approachable on an Infinite interval"
    },
    {
      "abstract": "Krueger showed that PFA implies that for all regular Θ ≥ ℵ₂, there are stationarily many [H(Θ)]^(ℵ₁) that are internally club but not internally approachable. From countably many Mahlo cardinals, we force a model in which, for all positive n < ω and Θ ≥ ℵₙ₊₁, there is a stationary subset of [H(Θ)]^(ℵₙ) consisting of sets that are internally club but not internally approachable. The theorem is obtained using a new variant of Mitchell forcing. This answers questions of Krueger."
    },
    {
      "metadata": {
        "authors": [
          {
            "label": "Hannes Jakob",
            "affiliation": "Mathematisches Institut, University of Freiburg, Freiburg im Breisgau, Germany"
          },
          {
            "label": "Maxwell Levine",
            "affiliation": "Mathematisches Institut, University of Freiburg, Freiburg im Breisgau,

In [17]:
# Validate the output json against the schema
import jsonschema
from jsonschema import validate
def validate_json(json_data, schema):
    """Check ``json_data`` against ``schema`` and print the verdict.

    Args:
        json_data: Already-parsed JSON instance to check.
        schema: JSON schema passed to ``jsonschema.validate``.

    Returns:
        None. Prints "JSON is valid." on success; on a validation error prints
        "JSON is invalid." followed by the error itself.
    """
    try:
        validate(instance=json_data, schema=schema)
    except jsonschema.exceptions.ValidationError as validation_error:
        # Report the failure instead of propagating it so the notebook keeps running.
        print("JSON is invalid.")
        print(validation_error)
    else:
        print("JSON is valid.")

# Parse the raw model output (json.loads raises if it is not JSON at all)
# and check the parsed document against the schema.
validate_json(json_data=json.loads(paper_json_out), schema=schema)


JSON is invalid.
{'Bibliography': {'header': 'References', 'entries': [{'key': '1', 'formatted_entry': 'U. Abraham, Aronszajn trees on ℵ₂ and ℵ₃, Ann. Pure Appl. Logic 24 (1983), no. 3, 213–230.'}, {'key': '2', 'formatted_entry': 'J. Cummings, Iterated forcing and elementary embeddings, Handbook of set theory, M. Foreman and A. Kanamori (eds.), Springer, Dordrecht, 2010, pp. 775–883.'}, {'key': '3', 'formatted_entry': 'J. Cummings and M. Foreman, The tree property, Adv. Math. 133 (1998), no. 1, 1–32.'}, {'key': '4', 'formatted_entry': 'M. Foreman and S. Todorcevic, A new Löwenheim-Skolem theorem, Trans. Amer. Math. Soc. 357 (2005), no. 5, 1693–1715.'}, {'key': '5', 'formatted_entry': 'S. Fuchino and A. O. M. Rodrigues, Reflection principles, generic large cardinals, and the continuum problem, Symposium on Advances in Mathematical Logic, Springer, 2018, pp. 1–25.'}, {'key': '6', 'formatted_entry': 'L. Harrington and S. Shelah, Some exact equiconsistency results in set theory, Notre Dame

In [None]:
# Save the response to a file
output_path = Path(f"../results/BLMC/{paper_name}.json")
# Create the results directory (and any missing parents) on first use.
output_path.parent.mkdir(parents=True, exist_ok=True)
# Write the same text that was printed and validated above, rather than
# re-reading it from `response`, so the saved file cannot diverge from the
# inspected output if the request cell is re-run in between.
output_path.write_text(paper_json_out, encoding="utf-8")


30529