In [41]:
import json
import os
from dataclasses import dataclass, field
from pathlib import Path

import fitz  # PyMuPDF
from openai import OpenAI
from PyPDF2 import PdfReader

In [3]:
# Take the API key from the OPENAI_API_KEY environment variable so that a real
# key never has to be pasted into (and saved with) the notebook. The original
# placeholder is kept as the fallback value when the variable is not set.
OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY", "<YOUR_OPEN_AI_KEY>")

In [4]:
# Folder holding the PDFs to process. Set the PDF_DATA_ROOT environment
# variable to point the notebook at another folder; without it the original
# location is used.
data_root = Path(os.environ.get("PDF_DATA_ROOT", "/home/chris/data/pdf_stuff"))

## Test one PDF to extract the text

In [15]:
# Sample document used to look at what the text extraction returns.
# The path is built from data_root so the data folder is defined in one place.
fn = (
    data_root
    / "D.9.3 Synthesis of the model, selected results, and scenario assessment-1.pdf"
)
reader = PdfReader(fn)
number_of_pages = len(reader.pages)

# Collect the text of the first two pages, one string per page.
text = []
for page in reader.pages[0:2]:
    text.append(page.extract_text())
formatted_txt = "".join(text)  # from list to string
print(formatted_txt)

 
This project  has received  funding  from  the European Union’s 
Horizon 2020 research and innovation programme under grant 
agreement No 821105.   
 
 
 
Synthesis of the model, 
selected results,  and 
scenario assessment  
WP9, Task 9.2, D.9.3 
31 August  2023 
LOW -CARBON SOCIETY: AN ENHANCED MODELLING TOOL FOR  
THE TRANSITION TO SUSTAINABILITY (LOCOMOTION)  
H2020 -LC-CLA -2018 -2 
 
 
 
 I / XI 
SYNTHESIS OF THE MODEL, SELECTED RESULTS, AND SCENARIO ASSESSMENT  
DOCUMENT HISTORY  
 
Project Acronym  LOCOMOTION  
Project t itle Low-carbon society: an enhanced modelling tool for the transition to 
sustainability  
Project c oordinat ion Universidad de Valladolid (Spain)  
Project d uration  1st June 2019 – 30th November  2023 
Deliverable No.  D9.3. Synthesis of the model, selected results and scenario assessment  
Dissemination level  Confidential (CO)  / Public (PU)  
Status   In progress  
 To be verifi ed by other WPs  
 Final  
Due d ate of deliverable  31st of August  202

In [6]:
# Display the document-information entries stored in the PDF
# (author, title, keywords, creation date, ...)
reader.metadata

{'/Author': 'LOCOMOTION TEAM',
 '/CreationDate': "D:20230831112734+02'00'",
 '/Creator': 'Microsoft® Word para Microsoft 365',
 '/Keywords': 'Locomotion,H2020,Deliverable, template',
 '/ModDate': "D:20230831112756+02'00'",
 '/Producer': 'Microsoft® Word para Microsoft 365',
 '/Subject': 'Deliverable',
 '/Title': 'LOCOMOTION-Deliverable'}

## Function to extract the title of the PDF using the largest font

In [7]:
def extract_title_from_pdf(pdf_path):
    """Guess the title of a PDF from the largest-font text on its first page.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of every non-blank span on the first page that is set
        in the largest font size found there, in reading order and joined
        with single spaces (a title is often split over several spans).
        ``"PDF is empty"`` when the document has no pages, and ``""`` when
        the first page contains no text spans.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        if len(doc) == 0:
            return "PDF is empty"
        # A plain dict is returned, so it stays usable after the file is closed.
        page_dict = doc[0].get_text("dict")

    # (text, font size) for every non-blank span, in page order. A list is
    # used instead of a text-keyed dict so repeated strings do not overwrite
    # each other's font size.
    spans = [
        (span["text"].strip(), span["size"])
        for block in page_dict["blocks"]
        if "lines" in block  # image blocks carry no "lines" entry
        for line in block["lines"]
        for span in line["spans"]
        if span["text"].strip()
    ]
    if not spans:
        # No text on the first page: max() would raise on an empty sequence.
        return ""

    largest_size = max(size for _, size in spans)
    return " ".join(text for text, size in spans if size == largest_size)

## Function to send the text to ChatGPT and get the title and keywords back

In [18]:
def extract_from_openai(text, model="gpt-3.5-turbo", client=None):
    """Ask a chat model for the title and five keywords of a document.

    Args:
        text: Document text to analyse; sent as the user message.
        model: Name of the chat model to query. Defaults to the model the
            notebook was written against.
        client: Optional, already constructed client exposing
            ``chat.completions.create``. Passing one lets a batch run reuse
            a single client instead of building one per call. When omitted,
            a new ``OpenAI`` client is created from ``OPEN_AI_KEY``.

    Returns:
        The message object of the first completion choice. Its ``content``
        attribute holds the model's reply, which is requested in JSON-object
        format; the example reply in the prompt uses the keys ``"title"``
        and ``"keywords"``.
    """
    if client is None:
        client = OpenAI(api_key=OPEN_AI_KEY)

    completion = client.chat.completions.create(
        model=model,
        # Ask the API for a JSON object so the reply can be parsed directly.
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": "You are a scientific assistant, skilled in detecting titles and keywords in PDF documents, and outputing a JSON object. \
                    For all the content you get try to find the existing title in the document. If you find it in the document, return it. If you don't find it in the document, make up one based only on the content.\
                    In addition, try to find 5 keywords in the document. If you don't find any keywords, return 5 keywords based on the content of the document.",
            },
            {
                # Example reply showing the expected JSON keys.
                "role": "assistant",
                "content": '{"title": "Defining happiness in Amazonian forest tribes",  "keywords": ["happiness", "social science","primitive socities","anthropology","Sout America"]}',
            },
            {"role": "user", "content": text},
        ],
    )
    return completion.choices[0].message

## Test one API call

In [20]:
# Send the text extracted from the sample PDF to the model and keep the reply
message = extract_from_openai(formatted_txt)

In [22]:
# The reply's `content` is the JSON string holding the title and keywords
message.content

'{"title": "Low-carbon society: Synthesis of model and scenario assessment",  "keywords": ["Horizon 2020", "low-carbon society", "sustainability", "modelling tool", "transition"]}'

## Loop over all PDFs

In [13]:
@dataclass
class PDF:
    """Record of what was extracted for one PDF file.

    The processing loop creates one instance per file, fills it in, and the
    instances are later written out as JSON.
    """

    # File name of the PDF (name only, without its folder)
    original_pdf_name: str
    # Title taken from the PDF's "/Title" metadata entry; "" when not set
    title_from_metadata: str = ""
    # Title taken from the "title" key of the model's JSON reply
    title_from_openai: str = ""
    # Keywords taken from the PDF's "/Keywords" metadata entry
    keywords_from_metadata: list = field(default_factory=list)
    # Keywords taken from the "keywords" key of the model's JSON reply
    keywords_from_openai: list = field(default_factory=list)
    # Set to True when an exception was raised while processing the file
    process_error: bool = False

In [59]:
# Number of leading pages whose text is sent to the model (tune as needed).
NUM_PAGES_TO_EXTRACT = 2

entity_lst = []

# Sorted so that the processing order (and the order of the records) is the
# same on every run; glob() itself gives no ordering guarantee.
lst_pdf = sorted(data_root.glob("*.pdf"))
for idx_pdf, pdf in enumerate(lst_pdf):
    print(f"Processing PDF {idx_pdf + 1} of {len(lst_pdf)}")
    entity = PDF(original_pdf_name=pdf.name)
    try:
        reader = PdfReader(pdf)

        # Read the metadata first: it is local and cheap, so it is kept even
        # when the API call further down fails.
        # reader.metadata is None (not {}) when the PDF has no information
        # dictionary, hence the truthiness test.
        metadata = reader.metadata
        if not metadata:
            print("no metadata found")
        else:
            print("found metadata")
            if "/Title" in metadata:
                entity.title_from_metadata = metadata["/Title"]
            if "/Keywords" in metadata:
                # Stored as a list of stripped keywords so the field has the
                # same type as its default and as keywords_from_openai.
                entity.keywords_from_metadata = [
                    keyword.strip()
                    for keyword in str(metadata["/Keywords"]).split(",")
                    if keyword.strip()
                ]

        # Text of the first pages; a page without text contributes "".
        text = [
            page.extract_text() or ""
            for page in reader.pages[0:NUM_PAGES_TO_EXTRACT]
        ]
        formatted_txt = "".join(text)  # from list to string
        if not formatted_txt.strip():
            # Nothing to analyse: do not spend an API call on an empty
            # prompt that could only produce an invented title.
            raise ValueError("no extractable text in the first pages")

        # Send to OpenAI and parse the JSON reply
        message = extract_from_openai(formatted_txt)
        json_content = json.loads(message.content)
        entity.title_from_openai = json_content["title"]
        entity.keywords_from_openai = json_content["keywords"]
    except Exception as e:
        # Best effort: record the failure and carry on with the next file.
        print(f"Failed to process {pdf.name}: {e!r}")
        entity.process_error = True
    entity_lst.append(entity)

Processing PDF 1 of 104
found metadata
Processing PDF 2 of 104
found metadata
Processing PDF 3 of 104
found metadata
Processing PDF 4 of 104
found metadata
Processing PDF 5 of 104
found metadata
Processing PDF 6 of 104
found metadata
Processing PDF 7 of 104
found metadata
Processing PDF 8 of 104
found metadata
Processing PDF 9 of 104
found metadata
Processing PDF 10 of 104
found metadata
Processing PDF 11 of 104
found metadata
Processing PDF 12 of 104
found metadata
Processing PDF 13 of 104
found metadata
Processing PDF 14 of 104
found metadata
Processing PDF 15 of 104
found metadata
Processing PDF 16 of 104
found metadata
Processing PDF 17 of 104
found metadata
Processing PDF 18 of 104
found metadata
Processing PDF 19 of 104
found metadata
Processing PDF 20 of 104
found metadata
Processing PDF 21 of 104
found metadata
Processing PDF 22 of 104
found metadata
Processing PDF 23 of 104
found metadata
Processing PDF 24 of 104
found metadata
Processing PDF 25 of 104
found metadata
Processin

In [58]:
# Write one JSON object per processed PDF under the "entities" key.
output_json = {"entities": [vars(ent) for ent in entity_lst]}
# The handle gets its own name: reusing `fn` would replace the sample-PDF
# path defined earlier in the notebook with a closed file object.
with open("entity_lst.json", "w", encoding="utf-8") as json_file:
    json.dump(output_json, json_file, indent=4)

## Unused code

In [None]:
# Heuristic that looks for a "KEY ..." / "KEYWORDS ..." line in the extracted
# text and splits what follows the colon into keywords.
# `text` is a list of page strings in the cells above, so it is joined into a
# single string first; a plain string is used as is.
raw_text = text if isinstance(text, str) else "".join(text)
splitlines = raw_text.splitlines()
for idx, line in enumerate(splitlines):
    if line.split(" ")[0] in ("KEY", "KEYWORDS"):
        # The keyword list may continue on the following line; the last line
        # of the text has no follower.
        next_line = splitlines[idx + 1] if idx + 1 < len(splitlines) else ""
        keyword_line = line + next_line
        parts = keyword_line.replace("-", "").split(":")
        if len(parts) < 2:
            # No colon, so there is nothing to read keywords from.
            continue
        print(parts[1])
        keyword_lst = [kl.strip() for kl in parts[1].split(", ")]
        print(keyword_lst)