In [1]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pdfplumber
import os
import base64
import re
import tkinter as tk
from tkinter import filedialog
from openai import OpenAI
import pandas as pd
from PyPDF2 import PdfFileReader

In [10]:
def get_folder_path():
    """
    Opens a folder-selection dialog and returns the folder the user picked.

    A temporary Tk root window is created to host the dialog. It is hidden
    while the dialog is open and destroyed afterwards, so repeated calls do
    not leave orphaned Tk instances alive in the kernel.

    Returns:
        str: The path of the selected folder, exactly as returned by
        ``filedialog.askdirectory()``.
    """
    root = tk.Tk()
    try:
        # Hide the empty root window; only the dialog itself should be shown.
        root.withdraw()
        return filedialog.askdirectory()
    finally:
        # Release the Tk instance even if the dialog raised.
        root.destroy()

def get_relevant_sustainability_report_pages(path, search_terms):
    """
    Finds the pages of a PDF whose text contains each of the given search terms.

    Matching is case-insensitive: the page text and every search term are
    lower-cased before comparison. The report length and every term/page hit
    are printed as the scan progresses.

    Parameters:
        path (str): The file path of the PDF document.
        search_terms (list of str): Terms to search for in the text of each page.

    Returns:
        dict: Maps each search term (spelled as given) to the set of 1-based
        page numbers on which it was found. Terms without hits map to an
        empty set.
    """
    company_name = os.path.splitext(os.path.basename(path))[0]

    with pdfplumber.open(path) as pdf:

        # Print some information about the PDF
        print(f"{company_name} - Length of report: {len(pdf.pages)} pages")

        # One (initially empty) set of page numbers per term
        terms_pages = {term: set() for term in search_terms}

        # Lower-case each term once up front. The page text is lower-cased
        # below, so a term containing capitals would otherwise never match.
        lowered_terms = [(term, term.lower()) for term in search_terms]

        for page_number, page in enumerate(pdf.pages, start=1):
            # Guard against extract_text() returning None for a page that has
            # no extractable text; treat such a page as empty.
            text = (page.extract_text() or "").lower()

            for term, needle in lowered_terms:
                if needle in text:
                    print(f"Term '{term}' found on page {page_number}")
                    terms_pages[term].add(page_number)

        return terms_pages

def save_pages_as_images(path, terms_pages):
    """
    Renders selected pages of a PDF to PNG files in a folder named after the PDF.

    The output folder is created in the current working directory and is named
    after the PDF file without its extension. Each (term, page) pair produces
    one file called ``page_<number>_<term>.png``, so a page that matched
    several terms is saved once per term. Within a term, pages are written in
    ascending page order.

    Parameters:
        path (str): The file path of the PDF document.
        terms_pages (dict): A dictionary with terms as keys and sets of 1-based
            page numbers as values.

    Returns:
        list of str: Paths of the saved image files, in the order written.
    """
    # Initialize list to store the paths of the saved images
    saved_images = []

    # Extract the file name without extension and create the output folder.
    # exist_ok avoids the check-then-create race of a separate exists() test.
    folder_name = os.path.splitext(os.path.basename(path))[0]
    os.makedirs(folder_name, exist_ok=True)

    with pdfplumber.open(path) as pdf:
        for term, pages in terms_pages.items():
            # Spaces become underscores (as before); path separators and other
            # characters that are invalid in file names are replaced too, so a
            # term such as "co2/revenue" cannot point into a missing subfolder.
            safe_term = re.sub(r'[<>:"/\\|?*]', '_', term.replace(' ', '_'))

            # Sets have no guaranteed order; sort for reproducible output.
            for page_number in sorted(pages):
                page = pdf.pages[page_number - 1]  # page numbers are 1-based
                image = page.to_image(resolution=300)
                image_path = os.path.join(folder_name, f"page_{page_number}_{safe_term}.png")
                image.save(image_path, format="PNG")
                saved_images.append(image_path)

    return saved_images


def encode_image_to_base64(image_path):
    """
    Reads a file from disk and returns its contents as base64 text.

    Parameters:
        image_path (str): The file path of the image to be encoded.

    Returns:
        str: The file's bytes, base64-encoded and decoded to a UTF-8 string.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def extract_and_query_emission_intensity(pdf_folder, api_key, search_terms):
    """
    Runs the page-extraction and OpenAI query pipeline for every PDF in a folder.

    For each ``.pdf`` file (case-insensitive extension, processed in sorted
    name order) the pages containing the search terms are located, rendered to
    PNG images, and sent together with the instruction text in a single chat
    completion request. The raw response and any regex matches found in it are
    printed. PDFs for which no page matched are skipped without calling the API.

    Parameters:
        pdf_folder (str): Path of the folder containing the PDF documents.
        api_key (str): API key for the OpenAI service.
        search_terms (list of str): Terms to search within each PDF for relevant pages.

    Returns:
        None: Results are printed rather than returned.
    """
    pdf_files = sorted(
        name for name in os.listdir(pdf_folder) if name.lower().endswith('.pdf')
    )
    if not pdf_files:
        print(f"No PDF files found in: {pdf_folder}")
        return

    # One client is enough for the whole batch; no need to rebuild it per file.
    client = OpenAI(api_key=api_key)

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder, pdf_file)

        # Extract relevant pages and save them as images
        terms_pages = get_relevant_sustainability_report_pages(pdf_path, search_terms)
        saved_images = save_pages_as_images(pdf_path, terms_pages)

        # Without any matching page there is nothing for the model to read.
        if not saved_images:
            print(f"{pdf_file}: no pages matched the search terms; skipping query")
            continue

        # Prepare the message content with all the images
        message_content = [
            {
                "type": "text",
                "text": ("") # Message containing instructions and what to extract
            }
        ]

        # Add all images to the message content. The pages were saved as PNG,
        # so the data URL must declare image/png rather than image/jpeg.
        for image_path in saved_images:
            base64_image = encode_image_to_base64(image_path)
            message_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{base64_image}"
                }
            })

        # Send the request
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": message_content}],
            max_tokens=300,
        )

        # Extract the values from the response; the content can be None, which
        # re.findall would reject, so fall back to an empty string.
        response_text = response.choices[0].message.content or ""
        print("Response Text:", response_text)

        extraction_tag = re.findall(r"<>", response_text) # Put in tags what it is you want returned

        # Print the results
        for result in extraction_tag:
            print(f"Extracted: {result}")

In [None]:
# Read the key from the environment so it is never saved inside the notebook.
# Falls back to an empty string when OPENAI_API_KEY is not set.
api_key = os.environ.get("OPENAI_API_KEY", "")

# Phrases to look for in the text of each report page.
common_terms = []

pdf_folder = get_folder_path()

# The dialog yields a falsy value when it is cancelled; do nothing in that case.
if pdf_folder:
    extract_and_query_emission_intensity(pdf_folder, api_key, common_terms)
else:
    print("No folder selected; nothing to process.")