In [1]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pdfplumber
import os
import base64
import re
import tkinter as tk
from tkinter import filedialog
from openai import OpenAI
import pandas as pd
from PyPDF2 import PdfFileReader

In [10]:
def get_folder_path():
    """
    Opens a file dialog to allow the user to select a folder from their system.

    Returns:
        str: The file path of the selected folder.
    """
    root = tk.Tk()
    root.withdraw()
    return filedialog.askdirectory()

def get_relevant_sustainability_report_pages(path, search_terms):
    """
    Extracts pages from a PDF file that contain specified search terms.

    Parameters:
        path (str): The file path of the PDF document.
        search_terms (list of str): A list of terms to search within the PDF.

    Returns:
        dict: A dictionary where keys are search terms and values are sets of page numbers where these terms were found.
    """
    company_name = os.path.splitext(os.path.basename(path))[0]
    
    with pdfplumber.open(path) as pdf:
        
        # Print some information about the PDF
        print(f"{company_name} - Length of report: {len(pdf.pages)} pages")
        
        terms_pages = {}  # Dictionary to store terms and their corresponding pages as sets

        # Initialize dictionary with each term and an empty set
        for term in search_terms:
            terms_pages[term] = set()

        # Loop through all the pages
        for i in range(len(pdf.pages)):

#             print("Page: ", i+1)

            page = pdf.pages[i]
            text = page.extract_text().lower()
            
            # Check if any of the terms are in the text
            for term in search_terms:
                if term in text:
                    print(f"Term '{term}' found on page {i+1}")
                    terms_pages[term].add(i + 1)  # Add page number to the corresponding term's set

        return terms_pages

def save_pages_as_images(path, terms_pages):
    """
    Saves specific pages of a PDF file as images in a newly created folder.

    Parameters:
        path (str): The file path of the PDF document.
        terms_pages (dict): A dictionary with terms as keys and sets of page numbers as values.

    Returns:
        list of str: A list of paths to the saved image files.
    """


    # Initialize list to store the paths of the saved images
    saved_images = []

    # Extract the file name without extension and create a new folder
    folder_name = os.path.splitext(os.path.basename(path))[0]
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    with pdfplumber.open(path) as pdf:
        for term, pages in terms_pages.items():
            for page_number in pages:
                page = pdf.pages[page_number - 1]
                image = page.to_image(resolution=300)
                image_path = os.path.join(folder_name, f"page_{page_number}_{term.replace(' ', '_')}.png")
                image.save(image_path, format="PNG")
                saved_images.append(image_path)

    return saved_images


def encode_image_to_base64(image_path):
    """
    Encodes an image file to a base64 string.

    Parameters:
        image_path (str): The file path of the image to be encoded.

    Returns:
        str: The base64 encoded string of the image.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_and_query_emission_intensity(pdf_folder, api_key, search_terms):
    """
    Extracts relevant pages from a PDF, saves them as images, and queries an API to find emission intensities from these images.

    Parameters:
        pdf_path (str): The file path of the PDF document.
        api_key (str): API key for the OpenAI service.
        search_terms (list of str): Terms to search within the PDF for relevant pages.

    Returns:
        None: Prints out the emission intensities and their units if found.
    """    
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.lower().endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            base_name = os.path.splitext(pdf_file)[0]
            parts = base_name.rsplit('_')
            company_name = parts[0].lower().strip()
    
    
            # Extract relevant pages and save them as images
            terms_pages = get_relevant_sustainability_report_pages(pdf_path, search_terms)
            saved_images = save_pages_as_images(pdf_path, terms_pages)

            # Prepare the message content with all the images
            message_content = [
                {
                    "type": "text",
                    "text": ("The images are screenshots from a company's sustainability report. I am interested in extracting the companies emission intensity (that is, their GHG emissions per tonne or kg of production), which is usually a number between 0.1 and 2 (when presented in t CO2e per t product), or between 100 and 2000 (when presented in kg CO2 per t product). If years are given, please find the emission intensity for the latest year. This information might sit in a bar chart or line graph. Can you please extract the emission intensity and its unit from the text in these images? Return it in tags (e.g. <emission_intensity>0.74</emission_intensity><unit>t CO2e / t</unit>, or <emission_intensity>740</emission_intensity><unit>kg CO2e / t</unit>) (If you cannot find the emission intensity, return <emission_intensity>N/A</emission_intensity>).\n"
                             "Emission intensity can also be refereed to as:\n"
                             "Carbon Intensity,Greenhouse Gas Intensity (GHG Intensity),Emissions Per Unit of Production,CO2 Emissions per Dollar of Revenue,Carbon Footprint per Unit,Emissions Ratio,CO2 Emissions per Energy Produced,Carbon Efficiency,Carbon Intensity Metric,CO2 Intensity"
                            )
                }
            ]

            # Add all images to the message content
            for image_path in saved_images:
                base64_image = encode_image_to_base64(image_path)
                message_content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                })

            # Create the OpenAI client
            client = OpenAI(api_key=api_key)

            # Send the request
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": message_content}],
                max_tokens=300,
            )

            # Extract the emission intensity and unit from the response
            response_text = response.choices[0].message.content
            print("Response Text:", response_text)
            
            emission_intensities = re.findall(r"<emission_intensity>(.*?)</emission_intensity><unit>(.*?)</unit>", response_text)

            # Print the results
            for intensity, unit in emission_intensities:
                print(f"{company_name} - Emission intensity: {intensity} {unit}")

In [11]:
api_key = "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"

common_terms = [
            'emissions intensity',
            'emission intensity',
            'intensity figure',
            'scope 1 and 2 emissions intensity',
            'scope 1+2 emissions intensity',
            'intensity of energy consumption during production',
            'intensity of water consumption related to production',
            'ghg emission',
            'Carbon Intensity',
            'Greenhouse Gas Intensity',
            'GHG Intensity',
            'Emissions Per Unit of Production',
            'CO2 Emissions per Dollar of Revenue',
            'Carbon Footprint per Unit',
            'Emissions Ratio',
            'CO2 Emissions per Energy Produced',
            'Carbon Efficiency',
            'Carbon Intensity Metric',
            'CO2 Intensity'
        ]

pdf_folder = get_folder_path()

extract_and_query_emission_intensity(pdf_folder, api_key, common_terms)

2024-06-24 15:44:58.441 python[4107:104193] +[CATransaction synchronize] called within transaction
2024-06-24 15:44:58.560 python[4107:104193] +[CATransaction synchronize] called within transaction
2024-06-24 15:45:01.943 python[4107:104193] +[CATransaction synchronize] called within transaction


Foshan Haitian Flavouring and Food_Sustainability Report_2022 - Length of report: 76 pages
Term 'ghg emission' found on page 42
Response Text: The provided image contains text discussing energy conservation, emission reduction, and waste management for a company, but it does not contain specific figures for emission intensity (i.e., their GHG emissions per tonne or kg of production). The text mentions the reduction of GHG emissions in absolute terms and several projects that helped to achieve these reductions, but without relating these figures to the company's production volume, which is necessary to calculate the emission intensity.

Therefore, based on the available information in the image, the emission intensity cannot be determined:

<emission_intensity>N/A</emission_intensity>
Campbell Soup_Sustainability Report_2023 - Length of report: 21 pages
Term 'ghg emission' found on page 3
Response Text: <emission_intensity>N/A</emission_intensity>
Ingredion_Sustainability Report_2022 - 

Response Text: <emission_intensity>N/A</emission_intensity>
LINDT & SPRUENGLI_Sustainability Report_2023 - Length of report: 155 pages
Term 'ghg emission' found on page 9
Term 'ghg emission' found on page 61
Term 'ghg emission' found on page 70
Term 'ghg emission' found on page 72
Term 'ghg emission' found on page 106
Term 'ghg emission' found on page 130
Term 'ghg emission' found on page 138
Response Text: The relevant data to calculate the emission intensity can be found in these images. On one of the pages, it lists the total GHG emissions for the company as 3.7 million tonnes CO2 equivalent. On another page, it lists the total production in metric tons as 116,273 metric tons.

To calculate the emission intensity, you would divide the total GHG emissions by the total production:

Emission intensity = Total GHG emissions / Total production
Emission intensity = 3,700,000 tonnes CO2e / 116,273 metric tons
Emission intensity ≈ 31.83 t CO2e / metric ton product

Now we can represent this

RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-4-vision-preview in organization org-lnKIIWUzgB6GQfRtYE0EDJQ3 on tokens per min (TPM): Limit 10000, Requested 16351. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

**When there's a 'RateLimitError', you might want to leave that file out of the folder and return to it after. You could potentially add a for loop to deal with this error or reduce the common terms for that file**