In [1]:
# Import the parser and openai packages
import PyPDF2
import openai
import os

In [2]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable
# (the lookup yields None when the variable is not set)
openai.api_key = os.getenv("OPENAI_API_KEY")


In [3]:
# Collect the PDF invoices in the sub dir
PDF_DIR = "files"
# os.listdir returns entries in arbitrary order, so sort them to keep every run reproducible
files = sorted(os.listdir(PDF_DIR))
# Keep only PDF files (case-insensitive) so stray entries are never handed to the PDF parser
file_paths = [f"{PDF_DIR}/{file}" for file in files if file.lower().endswith(".pdf")]
#file_paths

['files/ENOVOS1.PDF',
 'files/Nr. 3910018052 vom 2023-06-22.pdf',
 'files/Nr. 3910018053 vom 2023-06-22.pdf',
 'files/Nr. 3910018056 vom 2023-06-22.pdf',
 'files/Nr. 3910018057 vom 2023-06-22.pdf',
 'files/Nr. 3910018058 vom 2023-06-22.pdf',
 'files/Nr. 3910018059 vom 2023-06-22.pdf',
 'files/Nr. 3910018060 vom 2023-06-22.pdf',
 'files/Nr. 3910018062 vom 2023-06-22.pdf',
 'files/Nr. 3910018063 vom 2023-06-22.pdf',
 'files/Nr. 3910018064 vom 2023-06-23.pdf',
 'files/Nr. 3910018066 vom 2023-06-23.pdf',
 'files/Nr. 3910018067 vom 2023-06-23.pdf',
 'files/PW02.pdf',
 'files/S000266314-VERT-KOPIE.PDF',
 'files/S000266751-VERT-KOPIE.PDF',
 'files/S000266752-VERT-KOPIE.PDF',
 'files/S000266753-VERT-KOPIE.PDF',
 'files/S000266754-VERT-KOPIE.PDF',
 'files/S000266755-VERT-KOPIE.PDF',
 'files/S000266756-VERT-KOPIE.PDF',
 'files/UK1.pdf']

In [4]:
# Function to extract the texts from provided pdfs
def extract_text(file_path: str, max_chars: int = 5000) -> str:
    """Extract at most ``max_chars`` characters of text from a PDF file.

    Pages are read in order and their text is concatenated. Once the
    character budget is used up, the remaining pages are not parsed at all.

    Args:
        file_path: Path of the PDF file to read.
        max_chars: Maximum number of characters to return (default 5000).

    Returns:
        The concatenated page text, cut off after ``max_chars`` characters.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars < 0:
        raise ValueError("max_chars must be non-negative")

    chunks = []
    remaining = max_chars

    # Open the PDF file in binary mode
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page in pdf_reader.pages:
            # Budget exhausted: stop before parsing another page
            if remaining <= 0:
                break

            # Treat a missing (None) result as empty text so it cannot break len()/slicing
            page_text = page.extract_text() or ""

            # Take only as much of this page as still fits into the budget
            piece = page_text[:remaining]
            chunks.append(piece)
            remaining -= len(piece)

    return "".join(chunks)

In [5]:
# Extract the (length-limited) text of every PDF; these are the inputs for GPT
texts = list(map(extract_text, file_paths))
# text = extract_text('files/TEST1.pdf')
# The question put to the model for each invoice text
prompt = (
    "What is the invoice date, invoice number, invoice period (give month), "
    "sender, recipient, total amount due (currency), "
    "the energy price in (currency)/kWh (for day and night rates if applicable), "
    "and the energy consumption in kWh of this text that is extracted from an energy invoice? "
    "Please give the result in a list."
)
#texts

['Kundenbetreuung/Abrechnung:\nTeam\xa0Billing\nT\xa0+49\xa0(0)\xa0681/8105\xad270\nF\xa0+49\xa0(0)\xa0681/8105\xad361\nteambilling\xadeed@enovos.eu\n\xa0\nEnovos\xa0Energie\xa0Deutschland\xa0GmbH,\xa0Postfach\xa014\xa009,\xa065004\xa0Wiesbaden\xa0\nProgroup\xa0Board\xa0GmbH\nProwell\xadStr.\xa01\xa0\n76877\xa0Offenbach\nRECHNUNG\nLeistungsempfänger: Progroup\xa0Board\xa0GmbH Debitorennummer: 200055913\nLieferstelle: Prowellstraße\xa01,\xa076877\xa0Offenbach Rechnungsdatum: 25.05.2023\nMarktlokations\xadID: 51282741615 Zahlbar\xa0bis: 15.06.2023\nLeistungszeitraum: 01.04.2023\xa0bis\xa030.04.2023 USt\xadIdNr:\nRechnungsnummer: S000252896\nBitte\xa0bei\xa0Zahlung\xa0immer\xa0die\xa0Rechnungsnummer\xa0angeben.\xa0Sollten\xa0Sie\xa0mehr\xa0als\xa03\xa0Rechnungen\xa0anweisen,\xa0\nsenden\xa0Sie\xa0die\xa0Avise\xa0bitte\xa0an:\xa0avise@enovos.eu\nBezeichnung Betrag\xa0in\xa0€\n\u2007\u2007\u2007Energielieferung\xa0Strom 53.176,76\nNetznutzung 14.545,22\nBlindenergie 0,00\n\u2007\nTotal\xa0E

In [6]:
# Function to prompt and pass the texts to the openai api
def gpt_completion(user_prompt: str, user_content: str, model: str = "gpt-3.5-turbo") -> dict:
    """Send a prompt plus document text to the OpenAI chat API and return the reply.

    The request consists of a fixed system message (energy-invoice expert) and
    one user message made of ``user_prompt`` followed by ``user_content``.

    Args:
        user_prompt: The instruction/question for the model.
        user_content: The text the instruction applies to (here: invoice text).
        model: Name of the chat model to query (default "gpt-3.5-turbo").

    Returns:
        The message of the first returned choice; in this notebook's output it
        carries the fields ``role`` and ``content``.
    """
    # Separate the instruction from the document with a blank line so the
    # question does not run straight into the first word of the invoice text
    user_message = f"{user_prompt}\n\n{user_content}"

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert for energy invoices. You assist in extracting the correct information from a given invoice."},
            {"role": "user", "content": user_message},
        ],
    )
    # Only the first choice is used
    return completion.choices[0].message
# print(gpt_completion(prompt, texts[0]))

In [7]:
# Loop through the texts, query the model once per text and list the replies
messages = list(gpt_completion(prompt, invoice_text) for invoice_text in texts)
messages

[<OpenAIObject at 0x7f5ab810ac90> JSON: {
   "role": "assistant",
   "content": "Here is the information extracted from the energy invoice:\n\n- Invoice date: 25.05.2023\n- Invoice number: S000252896\n- Invoice period: April 2023\n- Sender: Enovos Energie Deutschland GmbH, Postfach 14 09, 65004 Wiesbaden\n- Recipient: Progroup Board GmbH, Prowell-Str. 1, 76877 Offenbach\n- Total amount due: \u20ac98,524.82\n- Energy price in \u20ac/kWh (Energielieferung Strom):\n   - Day rate: \u20ac111.09/MWh\n   - Night rate: \u20ac0.33/MWh\n- Energy consumption in kWh:\n   - Energielieferung (T): 477,264 kWh\n   - Zuschlag auf Arbeitspreis (T): 477,264 kWh\n   - Wirkarbeit: 477,264 kWh\n- Energy consumption in MWh (Netznutzung):\n   - Wirkarbeit: 477,264 MWh\n   - Leistung 01.01.2023 - 31.03.2023: -1,101 MW\n   - Leistung 01.01.2023 - 31.03.2023: 1,101 MW\n   - Leistung 01.04.2023 - 30.04.2023: 1,101 MW\n- Abgaben/Steuern:\n   - KWK-Umlage: \u20ac3.57/MWh\n   - Summe NEV-Umlage: \u20ac0.50/MWh\n   -