In [27]:
file_path = 'API_KEY.txt'

# Load the OpenAI API key from a local file so it is never hard-coded in the notebook.
with open(file_path, 'r') as file:
        # Strip surrounding whitespace: key files usually end with a newline, and a
        # newline inside the "Authorization" header value makes the request invalid.
        api_key = file.read().strip()

In [28]:
import os

# Collect the file names of the receipts in the "New" folder, i.e. the receipts
# that still need to be extracted.
folder_path = 'fotos/New'
# os.listdir returns entries in arbitrary order; sort them so that positional
# selections of receipts further down are reproducible between runs.
file_names = sorted(os.listdir(folder_path))
print(file_names)

['gamma.jpg', 'hema.jpg', 'kruidvat.jpg', 'vomar.jpg']


In [37]:
import base64
import requests
import json
import re
import pandas as pd

# functions

def encode_image(image_path):
  """Read the file at ``image_path`` and return its bytes as base64 text."""
  with open(image_path, "rb") as handle:
    raw_bytes = handle.read()
  # b64encode yields bytes; decode so the result can be embedded in a string.
  encoded = base64.b64encode(raw_bytes)
  return encoded.decode('utf-8')
  

# Instruction sent along with every receipt image. The embedded example must be
# valid JSON (note the quoted "tijd" value): the reply is expected to follow this
# format and is later parsed with json.loads.
prompt = """Geef informatie over de aankopen op de bon terug. Dit is een voorbeeld van de output, volg het zelfde format:
{"winkel": "Vomar",
    "producten": [
    {
        "beschrijving": "Wokkels Naturel",
        "aantal": 1,
        "totaal prijs": 2.69
    },
    {
        "beschrijving": "Unox Groente Soep",
        "aantal": 3,
        "totaal prijs": 4.8
    }
    ],
    "totaal prijs": 11.03,
    "totaal aantal producten": 4,
    "datum": "20-12-2024",
    "tijd": "13:34",
    "winkel categorie": "Supermarkt"}

Per product, geef het totaal bedrag en geef alleen het totaal bedrag met korting er van af.
"""
  
# prompt chatgpt
def prompt_chatgpt(base64_image, max_tokens=300, timeout=60):
  """Send a receipt image plus the module-level ``prompt`` to the OpenAI chat API.

  Args:
    base64_image: Base64 text of the image; it is embedded as a JPEG data URL.
    max_tokens: Value sent as "max_tokens" in the request (default 300, as
      before). A reply that gets cut off at this limit will not be complete
      JSON, so raise it for long receipts.
    timeout: Seconds to wait for the HTTP request before giving up.

  Returns:
    The text content of the first choice in the API response.

  Raises:
    requests.HTTPError: If the API answers with an error status code.
    requests.Timeout: If the request does not complete within ``timeout``.
  """
  headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
  }

  payload = {
    "model": "gpt-4o-mini",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": prompt
          },
          {
            "type": "image_url",
            "image_url": {
              "url": f"data:image/jpeg;base64,{base64_image}"
            }
          }
        ]
      }
    ],
    "max_tokens": max_tokens
  }

  # Without a timeout a stalled connection would block the notebook indefinitely.
  response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers=headers,
    json=payload,
    timeout=timeout,
  )
  # Fail with the HTTP error (bad key, rate limit, ...) instead of an opaque
  # KeyError on 'choices' further down.
  response.raise_for_status()

  json_response = response.json()
  txt_response = json_response['choices'][0]['message']['content']
  return txt_response


def extract_csv_from_response(response):
    """Turn the model's reply into a DataFrame with one row per purchased product.

    Args:
        response: Reply text containing a JSON object, possibly surrounded by
            extra text (for example a markdown code fence).

    Returns:
        pandas.DataFrame with the columns 'winkel', 'datum', 'tijd',
        'beschrijving', 'prijs per product*', 'aantal', 'totaal prijs' and
        'winkel categorie'. The receipt-level fields are repeated on every row.

    Raises:
        ValueError: If the reply contains no ``{...}`` span, or (as
            json.JSONDecodeError, a ValueError subclass) if that span is not
            valid JSON.
        KeyError: If an expected key is missing from the parsed JSON.
    """
    # Remove unnecessary characters before and after the outermost {...}.
    match = re.search(r'\{.*\}', response, re.DOTALL)
    if match is None:
        raise ValueError(f"No JSON object found in response: {response!r}")

    # Convert the string into JSON.
    data = json.loads(match.group())

    # Convert the JSON into a DataFrame: one row per product, plus receipt-level columns.
    df = pd.DataFrame(data['producten'])
    df['winkel'] = data['winkel']
    df['datum'] = data['datum']
    df['tijd'] = data['tijd']
    df['winkel categorie'] = data['winkel categorie']
    # Derived unit price: line total divided by quantity.
    df['prijs per product*'] = df['totaal prijs']/df['aantal']
    df = df[['winkel', 'datum', 'tijd', 'beschrijving', 'prijs per product*', 'aantal', 'totaal prijs', 'winkel categorie']]

    return df


In [45]:

# For each selected receipt photo, extract the purchases and append them to the
# records CSV.
records_path = "extracted_data/records_products.csv"
# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs(os.path.dirname(records_path), exist_ok=True)

# NOTE: the [3:4] slice limits this run to the fourth entry of file_names only.
for receipt in file_names[3:4]:
    # encode each photo
    image_path = f"{folder_path}/{receipt}"
    base64_image = encode_image(image_path)

    # prompt chatgpt to extract information from receipt
    response = prompt_chatgpt(base64_image)

    df_products = extract_csv_from_response(response)

    # Append to earlier records, if any, and persist after every receipt so
    # results already obtained are kept even if a later receipt fails.
    if os.path.exists(records_path):
        previous_records = pd.read_csv(records_path)
        df_products = pd.concat([previous_records, df_products])

    df_products.to_csv(records_path, index=False)
