In [2]:
import pdf2image
import PIL
import pytesseract
from pathlib import Path
import yaml
import cv2
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import itertools

In [16]:
DATA_PATH = Path.cwd() / "receipts"

In [3]:
# read config from config.yml
with open("config.yml", "r") as f:
    config = yaml.safe_load(f)

In [30]:
COORDINATES = {
    "price": [780, 410, 930, 833],
}

In [25]:
from typing import Any, List


class Receipt:
    def __init__(self, path):
        self.path: str = path
        self.image = pdf2image.convert_from_path(self.path, dpi=config["dpi"])[0]
        self.df_receipt = pd.DataFrame(
            pytesseract.image_to_data(
                self.image, 
                output_type=pytesseract.Output.DICT, 
                config=f"--psm {config['tesseract']['psm']['receipt']}"
            )
        )
        self.df_receipt = self._get_receipt_data()
        self.df_price = self._get_df_price()
    
    def _get_df_price(self):
        price_image = self.image.crop((COORDINATES["price"][0], 0, COORDINATES["price"][2], self.image.height))
        # sage image to disk
        price_image.save(DATA_PATH / "price.png")
        self.image.save(DATA_PATH / "image.png")
        price_data = pd.DataFrame(
            pytesseract.image_to_data(
                price_image, 
                output_type=pytesseract.Output.DICT, 
                config=f"--psm {config['tesseract']['psm']['price']}"
            )
        )
        return price_data


    def _get_receipt_data(self):
        index_first = self.df_receipt.loc[self.df_receipt["text"].str.match(config["receipt"]["first-word"], case=False)].index[0]
        index_last = self.df_receipt.loc[self.df_receipt["text"].str.match(config["receipt"]["last-word"], case=False)].index[0]
        return self.df_receipt.truncate(before=index_first, after=index_last)

    def get_price(self, top_pixel: int):
        price = self.df_price.loc[(self.df_price["top"] - top_pixel) < config["receipt"]["pixel-distance"], "text"]
        return price


class Item:
    def __init__(self, tesseract_data: List):
        self.tesseract_data = tesseract_data
        self.text = self.get_text()
        self.price = self.get_price()

    def get_text(self):
        return " ".join([row["text"] for row in self.tesseract_data])


def is_float(element: Any) -> bool:
    try:
        float(element)
        return True
    except ValueError:
        return False


In [29]:
def read_receipts(path: Path) -> List[Receipt]:
    receipts = []
    for path in path.glob("*.pdf"):
        receipts.append(Receipt(path))
    return receipts


receipts = read_receipts(DATA_PATH)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [21]:
receipts[4].df_receipt

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
14,5,1,1,1,4,1,48,419,133,46,92,AANTAL
15,5,1,1,1,4,2,333,423,263,30,87,OMSCHRIJVING
16,5,1,1,1,4,3,746,423,94,30,89,PRUS
17,5,1,1,1,4,4,896,423,137,30,80,BEDRAG
18,4,1,1,1,5,0,332,523,704,46,-1,
19,5,1,1,1,5,1,332,523,238,46,92,BONUSKAART
20,5,1,1,1,5,2,912,527,124,30,88,xx0894
21,4,1,1,1,6,0,55,588,980,36,-1,
22,5,1,1,1,6,1,55,588,16,30,96,1
23,5,1,1,1,6,2,330,588,47,30,92,AH


rows/items are roughly 64-70 pixels apart from each other (bounding box top left to bounding box top left of next item)

In [22]:
pd.set_option('display.max_rows', 200)
def get_item_data(receipt: Receipt) -> dict:
    df_receipt = pd.DataFrame(receipt.tesseract_data)
    df_price = pd.DataFrame(receipt.prices)
    result = []
    index_first = df_receipt.loc[df_receipt["text"].str.contains(config["receipt"]["first-word"], case=False)].index[0]
    index_last = df_receipt.loc[df_receipt["text"].str.match(config["receipt"]["last-word"], case=False)].index[0]
    df_receipt = df_receipt.truncate(before=index_first, after=index_last)

    tmp_element = []
    for row_1 in df_receipt.iterrows():
        row_1_top_pixel = row_1[1]["top"]
        df_price_row = df_price.loc[abs(df_price["top"] - row_1_top_pixel) < config["receipt"]["pixel-distance"]]
        if df_price_row is not None:
            for row_price in df_price_row.iterrows():
                row_price_text = row_price[1]["text"].replace(",", ".")
                if is_float(row_price_text) and row_price_text.strip() != "" and row_price_text.strip() is not None:
                    tmp_element.append(row_price_text + "hehehe")
        if row_1[1]["text"].strip() == "" and tmp_element != []:
            result.append(tmp_element)
            tmp_element = []
            continue
        tmp_element.append(row_1[1]["text"])
            


        # row_1_top_pixel = row_1[1]["top"]
        # result[row_1_top_pixel] = []
        # df_price_row = df_price.loc[abs(df_price["top"] - row_1_top_pixel) < config["receipt"]["pixel-distance"]]
        # if df_price_row is not None:
        #     for row_price in df_price_row.iterrows():
        #         row_price_text = row_price[1]["text"].replace(",", ".")
        #         if is_float(row_price_text) and row_price_text.strip() != "" and row_price_text.strip() is not None:
        #             result[row_1_top_pixel].append(row_price_text + "hehehe")
        # for row_2 in df_receipt.iterrows():
        #     top_pixel_diff = abs(row_1_top_pixel - row_2[1]["top"])
        #     row_text = row_2[1]["text"]
        #     if top_pixel_diff < config["receipt"]["pixel-distance"] and row_text != "":
        #         result[row_1_top_pixel].append(row_text)


    return result
receipt_2_data = get_item_data(receipt_2)
receipt_1_data = get_item_data(receipt_1)
receipt_3_data = get_item_data(receipt_3)

# print(pytesseract.image_to_string(receipt_1.image, config=f"--psm 6"))
receipt_1_data


[['AANTAL', 'OMSCHRIJVING', 'PRIUS', 'BEDRAG'],
 ['BONUSKAART', 'xx5571'],
 ['1', 'AH', 'WRAP', '3,75', '3.98hehehe'],
 ['3.98hehehe',
  '0.367KG',
  '3.98hehehe',
  'AH',
  '3.98hehehe',
  'PLUOT',
  '3.98hehehe',
  'LOS',
  '3.98hehehe',
  '3,98',
  '3.98hehehe',
  '1,46'],
 ['1', 'FRAMBOZEN', '2,89'],
 ['1', 'AH', 'BRAMEN', '2,79', 'B'],
 ['1', 'ARIZONA', '1,25'],
 ['+STATIEGELD', '0,15'],
 ['5', 'SUBTOTAAL', '12,29'],
 ['BONUS', 'AHBRAMEN', '-0,50']]