## Convert from the existing format (Sparrow labeling) to the desired format (CORD dataset)

In [7]:
import os
import json

from tqdm import tqdm
from PIL import Image
import jsonlines


In [8]:
# First collect all converted receipts in one list, then split that list into train/validation/test.



In [9]:
#Following the example data from https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/Donut/CORD/Fine_tune_Donut_on_a_custom_dataset_(CORD)_with_PyTorch_Lightning.ipynb#scrollTo=ok4IudPaFhqi

# Read the names of the label files.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the receipt order (and therefore the positional train/validation/test split)
# reproducible between runs.
# The "key" folder is not a receipt; filtering it out (instead of list.remove)
# avoids a ValueError when that folder does not exist.
labeledReceipts = sorted(name for name in os.listdir("./docs/json") if name != "key")



def _rowSortKey(label):
    """Sort key for row labels: integer-like labels are ordered numerically.

    Labels that cannot be converted to an integer sort before all numeric
    labels, so the highest numbered row is always last when any numeric label
    exists.
    """
    try:
        return (1, int(label), str(label))
    except (TypeError, ValueError):
        return (0, 0, str(label))


def getMenuListAndTotal(jsonObj):
    """Extract the menu items and the total price from one labelled receipt.

    Every entry of ``jsonObj["words"]`` is a box with a ``"label"`` (the row it
    belongs to), a ``"value"`` (its text) and a ``"rect"`` holding ``"x1"``.
    The first two boxes of a row are treated as product and price: the one with
    the smaller ``x1`` is the product. The row with the highest label is the
    total row and is not added to the menu.

    Returns:
        ``(menuList, total)`` where ``menuList`` is a list of
        ``{"nm": ..., "cnt": "", "price": ...}`` dicts (``cnt`` is always the
        empty string) in ascending row order, and ``total`` is the price text
        of the total row (``0`` if that row is incomplete).
        ``(None, None)`` when the receipt has no boxes at all.
    """
    # Group the boxes per row in a single pass, keeping their original order.
    rows = {}
    for box in jsonObj["words"]:
        rows.setdefault(box["label"], []).append(box)
    if not rows:
        return None, None

    # Labels are usually strings such as "1", "2", ..., "10". A plain max()
    # would compare them lexicographically ("9" > "10") and pick the wrong
    # total row, so they are ordered numerically instead.
    orderedLabels = sorted(rows, key=_rowSortKey)
    totalLabel = orderedLabels[-1]

    menuList = []
    total = 0
    for rowLabel in orderedLabels:
        boxes = rows[rowLabel]
        if len(boxes) < 2:
            # A row needs both a product and a price box; skip it instead of
            # failing with an IndexError.
            print("SKIPPING ROW " + str(rowLabel) + " SINCE IT DOES NOT HAVE BOTH A PRODUCT AND A PRICE BOX")
            continue

        first, second = boxes[0], boxes[1]
        if first["rect"]["x1"] < second["rect"]["x1"]:  # product is further to the left -> smaller x1
            productBox, priceBox = first, second
        else:
            productBox, priceBox = second, first

        if rowLabel == totalLabel:  # last row is the total, do not add it as a menu item
            total = priceBox["value"]
        else:
            menuList.append({"nm": productBox["value"], "cnt": "", "price": priceBox["value"]})

    return menuList, total

def getTempFromBox(box):
    """Build one ``valid_line`` entry (CORD layout) from a labelled box.

    The box rectangle is given by two opposite corners, ``(x1, y1)`` and
    ``(x2, y2)``. The resulting quad uses those values as follows:
    point 1 = (rect x1, rect y1), point 2 = (rect x2, rect y1),
    point 3 = (rect x2, rect y2), point 4 = (rect x1, rect y2).

    The box label is used as both ``row_id`` and ``group_id``; the box value
    becomes the word text. The category is always ``"menu"``.
    """
    rect = box["rect"]
    xFirst, xSecond = rect["x1"], rect["x2"]
    yFirst, ySecond = rect["y1"], rect["y2"]
    rowLabel = box["label"]

    # Key order is kept exactly as before because the dict is later serialised
    # with json.dumps, which preserves insertion order.
    quad = {
        "x2": xSecond,
        "y3": ySecond,
        "x3": xSecond,
        "y4": ySecond,
        "x1": xFirst,
        "y1": yFirst,
        "x4": xFirst,
        "y2": yFirst,
    }
    word = {
        "quad": quad,
        "is_key": 0,  # always 0
        "row_id": rowLabel,
        "text": box["value"],
    }
    return {
        "words": [word],
        "category": "menu",
        "group_id": rowLabel,
        "sub_group_id": 0,  # always 0
    }



def getFullImgPath(imageName):
    """Return the path of ``imageName`` inside ``./docs/images/``.

    Prints a message and returns ``None`` when no file or folder exists at
    that path.
    """
    candidate = "./docs/images/" + imageName
    if not os.path.exists(candidate):
        print("DID NOT FIND PICTURE WITH NAME: " + imageName)
        return None
    return candidate

def prepareData():
    """Convert every receipt in ``labeledReceipts`` and collect the results.

    Receipts for which ``prepareReceipt`` returns ``None`` are left out.

    Returns:
        A list with one dict per converted receipt, as returned by
        ``prepareReceipt``.
    """
    dictArr = []
    # tqdm wraps the list itself (not the enumerate object) so it knows the
    # total number of receipts and can show a percentage and an ETA.
    for idx, receipt in enumerate(tqdm(labeledReceipts)):
        newJsonObject = prepareReceipt(receipt, index=idx)
        if newJsonObject is not None:
            dictArr.append(newJsonObject)
    return dictArr

def prepareReceipt(receipt, index):
    """Convert one labelled receipt file into a metadata entry.

    Args:
        receipt: File name of the label file inside ``./docs/json/``.
        index: Position of the receipt in the list being processed. Kept for
            backward compatibility with existing callers; it is no longer used
            to build the file name (see the note at the end of the function).

    Returns:
        ``{"file_name": <image name>, "ground_truth": <JSON string>}``, or
        ``None`` when the receipt is skipped: it is wider than it is tall, it
        has no labelled boxes, or its image is not found.
    """
    # Read the label file; the context manager closes the handle again.
    with open("./docs/json/" + receipt) as jsonFile:
        jsonObj = json.load(jsonFile)

    imageSize = jsonObj["meta"]["image_size"]
    width, height = imageSize["width"], imageSize["height"]
    if width > height:
        print("IGNORING RECEIPT SINCE IT IS NOT IN PORTRAIT MODE")
        return None

    menuList, total = getMenuListAndTotal(jsonObj)
    if menuList is None:
        return None

    newJsonObj = {
        # menu list and total
        "gt_parse": {"menu": menuList, "total": {"total_price": total}},
        "valid_line": [getTempFromBox(box) for box in jsonObj["words"]],
        "roi": dict(),
        "repeating_symbol": dict(),
        "dontcare": dict(),
    }

    # splitext only strips the final extension, so names that contain extra
    # dots (e.g. "receipt.v2.json") still map to the right image name.
    imageName = os.path.splitext(receipt)[0] + ".jpg"
    if getFullImgPath(imageName) is None:
        # getFullImgPath already printed which image is missing; a receipt
        # without an image cannot be part of the dataset.
        return None

    # The entry carries the real image name. A name built from the loop index
    # only matches the source image when files happen to be named after their
    # position, so any later lookup by "file_name" could pick the wrong image
    # or none at all.
    return {"file_name": imageName, "ground_truth": json.dumps(newJsonObj)}
        

In [10]:
# Convert all labelled receipts; each entry is a dict with "file_name" and
# "ground_truth" (a JSON string), as returned by prepareReceipt.
dataset = prepareData()
# Bare last expression: shows the complete list as the cell output.
dataset

3it [00:00, 88.31it/s]

IGNORING RECEIPT SINCE IT IS NOT IN PORTRAIT MODE





[{'file_name': '1.jpg',
  'ground_truth': '{"gt_parse": {"menu": [{"nm": "LETTMELK 0,5 %", "cnt": "", "price": "18,90"}, {"nm": "LETTMELK 0,5 %", "cnt": "", "price": "18,90"}], "total": {"total_price": "12,40"}}, "valid_line": [{"words": [{"quad": {"x2": 1073, "y3": 1259, "x3": 1073, "y4": 1259, "x1": 319, "y1": 1132, "x4": 319, "y2": 1132}, "is_key": 0, "row_id": "1", "text": "LETTMELK 0,5 %"}], "category": "menu", "group_id": "1", "sub_group_id": 0}, {"words": [{"quad": {"x2": 2308, "y3": 1228, "x3": 2308, "y4": 1228, "x1": 2051, "y1": 1116, "x4": 2051, "y2": 1116}, "is_key": 0, "row_id": "1", "text": "18,90"}], "category": "menu", "group_id": "1", "sub_group_id": 0}, {"words": [{"quad": {"x2": 1058, "y3": 1351, "x3": 1058, "y4": 1351, "x1": 346, "y1": 1255, "x4": 346, "y2": 1255}, "is_key": 0, "row_id": "2", "text": "LETTMELK 0,5 %"}], "category": "menu", "group_id": "2", "sub_group_id": 0}, {"words": [{"quad": {"x2": 2301, "y3": 1336, "x3": 2301, "y4": 1336, "x1": 2047, "y1": 1232,

In [11]:
# Root output folder; the train/validation/test subfolders are created inside it.
DATA_FOLDER_NAME = "TEST_DATA"

In [12]:
# Positional 80/10/10 split of the dataset into train/validation/test.
valIdx = int(len(dataset) * 0.8)
testIdx = int(len(dataset) * 0.9)

train_dataset = dataset[:valIdx]
val_dataset = dataset[valIdx:testIdx]
test_dataset = dataset[testIdx:]


def writeSplit(subfolder, splitDataset):
    """Write one split to ``DATA_FOLDER_NAME/<subfolder>``.

    Every image is saved into the split folder under its ``file_name``, and
    ``metadata.jsonl`` is written next to the images. Entries whose image is
    not found are skipped (``getFullImgPath`` prints their name) and are left
    out of the metadata, so the metadata never references a missing file.
    """
    splitDir = os.path.join(DATA_FOLDER_NAME, subfolder)
    os.makedirs(splitDir, exist_ok=True)  # also creates DATA_FOLDER_NAME if needed

    writtenEntries = []
    for data in tqdm(splitDataset, desc=subfolder):
        imgName = data["file_name"]
        imgPath = getFullImgPath(imgName)
        if imgPath is None:
            continue
        # Images and metadata share the same folder; the context manager
        # closes the source image after it has been saved.
        with Image.open(imgPath) as img:
            img.save(os.path.join(splitDir, imgName))
        writtenEntries.append(data)

    with jsonlines.open(os.path.join(splitDir, "metadata.jsonl"), "w") as writer:
        writer.write_all(writtenEntries)


# The same folder name is used for creating the folder, the metadata and the
# images of each split ("validation", not "val").
for subfolder, splitDataset in [
    ("train", train_dataset),
    ("validation", val_dataset),
    ("test", test_dataset),
]:
    writeSplit(subfolder, splitDataset)
    


100%|██████████| 1/1 [00:00<00:00, 11.00it/s]


1.jpg


0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00,  8.10it/s]


In [13]:
# Display the list of converted receipts again.
dataset

[{'file_name': '1.jpg',
  'ground_truth': '{"gt_parse": {"menu": [{"nm": "LETTMELK 0,5 %", "cnt": "", "price": "18,90"}, {"nm": "LETTMELK 0,5 %", "cnt": "", "price": "18,90"}], "total": {"total_price": "12,40"}}, "valid_line": [{"words": [{"quad": {"x2": 1073, "y3": 1259, "x3": 1073, "y4": 1259, "x1": 319, "y1": 1132, "x4": 319, "y2": 1132}, "is_key": 0, "row_id": "1", "text": "LETTMELK 0,5 %"}], "category": "menu", "group_id": "1", "sub_group_id": 0}, {"words": [{"quad": {"x2": 2308, "y3": 1228, "x3": 2308, "y4": 1228, "x1": 2051, "y1": 1116, "x4": 2051, "y2": 1116}, "is_key": 0, "row_id": "1", "text": "18,90"}], "category": "menu", "group_id": "1", "sub_group_id": 0}, {"words": [{"quad": {"x2": 1058, "y3": 1351, "x3": 1058, "y4": 1351, "x1": 346, "y1": 1255, "x4": 346, "y2": 1255}, "is_key": 0, "row_id": "2", "text": "LETTMELK 0,5 %"}], "category": "menu", "group_id": "2", "sub_group_id": 0}, {"words": [{"quad": {"x2": 2301, "y3": 1336, "x3": 2301, "y4": 1336, "x1": 2047, "y1": 1232,