## Import packages

In [4]:
import json
import os
from difflib import SequenceMatcher
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

## Define constants

In [17]:
# Define path to the SROIE dataset
SROIE_FOLDER_PATH = Path("../data/SROIE")

# Example receipt used by the sanity-check cells below. Both paths are derived
# from the dataset root so that relocating the dataset is a single edit.
example_annotation = SROIE_FOLDER_PATH / "training_data" / "annotations" / "X00016469612.txt"
example_entity = SROIE_FOLDER_PATH / "training_data" / "entities" / "X00016469612.txt"

## Prepare data

In [94]:
def read_bbox_and_words(path: Path):
    """Parse one annotation file into a dataframe of text-line boxes.

    Every non-blank line of the file is expected to have the form
    ``x0,y0,x1,y1,x2,y2,x3,y3,text``: eight integer coordinates followed by
    the transcription, which may itself contain commas.

    Args:
        path: Annotation file to read. Its stem is stored in the ``filename``
            column of every row.

    Returns:
        A dataframe with the columns ``filename``, ``x0``, ``y0``, ``x2``,
        ``y2`` and ``line``. Only the first and third corner points are kept,
        and the four coordinates are stored as ``int16``.

    Raises:
        ValueError: If one of the first eight fields of a line cannot be
            converted to an integer.
    """
    bbox_and_words_list = []

    # Decode explicitly as UTF-8 instead of relying on the platform default;
    # bytes that cannot be decoded are dropped rather than aborting the file.
    with open(path, 'r', encoding="utf-8", errors="ignore") as f:
        for line in f.read().splitlines():
            # Whitespace-only lines carry no annotation and would otherwise
            # fail the integer conversion below.
            if not line.strip():
                continue

            split_lines = line.split(",")

            bbox = np.array(split_lines[0:8], dtype=np.int32)
            # The transcription may contain commas, so re-join everything
            # after the eight coordinates.
            text = ",".join(split_lines[8:])

            bbox_and_words_list.append([path.stem, *bbox, text])

    dataframe = pd.DataFrame(
        bbox_and_words_list,
        columns=["filename", "x0", "y0", "x1", "y1", "x2", "y2", "x3", "y3", "line"],
    )
    # Two opposite corners are enough for the boxes used downstream.
    dataframe = dataframe.drop(columns=["x1", "y1", "x3", "y3"])

    corner_columns = ["x0", "y0", "x2", "y2"]
    dataframe[corner_columns] = dataframe[corner_columns].astype(np.int16)

    return dataframe

def read_entities(path: Path):
    """Load one entity file (a JSON object) as a single-row dataframe.

    Args:
        path: File containing one JSON object whose keys become the columns.

    Returns:
        A dataframe with exactly one row holding the object's values.

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    return pd.DataFrame([data])

def assign_line_label(line: str, entities: pd.DataFrame):
    """Guess which entity, if any, a line of text belongs to.

    The line and each entity value are stripped of commas and split on
    whitespace. A line token counts as matched when its ``SequenceMatcher``
    ratio against any token of the entity exceeds 0.8. Entities are tried in
    column order and the first one that satisfies any of these rules wins:

    * ``ADDRESS``: at least half of the line's tokens are matched;
    * any other entity: every token of the line is matched;
    * any entity: the running match count reaches the number of tokens in
      the entity value.

    Args:
        line: Text of one line.
        entities: Single-row dataframe with one column per entity.

    Returns:
        The upper-cased column name of the first matching entity, or ``"O"``
        when no entity matches.
    """
    line_tokens = line.replace(",", "").strip().split()

    for position, column in enumerate(entities):
        value = entities.iloc[0, position]

        # A null / NaN entity has nothing to match against.
        if value is None or (isinstance(value, float) and value != value):
            continue

        entity_tokens = str(value).replace(",", "").strip().split()

        # Without this guard an empty entity would satisfy
        # ``matches_count == len(entity_tokens)`` (0 == 0) on the first token
        # and claim every line.
        if not entity_tokens:
            continue

        label = column.upper()
        is_address = label == 'ADDRESS'
        matches_count = 0

        for token in line_tokens:
            if any(SequenceMatcher(a=token, b=candidate).ratio() > 0.8 for candidate in entity_tokens):
                matches_count += 1

            # Checked after every token so the function can return as soon as
            # one of the rules is satisfied.
            if is_address:
                line_matched = (matches_count / len(line_tokens)) >= 0.5
            else:
                line_matched = matches_count == len(line_tokens)

            if line_matched or matches_count == len(entity_tokens):
                return label

    return "O"

def assign_labels(words: pd.DataFrame, entities: pd.DataFrame):
    """Label every line of a receipt, keeping a single DATE and TOTAL line.

    Each line is first labelled with ``assign_line_label``. Two positional
    rules then apply: an ``ADDRESS`` line is discarded once a ``TOTAL``
    candidate has been seen, and a ``COMPANY`` line is discarded once a
    ``DATE`` or ``TOTAL`` candidate has been seen. Among all ``DATE`` (and,
    separately, ``TOTAL``) candidates only the one whose box has the largest
    width + height keeps its label; the others become ``"O"``.

    Args:
        words: Line-level dataframe with a ``line`` column and the adjacent
            columns ``x0, y0, x2, y2``. It is modified in place: a ``label``
            column is added.
        entities: Single-row dataframe of entity values.

    Returns:
        The same ``words`` dataframe, now carrying the ``label`` column.
    """
    # Best (extent, row position) seen so far for the labels that may appear
    # only once; ``None`` means "no candidate yet". A sentinel index of -1
    # would silently relabel the *last* line when no candidate exists.
    best_candidate = {"TOTAL": None, "DATE": None}
    already_labeled = {"TOTAL": False,
                       "DATE": False,
                       "ADDRESS": False,
                       "COMPANY": False,
                       "O": False
    }

    labels = []
    x0_loc = words.columns.get_loc("x0")  # loop-invariant

    for i, line in enumerate(words['line']):
        label = assign_line_label(line, entities)
        already_labeled[label] = True

        if (label == "ADDRESS" and already_labeled["TOTAL"]) or \
           (label == "COMPANY" and (already_labeled["DATE"] or already_labeled["TOTAL"])):
            label = "O"

        if label in best_candidate:
            x0, y0, x2, y2 = words.iloc[i, x0_loc:x0_loc+4].to_list()
            # NOTE: width + height, not a true area; kept as-is so the
            # selected candidates do not change.
            extent = (x2 - x0) + (y2 - y0)

            current = best_candidate[label]
            if current is None or current[0] < extent:
                best_candidate[label] = (extent, i)

            # Every candidate is reset; the winner is restored after the loop.
            label = "O"

        labels.append(label)

    for unique_label in ("DATE", "TOTAL"):
        winner = best_candidate[unique_label]
        if winner is not None:
            labels[winner[1]] = unique_label

    words["label"] = labels
    return words

def split_line(line: pd.Series):
    """Split one line-level row into one row per word.

    The line's horizontal extent is shared out among the words in proportion
    to their character count, with a 5-pixel gap inserted after each word.
    Vertical coordinates and all other fields are copied unchanged.

    Args:
        line: Row with at least the fields ``x0``, ``x2`` and ``line``. The
            row itself is not modified.

    Returns:
        A list with one entry per space-separated word; each entry is the
        row's values as a list, in the row's field order, with ``x0``, ``x2``
        and ``line`` replaced by the word's values. Empty when the line has
        no words.
    """
    line_copy = line.copy()
    line_str = line_copy.loc["line"]

    # Consecutive spaces produce empty strings; drop them.
    words = [word for word in line_str.split(" ") if word]

    # Plain Python ints keep the arithmetic below free of fixed-width
    # integer overflow, whatever dtype the caller stored.
    x0, x2 = int(line_copy.loc["x0"]), int(line_copy.loc["x2"])
    right_edge = max(x0, x2)
    bbox_width = x2 - x0

    new_lines = []

    for word in words:
        x2 = x0 + int(bbox_width * len(word) / len(line_str))
        # With narrow characters the fixed 5 px gap can push a word past the
        # end of the line; clamp so no word box leaves the line's own box.
        line_copy["x0"] = min(x0, right_edge)
        line_copy["x2"] = min(x2, right_edge)
        line_copy["line"] = word
        new_lines.append(line_copy.to_list())
        x0 = x2 + 5

    return new_lines

def dataset_creator(folder: Path):
    """Build the word-level, labelled dataset for one split.

    For every annotation file in ``folder / "annotations"`` the entity file
    and image with the same stem are looked up in ``folder / "entities"`` and
    ``folder / "images"``. Lines are labelled with ``assign_labels``, split
    into words with ``split_line``, and each word of a labelled line is then
    re-checked against the tokens of its entity: it becomes ``"S-<LABEL>"``
    when its ``SequenceMatcher`` ratio against any token exceeds 0.7 and
    ``"O"`` otherwise.

    Args:
        folder: Split folder containing the ``annotations``, ``entities``
            and ``images`` sub-folders.

    Returns:
        A list with one ``[dataframe, width, height]`` entry per receipt,
        where the dataframe holds one row per word and ``width`` / ``height``
        are the image size in pixels. Receipts lacking an entity file or an
        image are skipped.
    """
    bbox_folder = folder / "annotations"
    entities_folder = folder / "entities"
    img_folder = folder / "images"

    bbox_files = sorted(bbox_folder.glob("*.txt"))
    # Pair the companion files by stem. Zipping three independently sorted
    # listings would silently shift every later pairing as soon as a single
    # file is missing from one of the folders.
    entities_by_stem = {path.stem: path for path in entities_folder.glob("*.txt")}
    images_by_stem = {path.stem: path for path in img_folder.glob("*.jpg")}

    data = []

    print("Reading dataset:")

    for bbox_file in tqdm(bbox_files, total=len(bbox_files)):
        stem = bbox_file.stem
        entities_file = entities_by_stem.get(stem)
        img_file = images_by_stem.get(stem)

        if entities_file is None or img_file is None:
            print(f"Skipping {stem}: missing entities file or image")
            continue

        entities = read_entities(entities_file)
        bbox_labeled = assign_labels(read_bbox_and_words(bbox_file), entities)

        # Only the size is needed; the context manager closes the image file
        # immediately instead of leaking one handle per receipt.
        with Image.open(img_file) as image:
            width, height = image.size

        new_bbox_l = []

        for _, row in bbox_labeled.iterrows():
            new_bbox_l += split_line(row)

        new_bbox = pd.DataFrame(new_bbox_l, columns=bbox_labeled.columns)
        new_bbox[["x0", "y0", "x2", "y2"]] = new_bbox[["x0", "y0", "x2", "y2"]].astype(np.int16)

        # Entity tokens are split once per label rather than once per word.
        tokens_by_label = {}
        word_labels = []

        for word, label in zip(new_bbox['line'], new_bbox['label']):
            if label != "O":
                if label not in tokens_by_label:
                    entity_values = entities.iloc[0, entities.columns.get_loc(label.lower())]
                    tokens_by_label[label] = entity_values.split()

                # A word inherits its line's label only if it resembles one
                # of the entity's own tokens.
                if any(SequenceMatcher(a=word, b=b).ratio() > 0.7 for b in tokens_by_label[label]):
                    label = "S-" + label
                else:
                    label = "O"

            word_labels.append(label)

        new_bbox['label'] = word_labels

        data.append([new_bbox, width, height])

    return data

def normalize(points: list, width: int, height: int) -> list:
    """Express a pixel box in thousandths of the image size.

    Args:
        points: The four values ``x0, y0, x2, y2``; each is converted with
            ``int`` before scaling.
        width: Image width, used to scale the x coordinates.
        height: Image height, used to scale the y coordinates.

    Returns:
        ``[x0, y0, x2, y2]`` where every coordinate is
        ``int(1000 * (coordinate / extent))``, i.e. truncated toward zero.
    """
    left, top, right, bottom = (int(value) for value in points)

    def to_thousandths(coordinate: int, extent: int) -> int:
        # Same operation order as a plain ratio-then-scale so that the
        # floating-point truncation is unchanged.
        return int(1000 * (coordinate / extent))

    return [
        to_thousandths(left, width),
        to_thousandths(top, height),
        to_thousandths(right, width),
        to_thousandths(bottom, height),
    ]

def write_dataset(dataset: list, output_dir: Path, name: str):
    """Write one split to three tab-separated text files.

    For every word of every receipt one line is written to each file:

    * ``<name>.txt``: ``word<TAB>label``
    * ``<name>_box.txt``: ``word<TAB>x0 y0 x2 y2`` using the box returned by
      ``normalize``
    * ``<name>_image.txt``: ``word<TAB>x0 y0 x2 y2<TAB>width height<TAB>filename``
      using the pixel box

    A blank line follows each receipt in all three files.

    Args:
        dataset: Sequence of ``[dataframe, width, height]`` entries; each
            dataframe needs the columns ``filename``, ``line``, ``label``,
            ``x0``, ``y0``, ``x2`` and ``y2``.
        output_dir: Existing directory that receives the files.
        name: Split name used as the file-name prefix.
    """
    print(f"Writing {name}ing dataset:")

    # Plain "w" is enough: the files are only written, never read back.
    with open(output_dir / f"{name}.txt", "w", encoding="utf8") as file, \
        open(output_dir / f"{name}_box.txt", "w", encoding="utf8") as file_bbox, \
        open(output_dir / f"{name}_image.txt", "w", encoding="utf8") as file_image:

        for data, width, height in tqdm(dataset, total=len(dataset)):
            # A receipt without words has no first row to take the file name
            # from and would only contribute an empty record.
            if data.empty:
                continue

            filename = data.iloc[0, data.columns.get_loc('filename')]

            # Iterating the columns directly avoids building a Series per row.
            rows = zip(data['line'], data['label'], data['x0'], data['y0'], data['x2'], data['y2'])

            for word, label, *corners in rows:
                bbox = [int(p) for p in corners]
                normalized_bbox = normalize(bbox, width, height)

                file.write("{}\t{}\n".format(word, label))
                file_bbox.write("{}\t{} {} {} {}\n".format(word, *normalized_bbox))
                file_image.write("{}\t{} {} {} {}\t{} {}\t{}\n".format(word, *bbox, width, height, filename))

            file.write("\n")
            file_bbox.write("\n")
            file_image.write("\n")

In [95]:
# Show the first five raw lines of the example annotation file, then the
# first five rows of the dataframe parsed from it.
bbox_file_path = example_annotation
print("== File content ==")
!head -n 5 "{bbox_file_path}"

bbox = read_bbox_and_words(path=bbox_file_path)
print("\n== Dataframe ==")
bbox.head(5)

== File content ==
72,25,326,25,326,64,72,64,TAN WOON YANN
50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND
205,121,285,121,285,139,205,139,789417-W
110,144,383,144,383,163,110,163,NO.53 55,57 & 59, JALAN SAGU 18,
192,169,299,169,299,187,192,187,TAMAN DAYA,

== Dataframe ==


Unnamed: 0,filename,x0,y0,x2,y2,line
0,X00016469612,72,25,326,64,TAN WOON YANN
1,X00016469612,50,82,440,121,BOOK TA .K(TAMAN DAYA) SDN BND
2,X00016469612,205,121,285,139,789417-W
3,X00016469612,110,144,383,163,"NO.53 55,57 & 59, JALAN SAGU 18,"
4,X00016469612,192,169,299,187,"TAMAN DAYA,"


In [96]:
# Show the raw example entity file, then the single-row dataframe parsed
# from it.
entities_file_path = example_entity
print("== File content ==")
!head "{entities_file_path}"

entities = read_entities(path=entities_file_path)
print("\n\n== Dataframe ==")
entities

== File content ==
{
    "company": "BOOK TA .K (TAMAN DAYA) SDN BHD",
    "date": "25/12/2018",
    "address": "NO.53 55,57 & 59, JALAN SAGU 18, TAMAN DAYA, 81100 JOHOR BAHRU, JOHOR.",
    "total": "9.00"
}

== Dataframe ==


Unnamed: 0,company,date,address,total
0,BOOK TA .K (TAMAN DAYA) SDN BHD,25/12/2018,"NO.53 55,57 & 59, JALAN SAGU 18, TAMAN DAYA, 8...",9.0


In [97]:
# Spot check: label a single line (row 1 of the example annotation).
line = bbox.loc[1,"line"]
label = assign_line_label(line, entities)
print("Line:", line)
print("Assigned label:", label)

Line: BOOK TA .K(TAMAN DAYA) SDN BND
Assigned label: O


In [98]:
# Label every line of the example receipt. assign_labels adds the "label"
# column to `bbox` itself and returns that same dataframe.
bbox_labeled = assign_labels(bbox, entities)
bbox_labeled.head(15)

Unnamed: 0,filename,x0,y0,x2,y2,line,label
0,X00016469612,72,25,326,64,TAN WOON YANN,O
1,X00016469612,50,82,440,121,BOOK TA .K(TAMAN DAYA) SDN BND,O
2,X00016469612,205,121,285,139,789417-W,O
3,X00016469612,110,144,383,163,"NO.53 55,57 & 59, JALAN SAGU 18,",ADDRESS
4,X00016469612,192,169,299,187,"TAMAN DAYA,",COMPANY
5,X00016469612,162,193,334,211,"81100 JOHOR BAHRU,",ADDRESS
6,X00016469612,217,216,275,233,JOHOR.,ADDRESS
7,X00016469612,50,342,279,359,DOCUMENT NO : TD01167104,O
8,X00016469612,50,372,96,390,DATE:,O
9,X00016469612,165,372,342,389,25/12/2018 8:13:39 PM,DATE


In [99]:
# Split row 1 into one row per word and show it next to the original row.
new_lines = split_line(bbox_labeled.loc[1])
print("Original row:")
display(bbox_labeled.loc[1:1,:])

print("Splitted row:")
pd.DataFrame(new_lines, columns=bbox_labeled.columns)

Original row:


Unnamed: 0,filename,x0,y0,x2,y2,line,label
1,X00016469612,50,82,440,121,BOOK TA .K(TAMAN DAYA) SDN BND,O


Splitted row:


Unnamed: 0,filename,x0,y0,x2,y2,line,label
0,X00016469612,50,82,102,121,BOOK,O
1,X00016469612,107,82,133,121,TA,O
2,X00016469612,138,82,242,121,.K(TAMAN,O
3,X00016469612,247,82,312,121,DAYA),O
4,X00016469612,317,82,356,121,SDN,O
5,X00016469612,361,82,400,121,BND,O


In [100]:
# Build the word-level datasets for both splits; each result is a list with
# one [dataframe, width, height] entry per receipt.
dataset_train = dataset_creator(SROIE_FOLDER_PATH / "training_data")
dataset_test = dataset_creator(SROIE_FOLDER_PATH / "testing_data")

Reading dataset:


100%|█████████████████████████████████████████| 626/626 [00:36<00:00, 17.09it/s]


Reading dataset:


100%|█████████████████████████████████████████| 347/347 [00:19<00:00, 17.59it/s]


In [103]:
# Folder that receives the generated text files.
dataset_directory = Path("../data/SROIE/", "layoutlm_data")

dataset_directory.mkdir(parents=True, exist_ok=True)

# Each call writes <name>.txt, <name>_box.txt and <name>_image.txt.
write_dataset(dataset_train, dataset_directory, 'train')
write_dataset(dataset_test, dataset_directory, 'test')

labels = ['COMPANY', 'DATE', 'ADDRESS', 'TOTAL']
IOB_tags = ['S']

# labels.txt holds one "<tag>-<label>" entry per line, followed by "O"
# without a trailing newline.
with open(dataset_directory / 'labels.txt', 'w') as f:
    for tag in IOB_tags:
        for label in labels:
            f.write(f"{tag}-{label}\n")
            
    f.write("O")

Writing training dataset:


100%|█████████████████████████████████████████| 626/626 [00:26<00:00, 23.84it/s]


Writing testing dataset:


100%|█████████████████████████████████████████| 347/347 [00:14<00:00, 23.34it/s]


## Install LayoutLM

In [None]:
%%bash
git clone https://github.com/microsoft/unilm.git
cd unilm/layoutlm/deprecated
pip install .

## Train LayoutLM

In [5]:
# Move to the script directory
# NOTE(review): absolute, machine-specific path; adjust it to wherever the
# unilm repository was cloned before running this cell.
os.chdir("/home/faranio/Desktop/EDISS/Thesis/implementation/notebooks/unilm/layoutlm/deprecated/examples/seq_labeling")

In [7]:
# Code for running training on CPU
! python run_seq_labeling.py \
                            --data_dir /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm_data \
                            --labels /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm_data/labels.txt \
                            --model_name_or_path /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm-base-uncased \
                            --model_type layoutlm \
                            --max_seq_length 512 \
                            --do_lower_case \
                            --do_train \
                            --num_train_epochs 10 \
                            --logging_steps 50 \
                            --save_steps -1 \
                            --output_dir output \
                            --overwrite_output_dir \
                            --per_gpu_train_batch_size 2 \
                            --per_gpu_eval_batch_size 2 \
                            --no_cuda

Traceback (most recent call last):
  File "run_seq_labeling.py", line 28, in <module>
    import torch
ImportError: No module named torch


In [None]:
# Code for running training on GPU
! python run_seq_labeling.py \
                            --data_dir /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm_data \
                            --labels /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm_data/labels.txt \
                            --model_name_or_path /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm-base-uncased \
                            --model_type layoutlm \
                            --max_seq_length 512 \
                            --do_lower_case \
                            --do_train \
                            --num_train_epochs 10 \
                            --logging_steps 50 \
                            --save_steps -1 \
                            --output_dir output \
                            --overwrite_output_dir \
                            --per_gpu_train_batch_size 2 \
                            --per_gpu_eval_batch_size 2

## Evaluate LayoutLM

In [None]:
# Evaluate for test set and make predictions on CPU
! python run_seq_labeling.py \
                            --data_dir /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm_data \
                            --labels /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm_data/labels.txt \
                            --model_name_or_path /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm-base-uncased \
                            --model_type layoutlm \
                            --do_lower_case \
                            --max_seq_length 512 \
                            --do_predict \
                            --logging_steps 10 \
                            --save_steps -1 \
                            --output_dir output \
                            --per_gpu_eval_batch_size 2 \
                            --no_cuda

In [None]:
# Evaluate for test set and make predictions on GPU
! python run_seq_labeling.py \
                            --data_dir /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm_data \
                            --labels /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm_data/labels.txt \
                            --model_name_or_path /home/faranio/Desktop/EDISS/Thesis/implementation/data/SROIE/layoutlm-base-uncased \
                            --model_type layoutlm \
                            --do_lower_case \
                            --max_seq_length 512 \
                            --do_predict \
                            --logging_steps 10 \
                            --save_steps -1 \
                            --output_dir output \
                            --per_gpu_eval_batch_size 1

<h3>Results</h3>

**F1-Score**: 0.9545976069496803<br>
**Loss**: 0.10252952879456916<br>
**Precision**: 0.9436163318211277<br>
**Recall**: 0.965837479270315

In [None]:
cat output/test_results.txt

## Visualize