## Import packages

In [1]:
import re
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
import torch
from PIL import Image
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
from img2vec_pytorch import Img2Vec
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## Define constants

In [3]:
# Define paths to SROIE dataset files
TRAIN_IMAGES_PATH = Path("../data/SROIE/training_data/images")
TRAIN_LABELS_PATH = Path("../data/SROIE/layoutlm_data/train.txt")
TRAIN_TEXTS_PATH = Path("../data/SROIE/layoutlm_data/train_image.txt")

TEST_IMAGES_PATH = Path("../data/SROIE/testing_data/images")
TEST_LABELS_PATH = Path("../data/SROIE/layoutlm_data/test.txt")
TEST_TEXTS_PATH = Path("../data/SROIE/layoutlm_data/test_image.txt")

## Prepare data

In [4]:
def read_texts_file(df, texts_path, images_path):
    with open(texts_path, 'r') as f:
        while True:
            line = f.readline()

            if not line:
                break

            line = line.replace('\n', '')

            if len(line) == 0:
                continue

            text, bbox, dims, image = line.split('\t')
            image_path = images_path /(image + ".jpg")
            
            bbox = [int(x) for x in bbox.split()]
            x1, y1, x2, y2 = bbox
            
            dims = dims.split()

            df["image_path"].append(image_path)
            df["text"].append(text)
            df["width"].append(int(dims[0]))
            df["height"].append(int(dims[1]))
            
            df["x1"].append(x1)
            df["y1"].append(y1)
            df["x2"].append(x2)
            df["y2"].append(y2)
            
    return df

def read_labels_file(df, labels_path):
    with open(labels_path, 'r') as f:
        while True:
            line = f.readline()

            if not line:
                break

            line = line.replace('\n', '')

            if len(line) == 0:
                continue

            _, label = line.split('\t')
            df["label"].append(label)
            
    return df

def read_dataset(texts_path, labels_path, images_path):
    df = {
        "label": [],
        "text": [], 
        "x1": [],
        "y1": [],
        "x2": [],
        "y2": [],
        "width": [],
        "height": [],
        "image_path": []
    }
    
    df = read_texts_file(df, texts_path, images_path)
    df = read_labels_file(df, labels_path)

    df = pd.DataFrame(df)
    return df

In [5]:
train_df = read_dataset(TRAIN_TEXTS_PATH, TRAIN_LABELS_PATH, TRAIN_IMAGES_PATH)
train_df = train_df[~((train_df["x2"] - train_df["x1"]) <= 0)]
train_df

Unnamed: 0,label,text,x1,y1,x2,y2,width,height,image_path
0,O,TAN,72,25,130,64,463,1013,../data/SROIE/training_data/images/X0001646961...
1,O,WOON,135,25,213,64,463,1013,../data/SROIE/training_data/images/X0001646961...
2,O,YANN,218,25,296,64,463,1013,../data/SROIE/training_data/images/X0001646961...
3,O,BOOK,50,82,102,121,463,1013,../data/SROIE/training_data/images/X0001646961...
4,O,TA,107,82,133,121,463,1013,../data/SROIE/training_data/images/X0001646961...
...,...,...,...,...,...,...,...,...,...
72385,O,ANY,189,837,214,853,619,875,../data/SROIE/training_data/images/X5100945380...
72386,O,"ENQUIRY,",219,837,288,853,619,875,../data/SROIE/training_data/images/X5100945380...
72387,O,PLEASE,293,837,344,853,619,875,../data/SROIE/training_data/images/X5100945380...
72388,O,CONTACT,349,837,409,853,619,875,../data/SROIE/training_data/images/X5100945380...


In [6]:
test_df = read_dataset(TEST_TEXTS_PATH, TEST_LABELS_PATH, TEST_IMAGES_PATH)
test_df = test_df[~((test_df["x2"] - test_df["x1"]) <= 0)]
test_df

Unnamed: 0,label,text,x1,y1,x2,y2,width,height,image_path
0,O,TAN,98,26,153,66,463,894,../data/SROIE/testing_data/images/X00016469670...
1,O,CHAY,158,26,232,66,463,894,../data/SROIE/testing_data/images/X00016469670...
2,O,YEE,237,26,292,66,463,894,../data/SROIE/testing_data/images/X00016469670...
3,O,***,138,95,173,120,463,894,../data/SROIE/testing_data/images/X00016469670...
4,O,COPY,178,95,225,120,463,894,../data/SROIE/testing_data/images/X00016469670...
...,...,...,...,...,...,...,...,...,...
39677,O,FOR,405,1589,443,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...
39678,O,REFUND,448,1589,525,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...
39679,O,OR,530,1589,555,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...
39680,O,EXCHANGE,560,1589,663,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...


## Extract embedding vectors

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
def extract_text_embedding(text: str, model):
    embedding = model.encode([text], device=device)
    embedding = embedding[0]
    embedding = torch.FloatTensor(embedding)
    return embedding

In [9]:
def extract_image_embedding(image, model):
    PIL_image = Image.fromarray(np.uint8(image))
    embedding = model.get_vec(PIL_image, tensor=True)
    embedding = embedding.squeeze()
    return embedding

In [10]:
# Define pre-trained image encoder
image_model = Img2Vec(cuda=True)



In [11]:
# Define pre-trained text encoder
# text_model = SentenceTransformer("bert-base-nli-mean-tokens", device=device)
text_model = SentenceTransformer("all-mpnet-base-v2", device=device)
# text_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

In [12]:
def add_text_frequencies(df):
    texts_df = []

    for i, row in tqdm(df.iterrows()):
        text_length = len(row["text"])
        alpha_count = len(re.findall(r"[A-Za-z]", row["text"]))
        num_count = len(re.findall(r"[0-9]", row["text"]))

        text_frequencies = {"text_length": text_length, "alpha_count": alpha_count, "num_count": num_count}
        texts_df.append(text_frequencies)

    texts_df = pd.DataFrame.from_dict(texts_df)
    texts_df.index = df.index
    df = pd.concat([df, texts_df], axis=1)
    return df

In [13]:
def add_text_embeddings(df, model):
    texts_df = []

    for i, row in tqdm(df.iterrows()):
        text_embedding = extract_text_embedding(row["text"], model)
        text_embedding = {k: tensor.item() for k, tensor in enumerate(text_embedding)}
        texts_df.append(text_embedding)
        
    texts_df = pd.DataFrame.from_dict(texts_df)
    texts_df.columns = [f"text_{x}" for x in texts_df.columns]
    texts_df.index = df.index
    df.drop(columns=["text"], inplace=True)
    df = pd.concat([df, texts_df], axis=1)
    return df

In [14]:
def add_image_embeddings(df, model, buffer=50):
    images_df = []
    
    for i, row in tqdm(df.iterrows()):
        x1 = row["x1"]
        y1 = row["y1"]
        x2 = row["x2"]
        y2 = row["y2"]
        
        try:            
            image = cv2.imread(str(row["image_path"]))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            
            field_image = image[
                max(0, y1 - buffer):min(row["height"], y2 + buffer),
                max(0, x1 - buffer):min(row["width"], x2 + buffer),
                :
            ]
            
            image_embedding = extract_image_embedding(field_image, model)
            image_embedding = {k: tensor.item() for k, tensor in enumerate(image_embedding)}
            images_df.append(image_embedding)
        except Exception as e:
            print("Exception:", e)
            print(field_image.shape, image.shape, row)
            break
        
    images_df = pd.DataFrame.from_dict(images_df)
    images_df.columns = [f"image_{x}" for x in images_df.columns]
    images_df.index = df.index
    df = pd.concat([df, images_df], axis=1)
    return df

In [15]:
train_df = add_text_frequencies(train_df)
train_df

72389it [00:03, 18842.93it/s]


Unnamed: 0,label,text,x1,y1,x2,y2,width,height,image_path,text_length,alpha_count,num_count
0,O,TAN,72,25,130,64,463,1013,../data/SROIE/training_data/images/X0001646961...,3,3,0
1,O,WOON,135,25,213,64,463,1013,../data/SROIE/training_data/images/X0001646961...,4,4,0
2,O,YANN,218,25,296,64,463,1013,../data/SROIE/training_data/images/X0001646961...,4,4,0
3,O,BOOK,50,82,102,121,463,1013,../data/SROIE/training_data/images/X0001646961...,4,4,0
4,O,TA,107,82,133,121,463,1013,../data/SROIE/training_data/images/X0001646961...,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
72385,O,ANY,189,837,214,853,619,875,../data/SROIE/training_data/images/X5100945380...,3,3,0
72386,O,"ENQUIRY,",219,837,288,853,619,875,../data/SROIE/training_data/images/X5100945380...,8,7,0
72387,O,PLEASE,293,837,344,853,619,875,../data/SROIE/training_data/images/X5100945380...,6,6,0
72388,O,CONTACT,349,837,409,853,619,875,../data/SROIE/training_data/images/X5100945380...,7,7,0


In [16]:
test_df = add_text_frequencies(test_df)
test_df

39680it [00:02, 18316.30it/s]


Unnamed: 0,label,text,x1,y1,x2,y2,width,height,image_path,text_length,alpha_count,num_count
0,O,TAN,98,26,153,66,463,894,../data/SROIE/testing_data/images/X00016469670...,3,3,0
1,O,CHAY,158,26,232,66,463,894,../data/SROIE/testing_data/images/X00016469670...,4,4,0
2,O,YEE,237,26,292,66,463,894,../data/SROIE/testing_data/images/X00016469670...,3,3,0
3,O,***,138,95,173,120,463,894,../data/SROIE/testing_data/images/X00016469670...,3,0,0
4,O,COPY,178,95,225,120,463,894,../data/SROIE/testing_data/images/X00016469670...,4,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
39677,O,FOR,405,1589,443,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,3,3,0
39678,O,REFUND,448,1589,525,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,6,6,0
39679,O,OR,530,1589,555,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,2,2,0
39680,O,EXCHANGE,560,1589,663,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,8,8,0


In [17]:
train_df = add_image_embeddings(train_df, image_model)
train_df

50085it [35:40,  3.41it/s]Corrupt JPEG data: bad Huffman code
50086it [35:40,  3.34it/s]Corrupt JPEG data: bad Huffman code
50087it [35:40,  3.30it/s]Corrupt JPEG data: bad Huffman code
50088it [35:41,  3.20it/s]Corrupt JPEG data: bad Huffman code
50089it [35:41,  3.22it/s]Corrupt JPEG data: bad Huffman code
50090it [35:41,  3.23it/s]Corrupt JPEG data: bad Huffman code
50091it [35:42,  3.25it/s]Corrupt JPEG data: bad Huffman code
50092it [35:42,  3.26it/s]Corrupt JPEG data: bad Huffman code
50093it [35:42,  3.27it/s]Corrupt JPEG data: bad Huffman code
50094it [35:43,  3.28it/s]Corrupt JPEG data: bad Huffman code
50095it [35:43,  3.29it/s]Corrupt JPEG data: bad Huffman code
50096it [35:43,  3.31it/s]Corrupt JPEG data: bad Huffman code
50097it [35:43,  3.30it/s]Corrupt JPEG data: bad Huffman code
50098it [35:44,  3.29it/s]Corrupt JPEG data: bad Huffman code
50099it [35:44,  3.32it/s]Corrupt JPEG data: bad Huffman code
50100it [35:44,  3.37it/s]Corrupt JPEG data: bad Huffman code
50101it 

Unnamed: 0,label,text,x1,y1,x2,y2,width,height,image_path,text_length,...,image_502,image_503,image_504,image_505,image_506,image_507,image_508,image_509,image_510,image_511
0,O,TAN,72,25,130,64,463,1013,../data/SROIE/training_data/images/X0001646961...,3,...,0.020048,0.058405,0.000000,0.366231,0.000000,0.000000,0.144254,0.029488,0.000000,0.162789
1,O,WOON,135,25,213,64,463,1013,../data/SROIE/training_data/images/X0001646961...,4,...,0.000000,0.122239,0.000000,0.246217,0.000000,0.000000,0.104884,0.023457,0.008069,0.315111
2,O,YANN,218,25,296,64,463,1013,../data/SROIE/training_data/images/X0001646961...,4,...,0.000000,0.234890,0.000000,0.550147,0.001890,0.000000,0.024377,0.025778,0.000000,0.129966
3,O,BOOK,50,82,102,121,463,1013,../data/SROIE/training_data/images/X0001646961...,4,...,0.009856,0.045854,0.004641,0.430816,0.003953,0.217625,0.127876,0.429837,0.031099,0.034185
4,O,TA,107,82,133,121,463,1013,../data/SROIE/training_data/images/X0001646961...,2,...,0.000000,0.000000,0.000000,0.462654,0.000000,0.109754,0.079724,0.000000,0.014735,0.122793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72385,O,ANY,189,837,214,853,619,875,../data/SROIE/training_data/images/X5100945380...,3,...,0.000000,0.070859,0.000000,0.434639,0.000000,0.115011,0.010761,0.029904,0.013856,0.070069
72386,O,"ENQUIRY,",219,837,288,853,619,875,../data/SROIE/training_data/images/X5100945380...,8,...,0.000000,0.018750,0.000000,0.316937,0.005975,0.290037,0.091563,0.018003,0.000000,0.245385
72387,O,PLEASE,293,837,344,853,619,875,../data/SROIE/training_data/images/X5100945380...,6,...,0.000000,0.013719,0.000000,0.257553,0.063152,0.188667,0.012579,0.040953,0.000000,0.130505
72388,O,CONTACT,349,837,409,853,619,875,../data/SROIE/training_data/images/X5100945380...,7,...,0.000000,0.011736,0.000000,0.421066,0.069317,0.199088,0.026314,0.017306,0.005798,0.163273


In [18]:
train_df = add_text_embeddings(train_df, text_model)
train_df

72389it [13:10, 91.56it/s] 


Unnamed: 0,label,x1,y1,x2,y2,width,height,image_path,text_length,alpha_count,...,text_758,text_759,text_760,text_761,text_762,text_763,text_764,text_765,text_766,text_767
0,O,72,25,130,64,463,1013,../data/SROIE/training_data/images/X0001646961...,3,3,...,-0.043179,0.011357,-0.025010,-0.020825,-0.042661,0.036733,-0.039157,0.024713,-0.051543,-0.008242
1,O,135,25,213,64,463,1013,../data/SROIE/training_data/images/X0001646961...,4,4,...,0.045743,-0.006533,0.063699,0.007715,-0.005969,0.054672,0.054480,0.062522,-0.044647,-0.013282
2,O,218,25,296,64,463,1013,../data/SROIE/training_data/images/X0001646961...,4,4,...,0.009353,0.001297,-0.008407,-0.009319,-0.012464,0.026040,0.032243,0.006994,-0.057619,-0.036633
3,O,50,82,102,121,463,1013,../data/SROIE/training_data/images/X0001646961...,4,4,...,-0.103987,-0.039189,0.010518,-0.001245,-0.011138,0.062873,-0.002950,0.015333,-0.052296,-0.024258
4,O,107,82,133,121,463,1013,../data/SROIE/training_data/images/X0001646961...,2,2,...,-0.040041,0.044875,0.008178,-0.028312,0.003375,0.026132,-0.011436,-0.009047,-0.012535,0.002621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72385,O,189,837,214,853,619,875,../data/SROIE/training_data/images/X5100945380...,3,3,...,0.006633,0.017989,0.019470,-0.001860,-0.035593,0.052716,-0.001316,0.018626,-0.044441,0.005682
72386,O,219,837,288,853,619,875,../data/SROIE/training_data/images/X5100945380...,8,7,...,0.012693,-0.022961,0.059991,-0.006271,0.003093,-0.037735,-0.019850,0.058535,0.008770,0.022487
72387,O,293,837,344,853,619,875,../data/SROIE/training_data/images/X5100945380...,6,6,...,0.025878,0.078342,0.057747,-0.000286,-0.006738,-0.022748,-0.016694,-0.026390,0.023161,-0.031222
72388,O,349,837,409,853,619,875,../data/SROIE/training_data/images/X5100945380...,7,7,...,0.002591,0.039431,0.026870,0.021478,-0.019944,0.003892,0.028598,0.009909,-0.035573,0.002492


In [19]:
test_df = add_image_embeddings(test_df, image_model)
test_df

39680it [32:33, 20.31it/s]


Unnamed: 0,label,text,x1,y1,x2,y2,width,height,image_path,text_length,...,image_502,image_503,image_504,image_505,image_506,image_507,image_508,image_509,image_510,image_511
0,O,TAN,98,26,153,66,463,894,../data/SROIE/testing_data/images/X00016469670...,3,...,0.197722,0.206716,0.000000,0.404694,0.168599,0.001479,0.192561,0.965026,0.000000,0.062427
1,O,CHAY,158,26,232,66,463,894,../data/SROIE/testing_data/images/X00016469670...,4,...,0.104390,0.222462,0.000000,0.321368,0.176074,0.001770,0.169338,0.353613,0.000000,0.179749
2,O,YEE,237,26,292,66,463,894,../data/SROIE/testing_data/images/X00016469670...,3,...,0.458298,0.329260,0.000000,0.260611,0.024571,0.000000,0.117561,0.304855,0.000278,0.116308
3,O,***,138,95,173,120,463,894,../data/SROIE/testing_data/images/X00016469670...,3,...,0.111417,0.277127,0.000000,0.509893,0.111997,0.092320,0.040353,0.171841,0.000000,0.256610
4,O,COPY,178,95,225,120,463,894,../data/SROIE/testing_data/images/X00016469670...,4,...,0.102081,0.449358,0.003155,0.497956,0.228424,0.338493,0.048686,0.016538,0.000000,0.800893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39677,O,FOR,405,1589,443,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,3,...,0.668845,0.000000,0.156975,0.510105,0.520538,0.774475,0.146331,0.246380,0.004950,0.025807
39678,O,REFUND,448,1589,525,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,6,...,0.960936,0.000622,0.045097,0.624159,0.221650,0.051847,0.166374,0.140362,0.006448,0.093981
39679,O,OR,530,1589,555,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,2,...,0.468676,0.001010,0.684789,0.750384,1.070243,1.068292,0.029981,0.199615,0.069937,0.020183
39680,O,EXCHANGE,560,1589,663,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,8,...,0.689200,0.000000,0.149236,0.980791,0.554215,0.117261,0.102292,0.533822,0.003840,0.106194


In [20]:
test_df = add_text_embeddings(test_df, text_model)
test_df

39680it [06:36, 99.97it/s] 


Unnamed: 0,label,x1,y1,x2,y2,width,height,image_path,text_length,alpha_count,...,text_758,text_759,text_760,text_761,text_762,text_763,text_764,text_765,text_766,text_767
0,O,98,26,153,66,463,894,../data/SROIE/testing_data/images/X00016469670...,3,3,...,-0.043179,0.011357,-0.025010,-0.020825,-0.042661,0.036733,-0.039157,0.024713,-0.051543,-0.008242
1,O,158,26,232,66,463,894,../data/SROIE/testing_data/images/X00016469670...,4,4,...,-0.013091,-0.005370,0.031561,-0.010384,0.005535,0.030147,-0.026708,0.090474,-0.073575,-0.002189
2,O,237,26,292,66,463,894,../data/SROIE/testing_data/images/X00016469670...,3,3,...,0.037081,-0.008285,0.069396,0.009720,0.003281,0.011498,0.014066,0.052029,-0.011903,-0.025509
3,O,138,95,173,120,463,894,../data/SROIE/testing_data/images/X00016469670...,3,0,...,-0.002816,0.037223,0.064261,-0.003410,0.010620,0.033925,-0.028956,0.024285,0.016989,-0.022417
4,O,178,95,225,120,463,894,../data/SROIE/testing_data/images/X00016469670...,4,4,...,-0.013525,0.054217,0.012378,-0.007393,-0.006583,0.043897,-0.016967,-0.009654,0.039707,-0.039091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39677,O,405,1589,443,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,3,3,...,-0.021914,0.045012,-0.028122,0.016033,-0.011445,0.034254,-0.036234,-0.016313,-0.038186,-0.005211
39678,O,448,1589,525,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,6,6,...,-0.079042,0.002829,-0.026024,0.009414,-0.002522,0.032373,0.028129,-0.025444,0.025688,0.019491
39679,O,530,1589,555,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,2,2,...,0.038565,0.002977,-0.001757,0.006616,0.002489,0.055948,-0.003405,-0.017903,-0.108177,-0.007096
39680,O,560,1589,663,1623,884,1678,../data/SROIE/testing_data/images/X51009568881...,8,8,...,-0.046166,-0.016412,0.072695,0.019480,-0.016180,0.047412,-0.028888,-0.005228,0.011755,-0.033923


## Save datasets

In [21]:
# Save resultant dataframe files
train_df.to_csv("../data/SROIE/mpnet_train_image_50_frequencies.csv", index=False)
test_df.to_csv("../data/SROIE/mpnet_test_image_50_frequencies.csv", index=False)