In [1]:
import os
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm
from PIL import Image
from pprint import pprint

In [2]:
class Line:
    """A 2-D line segment described by two ``(x, y)`` endpoints.

    Attributes set on construction:
        pt1, pt2: the endpoints exactly as they were passed in.
        x1, y1, x2, y2: the unpacked endpoint coordinates.
        hv_type: ``'v'`` when the segment spans strictly more in y than in x,
            otherwise ``'h'`` (ties, including zero-length segments, are ``'h'``).
    """

    def __init__(self, pt1, pt2):
        self.pt1, self.pt2 = pt1, pt2
        self.__split_points()
        self.__compute_hvType()

    def __split_points(self):
        """Unpack both endpoints into individual coordinate attributes."""
        (self.x1, self.y1), (self.x2, self.y2) = self.pt1, self.pt2

    def __compute_hvType(self):
        """Label the segment vertical or horizontal from its dominant extent."""
        width = abs(self.x2 - self.x1)
        height = abs(self.y2 - self.y1)
        if width < height:
            self.hv_type = 'v'
        else:
            self.hv_type = 'h'

    def to_list(self):
        """Return the coordinates as the tuple ``(x1, y1, x2, y2)``."""
        return (self.x1, self.y1, self.x2, self.y2)
    
        
def parse_object(obj):
    """Flatten an annotation object's two exterior points into four values.

    Expects ``obj['points']['exterior']`` to hold exactly two ``(x, y)`` pairs
    and returns them as the tuple ``(x1, y1, x2, y2)``.
    """
    first_pt, second_pt = obj['points']['exterior']
    x1, y1 = first_pt
    x2, y2 = second_pt
    return x1, y1, x2, y2
    
    
def load_json(fpath):
    """Load the line annotations stored in a JSON annotation file.

    Reads the top-level ``'objects'`` list from the file at ``fpath`` and turns
    every entry whose ``'geometryType'`` is ``"line"`` into a ``Line``; entries
    with any other geometry type are skipped.

    Args:
        fpath: path of the JSON annotation file.

    Returns:
        List of ``Line`` objects in file order (empty when the file has no
        line entries).
    """
    # Context manager closes the handle immediately; this function is called
    # once per annotation file in a loop, so leaked handles would pile up.
    with open(fpath, "r") as f:
        objs = json.load(f)['objects']

    lines = []
    for obj in objs:
        if obj['geometryType'] == "line":
            x1, y1, x2, y2 = parse_object(obj)
            lines.append(Line(pt1=(x1, y1), pt2=(x2, y2)))
    return lines

In [3]:
def compute_leftmost_line(lines):
    """Return the vertical line whose smallest x coordinate is the lowest.

    Args:
        lines: sequence of line objects exposing ``hv_type``, ``x1`` and ``x2``.

    Returns:
        The first line (in input order) with ``hv_type == 'v'`` that attains
        the minimum of ``min(x1, x2)``.

    Raises:
        IndexError: if ``lines`` contains no vertical line.
    """
    vlines = [line for line in lines if line.hv_type == 'v']
    if not vlines:
        raise IndexError("no vertical ('v') line to pick the leftmost from")
    # min() returns the first minimal element, so ties resolve in input order;
    # one keyed pass avoids recomputing the global minimum for every line.
    return min(vlines, key=lambda line: min(line.x1, line.x2))


def compute_rightmost_line(lines):
    """Return the vertical line whose largest x coordinate is the highest.

    Args:
        lines: sequence of line objects exposing ``hv_type``, ``x1`` and ``x2``.

    Returns:
        The first line (in input order) with ``hv_type == 'v'`` that attains
        the maximum of ``max(x1, x2)``.

    Raises:
        IndexError: if ``lines`` contains no vertical line.
    """
    vlines = [line for line in lines if line.hv_type == 'v']
    if not vlines:
        raise IndexError("no vertical ('v') line to pick the rightmost from")
    # max() returns the first maximal element, so ties resolve in input order;
    # one keyed pass avoids recomputing the global maximum for every line.
    return max(vlines, key=lambda line: max(line.x1, line.x2))


def compute_topmost_line(lines):
    """Return the horizontal line whose smallest y coordinate is the lowest.

    Args:
        lines: sequence of line objects exposing ``hv_type``, ``y1`` and ``y2``.

    Returns:
        The first line (in input order) with ``hv_type == 'h'`` that attains
        the minimum of ``min(y1, y2)``.

    Raises:
        IndexError: if ``lines`` contains no horizontal line.
    """
    hlines = [line for line in lines if line.hv_type == 'h']
    if not hlines:
        raise IndexError("no horizontal ('h') line to pick the topmost from")
    # min() returns the first minimal element, so ties resolve in input order;
    # one keyed pass avoids recomputing the global minimum for every line.
    return min(hlines, key=lambda line: min(line.y1, line.y2))
    
    
def compute_bottomost_line(lines):
    """Return the horizontal line whose largest y coordinate is the highest.

    Args:
        lines: sequence of line objects exposing ``hv_type``, ``y1`` and ``y2``.

    Returns:
        The first line (in input order) with ``hv_type == 'h'`` that attains
        the maximum of ``max(y1, y2)``.

    Raises:
        IndexError: if ``lines`` contains no horizontal line.
    """
    hlines = [line for line in lines if line.hv_type == 'h']
    if not hlines:
        raise IndexError("no horizontal ('h') line to pick the bottommost from")
    # max() returns the first maximal element, so ties resolve in input order;
    # one keyed pass avoids recomputing the global maximum for every line.
    return max(hlines, key=lambda line: max(line.y1, line.y2))
    
    
def compute_table_mask(img, lines):
    """Build a binary mask covering the rectangle spanned by the outer lines.

    Args:
        img: image exposing ``size``; ``size[0]`` is used as the mask width and
            ``size[1]`` as its height.
        lines: line objects exposing ``hv_type`` and ``x1, y1, x2, y2``.

    Returns:
        Integer array of shape ``(size[1], size[0])`` holding 1 inside the
        rectangle (both end rows/columns included) and 0 elsewhere.
    """
    left = compute_leftmost_line(lines)
    right = compute_rightmost_line(lines)
    top = compute_topmost_line(lines)
    bottom = compute_bottomost_line(lines)

    # NOTE(review): the bounds are not symmetric. The left/top edges take the
    # horizontal lines into account, while the right edge uses only the
    # rightmost vertical line and the bottom edge only the two outer vertical
    # lines (the bottom horizontal line's y is ignored). Confirm this is
    # intended before changing it, since it alters the generated masks.
    x_start = min(left.x1, left.x2, top.x1, top.x2, bottom.x1, bottom.x2)
    y_start = min(left.y1, left.y2, top.y1, top.y2, right.y1, right.y2)
    x_end = max(right.x1, right.x2)
    y_end = max(left.y1, left.y2, right.y1, right.y2)

    height, width = img.size[1], img.size[0]
    mask = np.zeros((height, width), dtype=int)
    mask[y_start: y_end + 1, x_start: x_end + 1] = 1
    return mask


def compute_columns_mask(img, lines, boundary=5):
    """Build a binary mask marking the regions between neighbouring vertical lines.

    Vertical lines are ordered left to right by their smallest x coordinate and
    every pair of neighbours delimits one column. Each column is filled with 1s
    after being inset horizontally by ``boundary`` pixels on both sides.

    Args:
        img: image exposing ``size``; ``size[0]`` is used as the mask width and
            ``size[1]`` as its height.
        lines: line objects exposing ``hv_type`` and ``x1, y1, x2, y2``.
        boundary: horizontal inset, in pixels, applied to each side of a column.

    Returns:
        Integer array of shape ``(size[1], size[0])``: 1 inside columns, 0
        elsewhere. All zeros when fewer than two vertical lines are present.

    Raises:
        IndexError: if ``lines`` contains no horizontal line (propagated from
            ``compute_topmost_line``).
    """
    # sorted() is stable, so lines sharing the same min-x keep their input order.
    vlines = sorted(
        (line for line in lines if line.hv_type == 'v'),
        key=lambda line: min(line.x1, line.x2),
    )
    top_hline = compute_topmost_line(lines)

    mask = np.zeros((img.size[1], img.size[0]), dtype=int)
    for left_line, right_line in zip(vlines[:-1], vlines[1:]):
        x1 = min(left_line.x1, left_line.x2) + boundary
        x2 = max(right_line.x1, right_line.x2) - boundary
        # NOTE(review): the vertical extent comes from the top horizontal line
        # and the two bounding vertical lines only; the bottom horizontal line
        # is not consulted, and the slice end is exclusive (unlike the table
        # mask, which adds 1). Confirm both are intended.
        ys = (
            top_hline.y1, top_hline.y2,
            left_line.y1, left_line.y2,
            right_line.y1, right_line.y2,
        )
        y1, y2 = min(ys), max(ys)
        mask[y1: y2, x1: x2] = 1

    return mask

In [4]:
ANN_DIR = "../annotations/Batch1/ds0/ann/"
IMG_DIR = "../../download_from_drive/data/ProcessedO7/"
TAB_MASK_DIR = "./masks/table/"
COL_MASK_DIR = "./masks/columns/"

for d in [TAB_MASK_DIR, COL_MASK_DIR]:
    os.makedirs(d, exist_ok=True)

# os.listdir() order is arbitrary; sort so runs (and the log) are reproducible.
ann_fnames = sorted(f for f in os.listdir(ANN_DIR) if f.endswith(".json"))
for ann_fn in tqdm(ann_fnames):

    # Annotation files are named "<image file name>.json", so dropping the
    # final extension yields the image file name (reused for both masks).
    img_fn = os.path.splitext(ann_fn)[0]
    ann_path = os.path.join(ANN_DIR, ann_fn)
    img_path = os.path.join(IMG_DIR, img_fn)
    tab_mask_path = os.path.join(TAB_MASK_DIR, img_fn)
    col_mask_path = os.path.join(COL_MASK_DIR, img_fn)

    lines = load_json(ann_path)
    if len(lines) > 0:
        # Context manager releases the image file once both masks are built,
        # instead of leaving one open handle per processed image.
        with Image.open(img_path) as img:
            tab_mask = compute_table_mask(img=img, lines=lines)
            col_mask = compute_columns_mask(img=img, lines=lines)

        # NOTE(review): masks are written under the image's own file name, so
        # the output format follows that extension (e.g. ".jpg"); if it is a
        # lossy format the saved masks may not stay strictly 0/255 - confirm
        # what the downstream loader expects.
        for mask_path, mask in zip([tab_mask_path, col_mask_path], [tab_mask, col_mask]):
            Image.fromarray((mask * 255).astype(np.uint8)).save(mask_path)
    else:
        print("No lines for " + ann_fn)

HBox(children=(FloatProgress(value=0.0, max=253.0), HTML(value='')))

No lines for 42360_O7_0576.jpg.json
No lines for 70386_O7_0464.jpg.json

