In [107]:
import cv2
import csv
import sys
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import AgglomerativeClustering

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Print NumPy arrays in full instead of abbreviating long ones with "...".
np.set_printoptions(threshold=sys.maxsize)

Read the CSV after finding the horizontal and vertical thresholds, then write the result so that it follows the format given below.

The final CSV has the columns Id, Top-Left, Bottom-Right, Top, Left, Bottom, Right, Top_box, Bottom_box, Left_box, Right_box.

Id - refers to box ID -> int

Top-Left, Bottom-Right - the (x, y) coordinates of the top-left and bottom-right corners of a bounding box

Top/Left/Bottom/Right - refers to the coordinates of edge centers

Top_box/Bottom_box/Left_box/Right_box - the nearest box in each direction from the current box, chosen so that words in different paragraphs or columns do not get connected -> List: [dist, nearest box ID]


In [108]:
def _parse_int_pairs(cells):
    """Convert text cells such as ``'[12, 34]'`` into lists of ints such as ``[12, 34]``."""
    pairs = []
    for cell in cells:
        # Commas become spaces (not nothing) so that "[12,34]" still splits into
        # two numbers instead of collapsing into the single token "1234".
        cleaned = cell.replace('[', '').replace(']', '').replace(',', ' ')
        pairs.append([int(token) for token in cleaned.split()])
    return pairs


def get_values(df):
    """Extract the box identifiers and corner coordinates from the raw table.

    Columns are addressed by position: column 1 and column 2 are parsed as
    bracketed integer lists, column 3 is returned as-is.
    NOTE(review): presumably these are 'Top-Left', 'Bottom-Right' and 'Id'
    respectively - confirm against the input CSV layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least four columns.

    Returns
    -------
    tuple
        ``(columns_idx, top_left, bottom_right)`` where ``columns_idx`` is the
        list of values from column 3 and the other two are lists of int lists.

    Raises
    ------
    ValueError
        If a coordinate token is not an integer literal.
    """
    columns_idx = list(df.iloc[:, 3])
    top_left = _parse_int_pairs(df.iloc[:, 1])
    bottom_right = _parse_int_pairs(df.iloc[:, 2])
    return columns_idx, top_left, bottom_right

def get_edge_centres(top_left, bottom_right):
    """Compute the centre and the four edge mid-points of every bounding box.

    Parameters
    ----------
    top_left, bottom_right : sequences of ``[x, y]`` pairs
        Matching corner coordinates, one pair per box.

    Returns
    -------
    tuple of five lists
        ``(X, top, left, bottom, right)``: box centres followed by the
        mid-points of the top, left, bottom and right edges, each as ``[x, y]``.
    """
    X, top, left, bottom, right = [], [], [], [], []
    for start, end in zip(top_left, bottom_right):
        x_min, y_min = start[0], start[1]
        x_max, y_max = end[0], end[1]
        mid_x = (x_min + x_max) / 2
        mid_y = (y_min + y_max) / 2

        X.append([mid_x, mid_y])
        top.append([mid_x, y_min])
        left.append([x_min, mid_y])
        bottom.append([mid_x, y_max])
        right.append([x_max, mid_y])
    return X, top, left, bottom, right

def euclidean(a, b):
    """Return the Euclidean (straight-line) distance between points ``a`` and ``b``."""
    delta = np.array(a) - np.array(b)
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

def get_neighbours(arr, arr2, cluster):
    """Find, for every point in ``arr``, the closest point of ``arr2`` in the same cluster.

    Point ``i`` of ``arr`` and point ``i`` of ``arr2`` are taken to belong to the
    same box, so position ``i`` is never offered as its own neighbour.

    Parameters
    ----------
    arr : sequence of points
        Query points, one per box.
    arr2 : sequence of points
        Candidate points, indexed the same way as ``arr``.
    cluster : sequence
        Cluster label per box; only candidates sharing the query's label count.

    Returns
    -------
    list of ``[distance, index]``
        Distance rounded to two decimals and the position in ``arr2`` of the
        nearest candidate. When no candidate qualifies the entry is
        ``[1000000.0, 0]`` (the initial sentinel values).
    """
    result = []
    for i, vec1 in enumerate(arr):
        dist, idx = 1e6, 0
        for j, vec2 in enumerate(arr2):
            # A box is not its own neighbour, and boxes in other clusters are ignored.
            if j == i or cluster[i] != cluster[j]:
                continue
            new_dist = euclidean(vec1, vec2)
            if new_dist < dist:
                dist, idx = new_dist, j
        # Cast to a built-in float so the value is written to CSV as a bare number
        # rather than as a NumPy scalar repr.
        result.append([round(float(dist), 2), idx])
    return result

In [109]:
# Number of layout regions the boxes are grouped into before neighbours are searched.
N_CLUSTERS = 5

df = pd.read_csv('CSV/TLBR_159.csv')
columns_id, top_left, bottom_right = get_values(df)
X, top, left, bottom, right = get_edge_centres(top_left, bottom_right)

# Ward linkage only works with Euclidean distances, so the metric is left at its
# default. The former `affinity` keyword was renamed to `metric` in scikit-learn 1.2
# and removed in 1.4, so passing it breaks on current releases.
agglomerative = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='ward')
agglomerative.fit(X)
clusters = agglomerative.labels_

# Each query edge is matched against the opposite edge of the candidate boxes,
# e.g. the top edge of a box is compared with the bottom edges of the others.
# NOTE(review): the neighbour entries hold positions in the table, not values of
# `columns_id` - confirm the two coincide for the input CSV.
top_box = get_neighbours(top, bottom, clusters)
bottom_box = get_neighbours(bottom, top, clusters)
left_box = get_neighbours(left, right, clusters)
right_box = get_neighbours(right, left, clusters)

data = list(zip(columns_id, top_left, bottom_right, top, left, bottom, right, top_box, bottom_box, left_box, right_box))
# The identifier column is named 'Id', as in the format description and as read by make_connections().
header = ['Id', 'Top-Left', 'Bottom-Right', 'Top', 'Left', 'Bottom', 'Right', 'Top_box', 'Bottom_box', 'Left_box', 'Right_box']

file = 'output.csv'
# Use a separate name for the handle so `file` keeps holding the path after the block.
with open(file, mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    writer.writerows(data)

In [110]:
IMAGE_PATH = 'images/159.jpg'
img = cv2.imread(IMAGE_PATH)
# cv2.imread reports a missing or unreadable file by returning None rather than
# raising, so fail here instead of with an obscure error in a later cell.
if img is None:
    raise FileNotFoundError(f"Could not read image at '{IMAGE_PATH}'")

Given below are two functions to visualize the document image and the connections established.

visualize_bboxes() can be used to visualize a document image with the bounding boxes drawn around the words. The bounding box information is given in the provided CSVs.

make_connections() can be used to verify and visualize the connections established between words, so that they satisfy the given criteria for grouping words into a logical paragraph. This function has to be used after producing the final CSV in the specified format.

In [111]:
def visualize_bboxes(img, boxes=None):
    """Display the document image with a rectangle drawn around every word box.

    Parameters
    ----------
    img : numpy.ndarray
        Colour image in OpenCV's BGR channel order (e.g. from ``cv2.imread``).
    boxes : pandas.DataFrame, optional
        Table with 'Top-Left' and 'Bottom-Right' columns holding text such as
        ``"[x, y]"``. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The annotated image is shown with ``plt.imshow``; ``img`` is not modified.
    """
    if boxes is None:
        boxes = df  # fall back to the table loaded earlier in the notebook

    def _corner(text):
        # "[x, y]" -> (x, y)
        x_text, y_text = text.split(',')[:2]
        return int(x_text[1:]), int(y_text[:-1])

    # OpenCV stores channels as BGR while matplotlib expects RGB; convert so the
    # page is shown in its true colours (the conversion also yields a fresh copy).
    canvas = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    for _, row in boxes.iterrows():
        top_left = _corner(row['Top-Left'])
        bottom_right = _corner(row['Bottom-Right'])
        cv2.rectangle(canvas, top_left, bottom_right, (255, 0, 0), 2)

    # Render once, after all rectangles are drawn, so an empty table still shows the page.
    plt.imshow(canvas)

In [112]:
def _parse_pair(value):
    """Return a table cell as a sequence of numbers.

    Cells read back from a CSV arrive as text such as ``"[12.5, 40]"``; values
    that are not strings (e.g. lists in an in-memory frame) are returned unchanged.
    """
    import ast
    import re

    if isinstance(value, str):
        # Unwrap NumPy scalar reprs such as "np.float64(3.5)" so that the text is
        # a plain Python literal again.
        plain = re.sub(r'np\.\w+\(([^()]*)\)', r'\1', value)
        return ast.literal_eval(plain)
    return value


def _first_present(frame, *names):
    """Return the first of ``names`` that is a column of ``frame``."""
    for name in names:
        if name in frame.columns:
            return name
    raise KeyError(f'none of the columns {names} found in the boxes table')


def make_connections(image, boxes=None, csv_path='output.csv'):
    """Draw every box, its id and the links to its four neighbours on the image.

    Parameters
    ----------
    image : numpy.ndarray
        Colour image in OpenCV's BGR channel order.
    boxes : pandas.DataFrame, optional
        Final table with an id column ('Id' or 'columns Id'), the edge-centre
        columns 'Top', 'Left', 'Bottom', 'Right' and the neighbour columns
        ('Top_Box' or 'Top_box', and likewise for Bottom/Left/Right), each
        neighbour cell being ``[dist, neighbour id]``. When omitted the table
        is read from ``csv_path``.
    csv_path : str, optional
        CSV file to load when ``boxes`` is not given.

    Returns
    -------
    numpy.ndarray
        RGB copy of ``image`` with boxes in (255, 0, 0), ids as text and
        neighbour links in (0, 255, 0).

    Raises
    ------
    KeyError
        If a required column is missing or a neighbour id has no matching row.
    """
    if boxes is None:
        boxes = pd.read_csv(csv_path)

    id_col = _first_present(boxes, 'Id', 'columns Id')
    # (neighbour column, edge of the neighbour on which the link ends)
    links = [
        (_first_present(boxes, 'Top_Box', 'Top_box'), 'Bottom'),
        (_first_present(boxes, 'Bottom_Box', 'Bottom_box'), 'Top'),
        (_first_present(boxes, 'Left_Box', 'Left_box'), 'Right'),
        (_first_present(boxes, 'Right_Box', 'Right_box'), 'Left'),
    ]

    # Parse every row once so neighbour lookups below are simple dict accesses.
    records = []
    for _, row in boxes.iterrows():
        record = {edge: _parse_pair(row[edge]) for edge in ('Top', 'Left', 'Bottom', 'Right')}
        record['id'] = int(row[id_col])
        record['neighbours'] = [int(_parse_pair(row[column])[1]) for column, _ in links]
        records.append(record)

    by_id = {}
    for record in records:
        by_id.setdefault(record['id'], record)  # on duplicate ids the first row wins

    image_with_boxes = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    for record in records:
        left = int(record['Left'][0])
        right = int(record['Right'][0])
        top = int(record['Top'][1])
        bottom = int(record['Bottom'][1])
        mid_x = left + (right - left) // 2
        mid_y = top + (bottom - top) // 2

        cv2.rectangle(image_with_boxes, (left, top), (right, bottom), (255, 0, 0), 2)
        cv2.putText(image_with_boxes, str(record['id']), (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

        # Link start points, in the same order as `links`: top, bottom, left, right.
        starts = [(mid_x, top), (mid_x, bottom), (left, mid_y), (right, mid_y)]
        for start, neighbour_id, (_, target_edge) in zip(starts, record['neighbours'], links):
            if neighbour_id == 0:
                continue  # an id of 0 is skipped, i.e. treated as "no neighbour"
            if neighbour_id not in by_id:
                raise KeyError(f"box {record['id']} refers to unknown neighbour id {neighbour_id}")
            target = by_id[neighbour_id][target_edge]
            end = (int(target[0]), int(target[1]))
            cv2.line(image_with_boxes, start, end, (0, 255, 0), 2)

    return image_with_boxes

In [113]:
# visualize_bboxes(img)
# make_connections(img)