In [107]:
import seaborn as sns
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from PIL import Image

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from umap import UMAP 

from scipy.spatial import distance_matrix

import cv2

## Picture

In [108]:
num = 4

PIC_NAME = f'../pics/image{num}.png'
OUT_NAME = f'../out/image{num}_test.png'

# Step 0: Image preprocessing

## Stage 1: Creation binary grayscaled image

In [109]:
def black(img, threshold):
    img[img >= threshold] = 255
    img[img < threshold] = 0
    return img 

def inverse(img):
    return 255 - img

In [110]:
img = cv2.imread(PIC_NAME, 0)

original_img = img.copy()

img = inverse(img)
black(img, 20)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

# Step 1: Word detection

## Stage 1: Tesseract character metainfo fetching 

In [111]:
import pytesseract

In [112]:
raws = pytesseract.image_to_boxes(cv2.imread(PIC_NAME)).split('\n')[:-1]
matrix = [[int(el) for el in raw.split(' ')[1:-1]] for raw in raws]

In [113]:
metainfo = (pd.DataFrame(np.array(matrix)).rename({
    0: 'x1',
    1: 'y1',
    2: 'x2',
    3: 'y2' 
    }, axis=1)
    .astype({
            'x1': int,
            'x2': int,
            'y1': int,
            'y2': int,
    }))

In [114]:
metainfo['x_centroid'] = metainfo['x1'] + (metainfo['x2'] - metainfo['x1'] /2)
metainfo['y_centroid'] = metainfo['y1'] + (metainfo['y2'] - metainfo['y1'] /2)
metainfo['zero'] = 0
metainfo['char_shape'] = abs(metainfo['y2'] - metainfo['y1']) * abs(metainfo['x2'] - metainfo['x1'])
metainfo['width'] = abs(metainfo['x1'] - metainfo['x2'])
metainfo['height'] = abs(metainfo['y1'] - metainfo['y2'])

In [115]:
width = metainfo.width.mean()
height = metainfo.height.mean()

area = width * height

print((width, height))

(6.6035928143712574, 8.221556886227544)


In [116]:
def paragraph_kernel_selection(width, height):
    mean = min(width, height)

    floor  = np.floor(mean)
    odd_floor = floor if floor % 2 == 1 else floor - 1

    ceil  = np.ceil(mean)
    odd_ceil = ceil if ceil % 2 == 1 else ceil + 1

    _kernel_dim =  np.uint8(odd_floor) if abs(mean - odd_floor) < abs(mean - odd_ceil) else np.uint8(odd_ceil)
    _kernel_dim = _kernel_dim - 2 if _kernel_dim >= 9 else _kernel_dim

    return np.ones((_kernel_dim, _kernel_dim), np.uint8)

## Stage 2: smoothing paragraphs

In [117]:
dst = cv2.filter2D(
    img,
    -1, 
    paragraph_kernel_selection(width, height)
)
dst = cv2.erode(dst, np.ones((3,3), np.uint8),iterations = 1)


black(dst, 200)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

# Step 2: Create paragraph boxes

In [118]:
dataset = pd.DataFrame(dst).stack().reset_index().rename({'level_0': 'y', 'level_1': 'x', 0: 'brightness'}, axis=1)
dataset = dataset[dataset.brightness > 20]
dataset

Unnamed: 0,y,x,brightness
1845,7,39,255
1846,7,40,255
1847,7,41,255
1848,7,42,255
1849,7,43,255
...,...,...,...
130929,507,123,255
130930,507,124,255
130931,507,125,255
130932,507,126,255


In [119]:
epsilon = 2

clustered = dataset.copy()

clustering = DBSCAN(eps=epsilon, min_samples=10).fit(dataset.loc[:, ['y', 'x']])
clustered['cluster'] = clustering.labels_


In [120]:
raw_clusters = clustered.groupby('cluster').agg({'x': ['min', 'max'], 'y': ['min', 'max']}, axis=1)
raw_clusters = pd.concat([raw_clusters.x, raw_clusters.y], axis=1)
raw_clusters.columns = ['x1', 'y1', 'x2', 'y2']

In [121]:
len(raw_clusters)

11

In [122]:
dst = inverse(dst)

kernel_dim = 3

raw = 255*np.ones((original_img.shape), np.uint8)

for rect in raw_clusters.values:
    x1, y1, x2, y2 = rect
    cv2.rectangle(raw, (y1, y2), (x1, x2), 0, 1)

raw = inverse(raw)

# Step 3: Cluster pruning

## Stage 1: Unite close clusters

In [123]:
dataset = pd.DataFrame(raw).stack().reset_index().rename({'level_0': 'y', 'level_1': 'x', 0: 'brightness'}, axis=1)
dataset = dataset[dataset.brightness > 20]
dataset

Unnamed: 0,y,x,brightness
1808,7,2,255
1809,7,3,255
1810,7,4,255
1811,7,5,255
1812,7,6,255
...,...,...,...
130929,507,123,255
130930,507,124,255
130931,507,125,255
130932,507,126,255


Here:

[epsilon_2 = min(width, height) / 1.8] is a heuristic, manualy balanced inter cluster max range (our paragraphs won't have bigger distances within one cluster) 

In [124]:
clustered = dataset.copy()

epsilon_2 = min(width, height) / 1.8

clustering = DBSCAN(eps=epsilon_2, min_samples=4).fit(dataset.loc[:, ['y', 'x']])
clustered['cluster'] = clustering.labels_
print(clustered.cluster.nunique())

10


In [125]:
raw_clusters = clustered.groupby('cluster').agg({'x': ['min', 'max'], 'y': ['min', 'max']}, axis=1)
raw_clusters = pd.concat([raw_clusters.x, raw_clusters.y], axis=1)
raw_clusters.columns = ['x1', 'x2', 'y1', 'y2']

## Stage 2: Cluster inside cluster issue

In [126]:
raw_clusters['shape'] = abs(raw_clusters['x1'] - raw_clusters['x2']) * abs(raw_clusters['y1'] - raw_clusters['y2'])
raw_clusters.sort_values('shape', inplace=True, ascending=False)

Simple algo. Sort clusters by thier area. For each cluter if their mass center is available on matrix and then fill it's area 

In [127]:
cluster_list = []
mask = np.zeros(original_img.shape, np.uint8)

for cluster in raw_clusters.values:
    
    x_mass = (cluster[0] + cluster[1]) // 2
    y_mass = (cluster[2] + cluster[3]) // 2

    if mask[y_mass][x_mass] == 0:
        cluster_list.append(cluster[:-1])
        cv2.rectangle(mask, (cluster[1], cluster[3]), (cluster[0], cluster[2]), 255, -1)

In [128]:
final_clusters = pd.DataFrame(cluster_list).rename({0:'x1', 1:'x2', 2:'y1', 3:'y2'}, axis=1)

# Step 4: Add some order

In [129]:
final_clusters['id'] = final_clusters.index
final_clusters['x_mass'] = (final_clusters['x2'] + final_clusters['x1']) / 2
final_clusters['y_mass'] = (final_clusters['y2'] + final_clusters['y1']) / 2
final_clusters['P_polar'] = (final_clusters['x1'] ** 2) + (final_clusters['y1'] ** 2)

final_clusters['order'] = final_clusters.sort_values('P_polar').reset_index().drop('index', axis=1).index

In [130]:
final_clusters

Unnamed: 0,x1,x2,y1,y2,id,x_mass,y_mass,P_polar,order
0,2,123,294,500,0,62.5,397.0,86440,0
1,123,250,354,507,1,186.5,430.5,140445,1
2,2,122,7,164,2,62.0,85.5,53,2
3,130,250,79,212,3,190.0,145.5,23141,3
4,130,251,222,343,4,190.5,282.5,66184,4
5,2,122,175,283,5,62.0,229.0,30629,5
6,130,251,7,68,6,190.5,37.5,16949,6
7,7,14,504,507,7,10.5,505.5,254065,7
8,32,36,505,507,8,34.0,506.0,256049,8


In [131]:
started = True
x_prev, y_prev = None, None

for rect in final_clusters.sort_values('order').values:
    x1, y1, x2, y2 = rect[:4].astype(int)
    x_mass, y_mass = rect[5:7].astype(int)
    cv2.rectangle(original_img, (y1, y2), (x1, x2), 0, 1)


    # if not started:
    #     cv2.line(original_img, (x_mass, y_mass), (x_prev, y_prev), 0, 1)

    # x_prev = x_mass
    # y_prev = y_mass
    
    # started = False

cv2.imwrite(OUT_NAME, original_img)


0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0


True