In [2036]:
import seaborn as sns
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from PIL import Image

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from umap import UMAP 

from scipy.spatial import distance_matrix

import cv2

## Picture

In [2037]:
num = 8

PIC_NAME = f'../pics/image{num}.png'
OUT_NAME = f'../out/image{num}_test.png'

# Step 0: Image preprocessing

## Stage 1: Creation binary grayscaled image

In [2038]:
def black(img, threshold):
    img[img >= threshold] = 255
    img[img < threshold] = 0
    return img 

def inverse(img):
    return 255 - img

In [2039]:
img = cv2.imread(PIC_NAME, 0)

original_img = img.copy()

img = inverse(img)
black(img, 20)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

# Step 1: Word detection

## Stage 1: Tesseract character metainfo fetching 

In [2040]:
import pytesseract

In [2041]:
raws = pytesseract.image_to_boxes(cv2.imread(PIC_NAME)).split('\n')[:-1]
matrix = [[int(el) for el in raw.split(' ')[1:-1]] for raw in raws]

In [2042]:
metainfo = (pd.DataFrame(np.array(matrix)).rename({
    0: 'x1',
    1: 'y1',
    2: 'x2',
    3: 'y2' 
    }, axis=1)
    .astype({
            'x1': int,
            'x2': int,
            'y1': int,
            'y2': int,
    }))

In [2043]:
metainfo['x_centroid'] = metainfo['x1'] + (metainfo['x2'] - metainfo['x1'] /2)
metainfo['y_centroid'] = metainfo['y1'] + (metainfo['y2'] - metainfo['y1'] /2)
metainfo['zero'] = 0
metainfo['char_shape'] = abs(metainfo['y2'] - metainfo['y1']) * abs(metainfo['x2'] - metainfo['x1'])
metainfo['width'] = abs(metainfo['x1'] - metainfo['x2'])
metainfo['height'] = abs(metainfo['y1'] - metainfo['y2'])

In [2044]:
width = metainfo.width.mean()
height = metainfo.height.mean()

area = width * height

print((width, height))

(9.372661122661123, 12.18866943866944)


In [2045]:
def paragraph_kernel_selection(width, height):
    mean = min(width, height)

    floor  = np.floor(mean)
    odd_floor = floor if floor % 2 == 1 else floor - 1

    ceil  = np.ceil(mean)
    odd_ceil = ceil if ceil % 2 == 1 else ceil + 1

    _kernel_dim =  np.uint8(odd_floor) if abs(mean - odd_floor) < abs(mean - odd_ceil) else np.uint8(odd_ceil)
    _kernel_dim = _kernel_dim - 2 if _kernel_dim >= 9 else _kernel_dim

    return np.ones((_kernel_dim, _kernel_dim), np.uint8)

## Stage 2: smoothing paragraphs

In [2046]:
dst = cv2.filter2D(
    img,
    -1, 
    paragraph_kernel_selection(width, height)
)
dst = cv2.erode(dst, np.ones((3,3), np.uint8),iterations = 1)


black(dst, 200)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

# Step 2: Create paragraph boxes

In [2047]:
dataset = pd.DataFrame(dst).stack().reset_index().rename({'level_0': 'y', 'level_1': 'x', 0: 'brightness'}, axis=1)
dataset = dataset[dataset.brightness > 20]
dataset

Unnamed: 0,y,x,brightness
3120,6,18,255
3121,6,19,255
3122,6,20,255
3123,6,21,255
3124,6,22,255
...,...,...,...
392426,759,23,255
392427,759,24,255
392428,759,25,255
392429,759,26,255


In [2048]:
epsilon = 2

clustered = dataset.copy()

clustering = DBSCAN(eps=epsilon, min_samples=10).fit(dataset.loc[:, ['y', 'x']])
clustered['cluster'] = clustering.labels_


In [2049]:
raw_clusters = clustered.groupby('cluster').agg({'x': ['min', 'max'], 'y': ['min', 'max']}, axis=1)
raw_clusters = pd.concat([raw_clusters.x, raw_clusters.y], axis=1)
raw_clusters.columns = ['x1', 'y1', 'x2', 'y2']

In [2050]:
len(raw_clusters)

36

In [2051]:
dst = inverse(dst)

kernel_dim = 3

raw = 255*np.ones((original_img.shape), np.uint8)

for rect in raw_clusters.values:
    x1, y1, x2, y2 = rect
    cv2.rectangle(raw, (y1, y2), (x1, x2), 0, 1)

raw = inverse(raw)

# Step 3: Cluster pruning

## Stage 1: Unite close clusters

In [2052]:
dataset = pd.DataFrame(raw).stack().reset_index().rename({'level_0': 'y', 'level_1': 'x', 0: 'brightness'}, axis=1)
dataset = dataset[dataset.brightness > 20]
dataset

Unnamed: 0,y,x,brightness
3119,6,17,255
3120,6,18,255
3121,6,19,255
3122,6,20,255
3123,6,21,255
...,...,...,...
392426,759,23,255
392427,759,24,255
392428,759,25,255
392429,759,26,255


Here:

[epsilon_2 = min(width, height) / 1.8] is a heuristic, manualy balanced inter cluster max range (our paragraphs won't have bigger distances within one cluster) 

In [2053]:
clustered = dataset.copy()

epsilon_2 = min(width, height) / 1.8

clustering = DBSCAN(eps=epsilon_2, min_samples=4).fit(dataset.loc[:, ['y', 'x']])
clustered['cluster'] = clustering.labels_
print(clustered.cluster.nunique())

32


In [2054]:
raw_clusters = clustered.groupby('cluster').agg({'x': ['min', 'max'], 'y': ['min', 'max']}, axis=1)
raw_clusters = pd.concat([raw_clusters.x, raw_clusters.y], axis=1)
raw_clusters.columns = ['x1', 'x2', 'y1', 'y2']

## Stage 2: Cluster inside cluster issue

In [2055]:
raw_clusters['shape'] = abs(raw_clusters['x1'] - raw_clusters['x2']) * abs(raw_clusters['y1'] - raw_clusters['y2'])
raw_clusters.sort_values('shape', inplace=True, ascending=False)

Simple algo. Sort clusters by thier area. For each cluter if their mass center is available on matrix and then fill it's area 

In [2056]:
cluster_list = []
mask = np.zeros(original_img.shape, np.uint8)

for cluster in raw_clusters.values:
    
    x_mass = (cluster[0] + cluster[1]) // 2
    y_mass = (cluster[2] + cluster[3]) // 2

    if mask[y_mass][x_mass] == 0:
        cluster_list.append(cluster[:-1])
        cv2.rectangle(mask, (cluster[1], cluster[3]), (cluster[0], cluster[2]), 255, -1)

In [2057]:
final_clusters = pd.DataFrame(cluster_list).rename({0:'x1', 1:'x2', 2:'y1', 3:'y2'}, axis=1)
final_clusters['id'] = final_clusters.index

# Step 4: Add some order

## Stage 1: Chunk spliting

In [2058]:
cv2.imwrite("../out/transitional.png", raw)

True

In [2059]:
img_width = img.shape[1]

split_vect = np.array([1 for _ in range(img_width)], np.uint8)

splits = pd.DataFrame(raw @ split_vect).rename({0: 'not_free'}, axis=1).reset_index().set_index('index')

In [2060]:
def diff(series):
    return series - series.shift(1, fill_value=0)

split_borders = diff(splits[splits.not_free == 0].index.to_series())
split_borders = split_borders[split_borders > 1].index

In [2061]:
split_borders

Int64Index([28, 439, 458, 480, 520, 539, 581, 630, 652, 693, 711, 732, 760], dtype='int64', name='index')

In [2062]:
final_clusters.shape

(31, 5)

In [2063]:
prev_split = -1

for split in split_borders:
    chunk_clusters = final_clusters[
        (final_clusters['y2'] <= split) & 
        (final_clusters['y1'] >= prev_split)]
    prev_split = split + 1
    # print()
    cv2.line(raw, (0, split), (img_width - 1, split), 255, 1)
    print(chunk_clusters['id'].to_list())

cv2.imwrite('../out/transitional.png', raw)

[22]
[1, 4, 5, 6, 7, 8, 9, 12, 14, 15, 16, 17, 19, 20, 21, 23]
[11]
[27]
[24]
[10]
[25]
[26]
[18, 28]
[]
[]
[13, 29]
[30]


True

In [2064]:
ordered_id_list = []

In [2065]:


def select_paragraph_order(dataset):
    global CNT
    # print(f'Iteration {CNT}')

    CNT += 1
    
    if dataset.shape[0] == 1:
        # print(f'1 cluster - id[{dataset.iloc[0].id}]')
        return ordered_id_list.append(dataset.iloc[0].id)
    elif dataset.shape[0] == 0:
        # print('0 clusters')
        return

    def diff(series):
        return series - series.shift(1, fill_value=0)

    def get_subrects(split, prev_split, type):
        return dataset[
            (dataset[f'{type}2'] <= split) & 
            (dataset[f'{type}1'] >= prev_split)
        ]

    _xmin = dataset.x1.min()
    _ymin = dataset.y1.min()

    # Prepare dataset
    dataset['x1'] -= _xmin - 1
    dataset['x2'] -= _xmin - 1
    dataset['y1'] -= _ymin - 1
    dataset['y2'] -= _ymin - 1

    _img = np.ones((dataset.y2.max() + 2, dataset.x2.max() + 2), np.uint8) * 0
    
    for rect in dataset.values:
        x1, y1, x2, y2 = rect[:4]
        cv2.rectangle(_img, (y1, y2), (x1, x2), 255, 1)

    # cv2.imwrite('../out/transitional.png', _img)



    # Line segregation
    img_width = _img.shape[1]

    split_vect = np.array([1 for _ in range(img_width)], np.uint8)
    splits = pd.DataFrame(_img @ split_vect).rename({0: 'not_free'}, axis=1).reset_index().set_index('index') 
    split_borders = diff(splits[splits.not_free == 0].index.to_series())
    split_borders = split_borders[split_borders > 1].index
    if len(split_borders) >= 2:
        # print(f'Has horizontal segregation - {split_borders}')
        prev_split = 0

        for split_y in split_borders:
            select_paragraph_order(get_subrects(split_y, prev_split, 'y'))
            # print(f'Next split y {split_y} \n{get_subrects(split_y, prev_split, "y")}')
            prev_split = split_y
        
        return
    # print(f'Not horizontal {split_borders}')
    
    # Column segregation
    img_width = _img.shape[0]

    split_vect = np.array([1 for _ in range(img_width)], np.uint8)
    splits = pd.DataFrame(_img.transpose() @ split_vect).rename({0: 'not_free'}, axis=1).reset_index().set_index('index') 
    split_borders = diff(splits[splits.not_free == 0].index.to_series())
    split_borders = split_borders[split_borders > 1].index
    if len(split_borders) >= 2:
        # print(f'Has vertical segregation - {split_borders}')
        prev_split = 0

        for split_x in split_borders:
            select_paragraph_order(get_subrects(split_x, prev_split, 'x'))
            # print(f'Next split x {split_x} \n{get_subrects(split_x, prev_split, "x")}')
            prev_split = split_x
        
        return 
    # print(f'Not vertical {split_borders}')
    return
    # Variation analysis

    # Case - columns:

    # Case - rows:

In [2066]:
select_paragraph_order(final_clusters.copy())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['x1'] -= _xmin - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['x2'] -= _xmin - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['y1'] -= _ymin - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [2068]:
len(ordered_id_list) == len(final_clusters)

True

In [2069]:
cluster_order = pd.DataFrame(ordered_id_list).sort_values(0).reset_index().rename({0:'id', 'index':'order_id'}, axis=1)

In [2070]:
final_clusters['x_mass'] = (final_clusters['x1'] + final_clusters['x2']) / 2
final_clusters['y_mass'] = (final_clusters['y1'] + final_clusters['y2']) / 2

final_clusters = final_clusters.merge(cluster_order)

In [2071]:
final_clusters

Unnamed: 0,x1,x2,y1,y2,id,x_mass,y_mass,order_id
0,66,484,547,629,0,275.0,588.0,24
1,21,154,235,438,1,87.5,336.5,2
2,21,484,659,710,2,252.5,684.5,27
3,66,484,467,519,3,275.0,493.0,20
4,21,148,63,226,4,84.5,144.5,1
5,186,313,295,428,5,249.5,361.5,8
6,350,483,144,236,6,416.5,190.0,11
7,186,316,123,185,7,251.0,154.0,5
8,186,313,225,286,8,249.5,255.5,7
9,350,482,336,386,9,416.0,361.0,15


# Output

In [2072]:
started = True
x_prev, y_prev = None, None

for rect in final_clusters.sort_values('order_id').values:
    x1, y1, x2, y2 = rect[:4].astype(int)
    x_mass, y_mass = rect[5:7].astype(int)
    cv2.rectangle(original_img, (y1, y2), (x1, x2), 0, 1)


    if not started:
        cv2.line(original_img, (x_mass, y_mass), (x_prev, y_prev), 0, 1)

    x_prev = x_mass
    y_prev = y_mass
    
    started = False

cv2.imwrite(OUT_NAME, original_img)
# cv2.imwrite("../out/transitional.png", original_img)


True