-
Notifications
You must be signed in to change notification settings - Fork 0
/
alignment.py
executable file
·169 lines (140 loc) · 6.56 KB
/
alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import os
import cv2
import numpy as np
import json
import shutil
from segmentation import page_to_lines, line_to_words, remove_left_margin
from alignment_errors import AlignmentException
from collections import deque, OrderedDict
from text_processing import estimate_word_width_simple
# Input locations: ground-truth transcriptions are read from TSC_DIR and page
# images from IMG_DIR; the *_EXT constants are the matching file extensions.
TSC_DIR = 'gt_transcriptions'
IMG_DIR = 'page_images_crop'
TSC_EXT = '.txt'
IMG_EXT = '.png'

# Loaded once at import time from the current working directory. The context
# manager makes sure the file handle is closed right after parsing instead of
# being left to the garbage collector.
# NOTE(review): AVG_CHARSIZES is not referenced in the visible part of this
# file; it is kept because other modules may import it.
with open('average_charsizes.json') as _charsizes_file:
    AVG_CHARSIZES = json.load(_charsizes_file)
def max_bbx(bbxs):
    """Return the smallest bounding box that encloses every box in ``bbxs``.

    Args:
        bbxs: Iterable of ``(x, y, w, h)`` boxes. Any iterable is accepted,
            including one-shot iterators; it is materialized exactly once.

    Returns:
        A tuple ``(x, y, w, h)`` where ``(x, y)`` is the minimum corner over
        all boxes and ``w``/``h`` reach the maximum ``x + w`` / ``y + h``.

    Raises:
        ValueError: If ``bbxs`` contains no boxes.
    """
    # Materialize first: the extents below need four passes over the boxes,
    # which would silently exhaust a generator after the first pass.
    boxes = list(bbxs)
    if not boxes:
        raise ValueError('max_bbx() requires at least one bounding box')

    x1 = min(x for x, _, _, _ in boxes)
    y1 = min(y for _, y, _, _ in boxes)
    x2 = max(x + w for x, _, w, _ in boxes)
    y2 = max(y + h for _, y, _, h in boxes)
    return x1, y1, x2 - x1, y2 - y1
if __name__ == '__main__':
    # Script entry point. For every page image in IMG_DIR:
    #   1. segment the page into lines and the lines into word images,
    #   2. greedily align each transcription word with a word image, splitting
    #      images that are too wide and merging images that are too narrow
    #      compared with the estimated width of the word,
    #   3. write one cropped image and one text file per aligned word under
    #      aligned/<page>/, plus an annotated copy of the page.
    # Finally all bounding boxes are dumped to 'bounding_boxes_elena.json'.
    dst_dir = 'aligned'
    tsc_dir_name = 'transcriptions'
    img_dir_name = 'images'

    # Relative tolerances around the estimated word width.
    split_th = 0.25  # the higher, the more we are conservative about splitting
    merge_th = 0.3

    # Only real page images are considered; splitext keeps dotted base names
    # intact, and sorting makes the processing order deterministic.
    fnms = sorted(
        os.path.splitext(f)[0]
        for f in os.listdir(IMG_DIR)
        if f.endswith(IMG_EXT)
    )

    # Maps page name -> {(x, y, w, h): transcribed word}.
    # NOTE: two words that end up with the same bounding box overwrite each
    # other, because the box tuple is the dictionary key.
    alignment = {}

    for fnm in fnms:
        print(fnm)

        # Start from a clean output directory for this page.
        page_dst_dir = os.path.join(dst_dir, fnm)
        if os.path.isdir(page_dst_dir):
            shutil.rmtree(page_dst_dir)
        tsc_dst_dir = os.path.join(page_dst_dir, tsc_dir_name)
        img_dst_dir = os.path.join(page_dst_dir, img_dir_name)
        os.makedirs(tsc_dst_dir)
        os.makedirs(img_dst_dir)

        # load page image
        img_path = os.path.join(IMG_DIR, fnm + IMG_EXT)
        page_img_og = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if page_img_og is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise IOError('Could not read page image: {}'.format(img_path))

        # cut capital letters at margin
        page_img, left_margin = remove_left_margin(page_img_og)

        # segment page image into lines
        img_lines = page_to_lines(page_img)

        # load GT transcription; its file name is the part of the page name
        # that precedes the first underscore.
        tsc_path = os.path.join(TSC_DIR, fnm.split('_')[0] + TSC_EXT)
        with open(tsc_path, 'r') as tsc_file:
            tsc_lines = tsc_file.readlines()

        if len(img_lines) != len(tsc_lines):
            raise AlignmentException(
                "Line mismatch: {} lines segmented, but transcription has {} lines"
                .format(len(img_lines), len(tsc_lines))
            )

        # alignment begins
        alignment[fnm] = {}
        for tsc_line, (img_line, top_y) in zip(tsc_lines, img_lines):
            # word_imgs is a double-ended queue containing computed word
            # segmentations as (image, x, y) triples; a deque is used so the
            # right half of a split image can be pushed back to the front.
            word_imgs = deque(line_to_words(img_line, top_y))

            for tsc_word in tsc_line.split():
                estimated_width = estimate_word_width_simple(tsc_word)
                try:
                    word_img, word_img_x, word_img_y = word_imgs.popleft()
                except IndexError:
                    print('Alignment for', tsc_word, 'failed: no word images left')
                    break

                # Box x coordinates are shifted by `left_margin - 2` to map
                # them back onto the original (uncropped) page image.
                # NOTE(review): the `- 2` presumably compensates for padding
                # added by the segmentation helpers - confirm.
                if word_img.shape[1] >= estimated_width * (1 + split_th):
                    # word image is wider than expected:
                    # we split it into two sub-images, and align tsc to the leftmost.
                    black_px_per_column = np.count_nonzero(
                        cv2.bitwise_not(word_img), axis=0
                    )
                    # Search for the split column only inside the tolerance
                    # window around the estimated width, and cut at the column
                    # with the lowest count.
                    split_lo = int(estimated_width * (1 - split_th))
                    split_hi = int(estimated_width * (1 + split_th))
                    interval = black_px_per_column[split_lo:split_hi]
                    split_point = np.argmin(interval) + split_lo

                    word_img_cur = word_img[:, :split_point]
                    print('case split:', tsc_word)
                    alignment[fnm][(word_img_x + left_margin - 2,
                                    word_img_y,
                                    word_img_cur.shape[1],
                                    word_img_cur.shape[0])] = tsc_word

                    # The remainder goes back to the front of the queue so the
                    # next transcription word is matched against it.
                    word_img_next = word_img[:, split_point:]
                    word_imgs.appendleft(
                        (word_img_next, word_img_x + split_point, word_img_y)
                    )
                elif word_img.shape[1] <= estimated_width * (1 - merge_th):
                    # word image is shorter than expected:
                    # we merge it with following word images.
                    to_combine = [
                        (word_img_x, word_img_y, word_img.shape[1], word_img.shape[0])
                    ]
                    while max_bbx(to_combine)[2] <= estimated_width * (1 - merge_th):
                        try:
                            word_img, word_img_x, word_img_y = word_imgs.popleft()
                        except IndexError:
                            # Out of images: keep whatever was merged so far.
                            print('Alignment for', tsc_word, 'failed: no word images left')
                            break
                        to_combine.append(
                            (word_img_x, word_img_y, word_img.shape[1], word_img.shape[0])
                        )
                    w_x, w_y, w_w, w_h = max_bbx(to_combine)
                    print('case merge:', tsc_word)
                    alignment[fnm][(w_x + left_margin - 2, w_y, w_w, w_h)] = tsc_word
                else:
                    # align
                    print('case aligned:', tsc_word)
                    alignment[fnm][(word_img_x + left_margin - 2,
                                    word_img_y,
                                    word_img.shape[1],
                                    word_img.shape[0])] = tsc_word
            print()

        # Save every aligned word as an image crop plus a text file, both
        # named after the bounding box. This has to finish before the page is
        # annotated below, because the annotation draws on page_img_og itself
        # and would otherwise leak into the crops.
        for (bb_x, bb_y, bb_w, bb_h), tsc in alignment[fnm].items():
            bbx_name = '{}_{}_{}_{}'.format(bb_x, bb_y, bb_w, bb_h)
            cv2.imwrite(
                os.path.join(img_dst_dir, bbx_name + '.png'),
                page_img_og[bb_y:bb_y + bb_h, bb_x:bb_x + bb_w]
            )
            with open(os.path.join(tsc_dst_dir, bbx_name + '.txt'), 'w') as f:
                f.write(tsc)

        # Draw the boxes and their transcriptions on the page and write the
        # annotated page to the current working directory.
        for (bb_x, bb_y, bb_w, bb_h), tsc in alignment[fnm].items():
            cv2.rectangle(
                page_img_og,
                (bb_x, bb_y),
                (bb_x + bb_w, bb_y + bb_h),
                192,
                1
            )
            cv2.putText(
                page_img_og,
                tsc,
                (bb_x, bb_y),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                64,
                1,
                cv2.LINE_AA
            )
        cv2.imwrite(fnm + '_TEXT.png', page_img_og)

    # Serialize all pages; within a page the boxes are ordered top-to-bottom,
    # then left-to-right, and keyed by "x_y_w_h".
    bounding_boxes = OrderedDict(
        [(page_k,
          OrderedDict([('_'.join(map(str, bbx_k)), tsc_v)
                       for bbx_k, tsc_v in sorted(bbxs.items(),
                                                  key=lambda x: (x[0][1], x[0][0]))]))
         for page_k, bbxs in alignment.items()]
    )
    with open('bounding_boxes_elena.json', 'w') as bb_f:
        json.dump(bounding_boxes, bb_f, indent=2)