Skip to content

Commit

Permalink
improved OCR and colname-column sorting
Browse files Browse the repository at this point in the history
  • Loading branch information
Chilipp committed Dec 11, 2018
1 parent 82f2922 commit 42ca7ab
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 19 deletions.
38 changes: 31 additions & 7 deletions straditize/colnames.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ def bounds(self):
def extents(self):
return sorted([self.x0, self.x1]) + sorted([self.y0, self.y1])

@property
def crop_extents(self):
"""The extents necessary for PIL.Image.crop"""
return self.left, self.top, self.right, self.bottom

@property
def corners(self):
return np.array([
Expand Down Expand Up @@ -364,6 +369,10 @@ def rotate_image(self, image):
return ret

def recognize_text(self, image):
if tesseract_version is None:
raise ImportError(
"tesseract is required but could not be found! Make sure, the "
"directory of its executable is in your PATH variable.")
fname = tempfile.NamedTemporaryFile(
suffix='.png', prefix='stradi_').name
fname2 = tempfile.NamedTemporaryFile(
Expand All @@ -388,14 +397,18 @@ def get_overlap(col, box):
xmax = self.transform_point(
box.x0 + x0, box.y1 + y0, invert=True, image=image)[0]
xmin, xmax = sorted([xmin, xmax])
return min(e, xmax) - max(s, xmin)
return max(min(e, xmax) - max(s, xmin), 0)

def vbox_distance(b1, b2):
if b1.left > b2.right or b1.right < b2.left:
return np.inf # no overlap
return min(abs(b1.top - b2.bottom), abs(b2.top - b1.bottom))

if tesserocr is None:
raise ImportError("tesserocr module not found!")

bounds = self.column_bounds
cols = list(range(len(bounds)))
hr = self.highres_image is not None
rotated = self.rotated_image
if hr:
Expand Down Expand Up @@ -424,15 +437,25 @@ def vbox_distance(b1, b2):
*extents[:2], image=self.highres_image if hr else self.image,
invert=True)

if tesseract_version.startswith('4.0.'):
# LC_ALL might have been changed by some other module, so we set
# it here again to "C"
import locale
locale.setlocale(locale.LC_ALL, 'C')

with tesserocr.PyTessBaseAPI() as api:
api.SetImage(rgba2rgb(image))
im_boxes = api.GetComponentImages(tesserocr.RIL.TEXTLINE, True)
texts = {}
images = {}
for i, (im, d, _, _) in enumerate(im_boxes):
box = Bbox(d['x'], d['y'], d['w'], d['h'])
api.SetRectangle(*box)
text = api.GetUTF8Text().strip()
box = Bbox(**d)
if not any(get_overlap(col, box) for col in cols):
continue
# expand the image to improve text recognition
im = ImageOps.expand(rgba2rgb(image.crop(box.crop_extents)),
int(im.size[1] / 2.), (255, 255, 255))
text = tesserocr.image_to_text(im).strip()
if len(text) >= 3:
texts[box] = text
images[box] = im.convert('RGBA')
Expand All @@ -448,23 +471,24 @@ def vbox_distance(b1, b2):
for b1, t in list(texts.items()):
if b1 in merged:
continue
col = max(cols, key=partial(get_overlap, box=b1))
for b2, t in list(texts.items()):
if (b1 is b2 or b2 in merged or
if (b1 is b2 or b2 in merged or not get_overlap(col, b2) or
vbox_distance(b1, b2) > 0.5*em):
continue
merged.update([b1, b2])
box = Bbox(min(b1.x, b2.x), min(b1.y, b2.y),
max(b1.x1, b2.x1) - min(b1.x0, b2.x0),
max(b1.y0, b2.y0) - min(b1.y1, b2.y1))
texts[box] = texts[b1] + ' ' + texts[b2]
images[box] = image.crop([box.x0, box.y1, box.x1, box.y0])
images[box] = image.crop(box.crop_extents)
b1 = box
for b in merged:
del texts[b], images[b]

# get a mapping from box to column from the overlap
boxes = dict(filter(
lambda t: get_overlap(*t) > 0,
lambda t: get_overlap(*t),
((col, max(texts, key=partial(get_overlap, col)))
for col in range(len(bounds)))))
x0, y0 = extents[:2]
Expand Down
21 changes: 13 additions & 8 deletions straditize/widgets/colnames.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ def update_image(self, *args, **kwargs):
self.plot_colpic()
self.btn_select_colpic.setText('Apply')
self.btn_cancel_colpic_selection.setVisible(True)
self.btn_recognize.setEnabled(True)
self.btn_recognize.setEnabled(
self.should_be_enabled(self.btn_recognize))

def highlight_selected_col(self):
draw = False
Expand All @@ -309,7 +310,8 @@ def highlight_selected_col(self):
if colpic is not None:
self.colpic_im = self.colpic_ax.imshow(colpic)
self.colpic_canvas.draw()
self.btn_recognize.setEnabled(colpic is not None)
self.btn_recognize.setEnabled(
self.should_be_enabled(self.btn_recognize))
draw = True
else:
self.btn_select_colpic.setEnabled(False)
Expand All @@ -330,8 +332,13 @@ def should_be_enabled(self, w):
ret = self.straditizer is not None and getattr(
self.straditizer.data_reader, '_column_starts', None) is not None
if ret and w is self.btn_find:
ret = (self.cb_find_all_cols.isChecked() or
self.current_col is not None)
from straditize.colnames import tesserocr
ret = tesserocr is not None and (
self.cb_find_all_cols.isChecked() or
self.current_col is not None)
elif ret and w is self.btn_recognize:
from straditize.colnames import tesseract_version
ret = tesseract_version is not None and self.colpic is not None
return ret

def toggle_dialog(self):
Expand Down Expand Up @@ -386,7 +393,8 @@ def refresh(self):
self.btn_load_image.blockSignals(True)
self.btn_load_image.setChecked(checked)
self.btn_load_image.blockSignals(False)
self.btn_recognize.setEnabled(self.colpic is not None)
self.btn_recognize.setEnabled(
self.should_be_enabled(self.btn_recognize))
else:
self.colnames_table.setRowCount(0)
self.remove_images()
Expand Down Expand Up @@ -511,9 +519,6 @@ def _find_colnames(self):

def find_colnames(self, warn=True, full_image=False, all_cols=None):
"""Find the column names automatically"""
from straditize.colnames import tesserocr
if tesserocr is None:
raise ImportError("tesserocr module not found!")
ys, xs = self.im_rotated.get_size()
x0, x1 = self.main_ax.get_xlim() if not full_image else (0, xs)
y0, y1 = sorted(self.main_ax.get_ylim()) if not full_image else (0, ys)
Expand Down
Binary file modified tests/test_figures/colnames_diagram-colnames.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 5 additions & 4 deletions tests/widgets/test_colnames.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,11 @@ def test_find_all_column_names(self):
self.assertIsNone(sw.colnames_manager.find_colnames(warn=False,
full_image=True))

self.assertEqual(
list(reader.column_names),
['Charcoal', 'Pinus', 'Juniperus', 'Plantago coronopus',
'Pteridium', 'Filicales', 'Pollen Concentration'])
# test whether we found all column names (whether they are correct or
# not...)
self.assertFalse(
set(map(str.lower, reader.column_names)).intersection(
map(str, range(len(reader.column_names)))))

@unittest.skipIf(tesserocr is None, "requires tesserocr")
def test_find_one_column_name(self):
Expand Down

0 comments on commit 42ca7ab

Please sign in to comment.