In [2]:
from PIL import Image, ImageFilter, ImageOps 
from collections import deque as queue
import matplotlib.pyplot as plt
import numpy as np
import os
from io import BytesIO
        
# Input directory with the raw scans and the two output directories.
# Each path ends with a separator so a bare file name can be appended to it.
dir_path = './images/'
clean_path = './clean_images/'
clean_border_path = './clean_images_with_border/'

# Names of all entries in the input directory, in sorted order so that every
# later list (images, arrays, saved files) shares the same indexing.
file_names = sorted(os.listdir(dir_path))

# Preprocess Images

## Load Images

In [3]:
# Open every listed file from the input directory.
images = [Image.open(os.path.join(dir_path, name)) for name in file_names]
# Single-channel ('L' mode) version of each page, used by all later steps.
gray_images = [page.convert('L') for page in images]

## Convert to B/W

In [4]:
# Returns the numpy filter for the boxing of rows
def get_range_filter(array, split_size):
    """Build a boolean mask selecting the central band of ``array``.

    The first half of ``array`` is scanned for upward jumps (next value
    exceeds the current one by more than ``split_size``); the index of the
    LAST such jump becomes the start of the band. The second half is scanned
    for downward jumps (current value exceeds the next one by more than
    ``split_size``); the index of the FIRST such jump becomes the exclusive
    end of the band. When no jump is found on a side, the band extends to
    that end of the array.

    Parameters
    ----------
    array : 1-D sequence of numbers
        Values to scan (the caller passes per-row maxima of an image).
    split_size : number
        Minimum difference between neighbours that counts as a jump.

    Returns
    -------
    numpy.ndarray of bool, shape ``(len(array),)``
        True inside ``[first_jump_index, last_jump_index)``, False elsewhere.
        An empty input yields an empty boolean array.
    """
    length = len(array)
    first_true = 0
    last_true = length
    border = length // 2

    # Values are converted to Python ints before subtracting so that
    # unsigned 8-bit inputs cannot wrap around on negative differences.
    for i in range(border):
        if int(array[i + 1]) - int(array[i]) > split_size:
            first_true = i  # no break: keep the jump closest to the middle

    for i in range(border, length - 1):
        if int(array[i]) - int(array[i + 1]) > split_size:
            last_true = i
            break

    # np.zeros with an explicit dtype stays boolean even for empty input
    # (np.array([]) would silently become float64 and be unusable as a mask).
    mask = np.zeros(length, dtype=bool)
    mask[first_true:last_true] = True
    return mask

In [5]:
# For every grayscale page: crop to the bright region, flatten the vertical
# brightness gradient, then binarise. Results are collected as uint8 arrays
# (clean_arrays) and as PIL images built from them (clean_border_images).
clean_arrays = []
clean_border_images = []
for index, image in enumerate(gray_images):
    array = np.asarray(image)

    # Filter along columns: keep columns whose brightest pixel is above the
    # mean of all column maxima.
    col_maxs = np.amax(array, axis=0)
    col_filter = col_maxs > np.mean(col_maxs)
    array = array[:, col_filter]

    # Filter along rows - loop approach: keep the band between the innermost
    # brightness jumps (threshold 40) found by get_range_filter.
    row_maxs = np.amax(array, axis=1)
    row_filter = get_range_filter(row_maxs, 40)
    array = array[row_filter, :]

    # Make brightness uniform: fit a quadratic to the mean brightness of each
    # row and subtract 70% of its deviation from its own mean.
    x = np.arange(len(array))
    y = np.mean(array, axis=1)

    # Evaluate the fit with np.polyval on the raw polyfit coefficients. This
    # avoids reaching into poly1d internals and still works when the fitted
    # quadratic term is exactly zero (poly1d would drop that coefficient).
    avg_pts = np.polyval(np.polyfit(x, y, 2), x)
    target = (avg_pts - np.mean(avg_pts)) * 0.7

    # Broadcast the per-row correction over all columns; the cast truncates
    # towards zero exactly like assignment into an integer array does.
    new_arr = (array - target[:, np.newaxis]).astype(int)

    # Make black and white: pixels brighter than (mean - one standard
    # deviation) become 255, everything else 0.
    thresh = new_arr.mean() - new_arr.std()
    array = (new_arr > thresh) * 255

    # Store the image
    clean_arrays.append(array.astype('ubyte'))
    clean_border_images.append(Image.fromarray(clean_arrays[-1]))
    print(index + 1, "/", len(gray_images))

1 / 64
2 / 64
3 / 64
4 / 64
5 / 64
6 / 64
7 / 64
8 / 64
9 / 64
10 / 64
11 / 64
12 / 64
13 / 64
14 / 64
15 / 64
16 / 64
17 / 64
18 / 64
19 / 64
20 / 64
21 / 64
22 / 64
23 / 64
24 / 64
25 / 64
26 / 64
27 / 64
28 / 64
29 / 64
30 / 64
31 / 64
32 / 64
33 / 64
34 / 64
35 / 64
36 / 64
37 / 64
38 / 64
39 / 64
40 / 64
41 / 64
42 / 64
43 / 64
44 / 64
45 / 64
46 / 64
47 / 64
48 / 64
49 / 64
50 / 64
51 / 64
52 / 64
53 / 64
54 / 64
55 / 64
56 / 64
57 / 64
58 / 64
59 / 64
60 / 64
61 / 64
62 / 64
63 / 64
64 / 64


## Clean Borders

In [6]:
def recolor_border(arr, vis, from_color, to_color, start):
    """Flood-fill the 4-connected region of ``from_color`` containing ``start``.

    Every pixel reachable from ``start`` through up/right/down/left steps over
    pixels equal to ``from_color`` is set to ``to_color``. Both ``arr`` and
    ``vis`` are modified in place; nothing is returned.

    The call is a no-op when the start pixel does not hold ``from_color`` or
    has already been visited. Without that check a seed of another colour
    would be overwritten and the fill would leak into neighbouring
    ``from_color`` regions that the seed is not part of.

    Parameters
    ----------
    arr : 2-D indexable (``arr[row][col]``)
        Pixel grid to recolour.
    vis : 2-D indexable of bool, same shape as ``arr``
        Visited markers shared between calls; set to True for every pixel
        that gets queued.
    from_color, to_color
        Value to replace and its replacement.
    start : tuple (row, col)
        Seed position; must be a valid index into ``arr``.
    """
    (x, y) = start

    # Only start from an unvisited pixel that actually has the source colour.
    if vis[x][y] or arr[x][y] != from_color:
        return

    n_rows = len(arr)
    n_cols = len(arr[0])
    # Offsets of the four direct neighbours: up, right, down, left.
    steps = ((-1, 0), (0, 1), (1, 0), (0, -1))

    q = queue()
    q.append(start)
    vis[x][y] = True

    while q:
        (x, y) = q.popleft()
        arr[x][y] = to_color

        for d_row, d_col in steps:
            new_x = x + d_row
            new_y = y + d_col
            inside = 0 <= new_x < n_rows and 0 <= new_y < n_cols
            # Mark as visited when queued so no pixel is queued twice.
            if inside and not vis[new_x][new_y] and arr[new_x][new_y] == from_color:
                q.append((new_x, new_y))
                vis[new_x][new_y] = True
                
# Produce a border-free copy of every cleaned page: each 0-valued region
# reachable from the outer frame of the page is repainted with 255.
clean_images = []
for ind, arr in enumerate(clean_arrays):
    arr = arr.copy()  # leave the array stored in clean_arrays untouched
    rows, cols = len(arr), len(arr[0])
    vis = np.zeros((rows, cols), dtype=bool)

    # Seed positions: left/right edge pixel of every row, followed by the
    # bottom/top edge pixel of every column.
    side_seeds = [(i, j) for i in range(rows) for j in (0, cols - 1)]
    cap_seeds = [(i, j) for j in range(cols) for i in (rows - 1, 0)]
    for seed in side_seeds + cap_seeds:
        recolor_border(arr, vis, 0, 255, seed)

    clean_images.append(Image.fromarray(arr))
    print(ind + 1, "/", len(clean_arrays))

1 / 64
2 / 64
3 / 64
4 / 64
5 / 64
6 / 64


KeyboardInterrupt: 

## Save clean images

In [7]:
# Write every bordered black/white page as a PNG under its source file name.
# The output directory is created first so the save does not fail with
# FileNotFoundError when it does not exist yet.
os.makedirs(clean_border_path, exist_ok=True)
for i, clean_image in enumerate(clean_border_images):
    clean_image.save(clean_border_path + file_names[i], "PNG")

# Saving of the border-cleaned pages is currently disabled.
# NOTE(review): the "Load Saved Images" cell reads from clean_path, so those
# files presumably have to exist from an earlier run - confirm.
#for i, clean_image in enumerate(clean_images):
#    clean_image.save(clean_path + file_names[i], "PNG")

# Image Segmentation

## Load Saved Images

In [None]:
# Re-open the previously saved pages found under clean_path, one per name
# in file_names.
clean_images = [Image.open(os.path.join(clean_path, name)) for name in file_names]

## Split Characters

### Function to Split an image

In [None]:
def split_image(img_arr, vertical: bool = False):
    """Cut a 2-D array into pieces at lines whose values sum to zero.

    Consecutive rows with a non-zero sum form one piece; rows summing to
    zero act as separators and are dropped. With ``vertical=True`` the same
    is done on columns instead of rows.

    Parameters
    ----------
    img_arr : numpy.ndarray, 2-D
        Array to split.
    vertical : bool, default False
        Split along columns rather than rows.

    Returns
    -------
    list of numpy.ndarray
        The pieces in their original order and orientation; empty when every
        line sums to zero.
    """
    # Work on rows only; the column case is handled through a transpose.
    lines = np.transpose(img_arr) if vertical else img_arr

    pieces = []
    run_start = None  # index of the first line of the open run, if any
    for idx, line_total in enumerate(lines.sum(axis=1)):
        if line_total == 0:
            # A separator line closes the run that is currently open.
            if run_start is not None:
                pieces.append(np.array(lines[run_start:idx], order='C'))
                run_start = None
        elif run_start is None:
            run_start = idx

    # A run that reaches the last line has not been closed by a separator.
    if run_start is not None:
        pieces.append(np.array(lines[run_start:], order='C'))

    if vertical:
        pieces = [np.transpose(piece) for piece in pieces]
    return pieces

### Split rows

In [None]:
# row_arr[i][j]: 2-D pixel array of text row j on page i
# (a list of pages, each holding a list of row arrays).
row_arr = []

for clean_image in clean_images:
    arr = np.asarray(clean_image)

    # split_image treats all-zero lines as gaps, so a page that is mostly
    # bright (white background) is inverted to get a black background.
    mostly_white = arr.mean() > 128
    if mostly_white:
        arr = 255 - arr

    row_arr.append(split_image(arr))

### Remove Extra Rows

In [244]:
# Hard-coded value deduced from the output of the code in the next cell:
# rows up to 23 lines tall are discarded.
row_min_height = 23 + 1

# Per page, keep only the rows that reach the minimum height.
row_arr_temp = [
    [row for row in page if row.shape[0] >= row_min_height]
    for page in row_arr
]

row_arr = row_arr_temp

In [None]:
# Count how often each row height occurs across all pages
# (row height in array lines -> number of rows with that height).
row_height = {}
# Rows that are exactly 23 lines tall, collected for manual inspection.
# NOTE(review): if the cell that drops rows shorter than row_min_height (24)
# has already run, this list stays empty.
temp = []
for page in row_arr:
    for row in page:
        height = row.shape[0]
        row_height[height] = row_height.get(height, 0) + 1
        if height == 23:
            temp.append(row)

# Optional manual inspection helpers:
# sorted(row_height.items(), key=lambda x: x[0], reverse=False)
# print(len(temp))
# Image.fromarray(temp[2])

# Dict keys and values iterate in the same order, so x[k] pairs with y[k].
x = list(row_height.keys())
y = list(row_height.values())

plt.scatter(x, y)
plt.xlabel("row height")
plt.ylabel("number of rows")
plt.title("Distribution of row heights")
plt.show()

### Split Words

In [None]:
# word_arr[i][j][k]: 2-D pixel array of word k in row j on page i
# (nested lists: pages -> rows -> word arrays).
word_arr = []

for i, page in enumerate(row_arr):
    page_words = []
    word_arr.append(page_words)
    for row in page:
        # Round-trip the row through an in-memory JPEG. The original author
        # uses this as a blur so that whole words, rather than individual
        # characters, end up separated by all-zero columns.
        temp = BytesIO()
        Image.fromarray(row).save(temp, "jpeg")
        row = np.asarray(Image.open(temp))

        # Split the re-decoded row along columns.
        page_words.append(split_image(row, True))

    print(i + 1, "/", len(row_arr))