In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from PIL import Image, ImageChops
import seaborn as sns
import os
from tqdm.auto import tqdm
import time, gc
import collections as coll
import matplotlib.pyplot as plt
import random
import cv2
from matplotlib.font_manager import FontProperties

In [2]:
# Load the four training-image parquet shards into pandas DataFrames.
# The individual frames are kept under their own names because the next
# cell combines them.
shard_paths = (
    'train_image_data_0.parquet',
    'train_image_data_1.parquet',
    'train_image_data_2.parquet',
    'train_image_data_3.parquet',
)
train_images_0, train_images_1, train_images_2, train_images_3 = (
    pq.read_table(shard_path).to_pandas() for shard_path in shard_paths
)

In [3]:
# Stack the four shards into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. Its defaults (ignore_index=False,
# sort=False) match what the chained .append() calls produced, and a single
# concat avoids re-copying the growing frame three times.
# Note: each shard's original index is preserved, so index labels repeat
# across shards; downstream code addresses rows positionally via .iloc.
train_images = pd.concat(
    [train_images_0, train_images_1, train_images_2, train_images_3]
)

In [4]:
def crop_surrounding_whitespace(image):
    """Crop away the uniform border surrounding the content of an image.

    The border colour is taken from the top-left pixel, i.e. this
    implementation assumes the empty margin has the same colour as the
    pixel at (0, 0).

    :param image: PIL image
    :rtype: PIL image (cropped)
    """
    corner_colour = image.getpixel((0, 0))
    background = Image.new(image.mode, image.size, corner_colour)

    # Per-pixel absolute difference from a flat image of the border colour.
    delta = ImageChops.difference(image, background)
    # (delta + delta) / 2 - 50, clipped: pixels that differ from the
    # background by 50 or less drop to zero and so do not extend the bbox.
    delta = ImageChops.add(delta, delta, 2, -50)

    # Bounding box of the remaining non-zero region; crop() is handed
    # whatever getbbox() returns.
    content_box = delta.getbbox()
    return image.crop(content_box)

In [5]:
# Turn every row of train_images into a cropped PIL image.
# Column 0 is skipped (presumably the image id -- confirm against the
# parquet schema); the remaining values are reshaped to a 137 x 236
# (height x width) array and converted to uint8 before PIL wraps them.
images = [
    crop_surrounding_whitespace(
        Image.fromarray(
            train_images.iloc[row_idx, 1:]
            .to_numpy()
            .astype(int)
            .reshape(137, 236)
            .astype(np.uint8)
        )
    )
    for row_idx in range(len(train_images))
]

In [6]:
# Sanity check: number of cropped images built from train_images.
len(images)

In [7]:
# Collect the two components of each cropped image's PIL size.
# NOTE: PIL's Image.size is (width, height), so despite the names `rows`
# holds the widths (size[0]) and `cols` holds the heights (size[1]).
rows = [width for width, _ in (picture.size for picture in images)]
cols = [height for _, height in (picture.size for picture in images)]

In [8]:
# Mean, median and maximum of `rows` (the size[0] values of the cropped
# images), followed by a histogram of their distribution.
for summarise in (np.mean, np.median, np.max):
    print(summarise(rows))

fig, ax = plt.subplots()
ax.hist(rows, bins='auto')
plt.show()

In [9]:
# Mean, median and maximum of `cols` (the size[1] values of the cropped
# images), followed by a histogram of their distribution.
for summarise in (np.mean, np.median, np.max):
    print(summarise(cols))

fig, ax = plt.subplots()
ax.hist(cols, bins='auto')
plt.show()

In [10]:
# Target size = median of the cropped image sizes.
# `rows`/`cols` come from PIL's Image.size, which is (width, height), so
# (num_rows, num_cols) is passed to resize() as (width, height).
num_rows = int(np.median(rows))  # 106 in the original run
num_cols = int(np.median(cols))  # 87 in the original run

for i in range(len(images)):
    # Entries that are no longer PIL images were already converted by an
    # earlier run of this cell; skipping them keeps the cell re-runnable.
    if not isinstance(images[i], Image.Image):
        continue
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # resampling filter under its current name.
    resized = images[i].resize((num_rows, num_cols), Image.LANCZOS)
    pixels = np.array(resized)
    # Light non-local-means denoising (filter strength h=3).
    pixels = cv2.fastNlMeansDenoising(pixels, h=3)
    # Binarise: pixels above 200 become 1, everything else 0.
    # cv2.threshold returns (retval, image); keep the image.
    images[i] = cv2.threshold(pixels, 200, 1, cv2.THRESH_BINARY)[1]

In [11]:
# Replace every image array, in place, with a flattened 1-D copy of its
# pixels.
for idx, pixel_grid in enumerate(images[:len(train_images)]):
    images[idx] = pixel_grid.flatten()

In [12]:
# Persist the preprocessed images: one DataFrame row per entry of `images`,
# converted to an Arrow table and written out as a parquet file.
df = pd.DataFrame(data=images)
table_from_pandas = pa.Table.from_pandas(df)
pq.write_table(
    table_from_pandas,
    where='berrybengali_preprocessed.parquet',
)