In [9]:
import cv2 as cv
import numpy
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

In [10]:
def load_and_process_image(path):
    """Load an image from disk and return it as an inverted grayscale array.

    The file is read with ``cv.imread``, converted from BGR to a single
    grayscale channel and then bitwise-inverted, so dark strokes on a light
    background become bright strokes on a dark background.

    Args:
        path: Filesystem path of the image to load.

    Returns:
        The inverted grayscale image as returned by ``cv.bitwise_not``.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing file.
        ValueError: If the file exists but OpenCV could not decode it.
    """
    img = cv.imread(path)
    # cv.imread reports failure by returning None rather than raising, which
    # would otherwise surface later as an opaque assertion inside cvtColor.
    if img is None:
        if not os.path.isfile(path):
            raise FileNotFoundError('Image file not found: {}'.format(path))
        raise ValueError('Could not decode image: {}'.format(path))

    img_gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    return cv.bitwise_not(img_gray)

def find_and_crop_bounding_box(image):
    """Crop a grayscale image to the bounding box of its largest contour.

    The image is binarised at a fixed threshold of 127, contours are
    extracted from the binary mask, and the original (non-binarised) image
    is cropped to the upright bounding rectangle of the contour with the
    greatest area.

    Args:
        image: Single-channel image array (e.g. the output of
            ``load_and_process_image``).

    Returns:
        The cropped view of ``image``. If no contour is found (for example a
        blank image) the input is returned unchanged, since there is nothing
        to crop to.
    """
    _, thresh = cv.threshold(image, 127, 255, cv.THRESH_BINARY)

    # findContours returns (image, contours, hierarchy) on OpenCV 3.x but
    # (contours, hierarchy) on 2.x/4.x; the last two items are the same in
    # every version, so slice instead of unpacking a fixed arity.
    contours, _hierarchy = cv.findContours(
        thresh, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)[-2:]

    if len(contours) == 0:
        return image

    # In case of multiple contours, take the one with the biggest area.
    # Iterating in reverse makes max() resolve ties in favour of the last
    # contour, exactly as the former sorted(...)[-1] did.
    largest = max(reversed(contours), key=cv.contourArea)
    x, y, width, height = cv.boundingRect(largest)
    return image[y:y + height, x:x + width]

def vectorize(path, size=(30, 30)):
    """Turn an image file into a flat feature vector.

    The image is loaded and inverted by ``load_and_process_image``, cropped
    by ``find_and_crop_bounding_box``, resized to ``size`` and flattened
    row by row.

    Args:
        path: Filesystem path of the image to vectorise.
        size: Target ``(width, height)`` passed to ``cv.resize``. Defaults to
            ``(30, 30)``, which yields the original 900-element vector.

    Returns:
        A 1-D array with ``width * height`` elements.
    """
    img = load_and_process_image(path)
    img = find_and_crop_bounding_box(img)
    # The crop is stretched to the target size; aspect ratio is not preserved.
    img = cv.resize(img, tuple(size))
    # reshape(-1) derives the length from the resized image, so the vector
    # length can never drift out of sync with ``size``.
    return img.reshape(-1)

In [11]:
# Class id for every non-digit symbol folder; folders whose name is an
# integer (e.g. '0'-'9') use that integer as their label.
label_map = {
    '+': 10,
    '-': 11,
    'times': 12,
    'forward_slash': 13,
    '(': 14,
    ')': 15
}

# Root folder containing one sub-folder of images per symbol.
DATASET_DIR = 'dataset'

# One 'features;image_path;label' record per image.
dataset = []

# os.listdir order is arbitrary; sorting makes the record order reproducible.
for symbol in sorted(os.listdir(DATASET_DIR)):
    symbol_dir = os.path.join(DATASET_DIR, symbol)
    # Ignore stray files (e.g. .DS_Store) that are not symbol folders.
    if not os.path.isdir(symbol_dir):
        continue

    try:
        label = int(symbol)
    except ValueError:
        # Non-numeric folder names must be declared in label_map; anything
        # else (e.g. .ipynb_checkpoints) is reported and skipped.
        if symbol not in label_map:
            print('Skipping unknown symbol folder: {}'.format(symbol))
            continue
        label = label_map[symbol]

    print('Processing: {}'.format(symbol))
    for image_name in tqdm(sorted(os.listdir(symbol_dir))):
        image_path = os.path.join(symbol_dir, image_name)
        features = vectorize(image_path)
        # Serialise the pixels explicitly instead of relying on str(ndarray),
        # whose output is wrapped over several lines and elided with '...'
        # for long arrays depending on NumPy print options. The surrounding
        # brackets are kept so bracket-stripping parsers keep working, while
        # every record now stays on a single line.
        feature_text = '[{}]'.format(' '.join(map(str, features.tolist())))
        dataset.append('{};{};{}'.format(feature_text, image_path, label))

  1%|▏         | 44/3068 [00:00<00:06, 439.47it/s]

Processing: 8


100%|██████████| 3068/3068 [00:04<00:00, 690.88it/s]
  0%|          | 57/26520 [00:00<00:46, 568.44it/s]

Processing: 1


100%|██████████| 26520/26520 [00:42<00:00, 627.46it/s]
  0%|          | 64/26141 [00:00<00:40, 639.55it/s]

Processing: 2


100%|██████████| 26141/26141 [01:10<00:00, 373.19it/s]
  1%|          | 61/6914 [00:00<00:11, 604.36it/s]

Processing: 0


100%|██████████| 6914/6914 [00:10<00:00, 667.51it/s]
  2%|▏         | 50/3118 [00:00<00:06, 494.75it/s]

Processing: 6


100%|██████████| 3118/3118 [00:04<00:00, 640.43it/s]
  0%|          | 48/10909 [00:00<00:22, 474.78it/s]

Processing: 3


100%|██████████| 10909/10909 [00:17<00:00, 640.37it/s]
  0%|          | 42/25112 [00:00<01:00, 414.95it/s]

Processing: +


100%|██████████| 25112/25112 [00:39<00:00, 628.13it/s]
 34%|███▍      | 68/199 [00:00<00:00, 677.56it/s]

Processing: forward_slash


100%|██████████| 199/199 [00:00<00:00, 689.01it/s]
  2%|▏         | 58/3737 [00:00<00:06, 579.81it/s]

Processing: 9


100%|██████████| 3737/3737 [00:05<00:00, 663.97it/s]
  0%|          | 39/33997 [00:00<01:28, 385.43it/s]

Processing: -


100%|██████████| 33997/33997 [00:56<00:00, 602.93it/s]
  0%|          | 41/14294 [00:00<00:35, 398.59it/s]

Processing: (


100%|██████████| 14294/14294 [00:24<00:00, 577.55it/s]
  2%|▏         | 58/2909 [00:00<00:04, 576.76it/s]

Processing: 7


100%|██████████| 2909/2909 [00:04<00:00, 604.65it/s]
  1%|▏         | 49/3545 [00:00<00:07, 487.29it/s]

Processing: 5


100%|██████████| 3545/3545 [00:05<00:00, 604.14it/s]
  1%|          | 44/7396 [00:00<00:16, 434.55it/s]

Processing: 4


100%|██████████| 7396/7396 [00:12<00:00, 596.88it/s]
  2%|▏         | 63/3251 [00:00<00:05, 625.55it/s]

Processing: times


100%|██████████| 3251/3251 [00:05<00:00, 619.15it/s]
  0%|          | 45/14355 [00:00<00:32, 446.85it/s]

Processing: )


100%|██████████| 14355/14355 [00:24<00:00, 596.96it/s]


In [12]:
# Dump every collected record to disk, one record after another separated by
# newlines (no trailing newline is written).
with open('dataset.csv', 'w') as csv_file:
    serialized_records = '\n'.join(dataset)
    csv_file.write(serialized_records)