In [1]:
# Preprocessing the images once up front to make training faster

In [1]:
import skimage
import numpy as np
import os
from tqdm import tqdm
import h5py
from icecream import ic
import matplotlib.pyplot as plt

In [5]:
def preprocessing(image_path, shape):
    """Load an image, make it 3-channel and square, and resize it.

    Steps:
      1. Read the file with ``skimage.io.imread``.
      2. 4-channel (RGBA) images are converted with ``skimage.color.rgba2rgb``;
         2-D (single-channel) images are stacked three times along a new last
         axis. No grayscale conversion is performed.
      3. The shorter side is padded (``np.pad`` default, i.e. constant zeros)
         so the image becomes exactly square. When the size difference is odd,
         the extra pixel goes to the bottom/right side.
      4. The square image is resized to ``(shape, shape)`` with anti-aliasing.

    Args:
        image_path: Path of the image file to load.
        shape: Side length, in pixels, of the square output image.

    Returns:
        ``np.ndarray`` holding the result of ``skimage.transform.resize``.
    """
    img = skimage.io.imread(image_path)

    # Normalise the channel layout so the padding below can always use a
    # three-axis pad specification.
    if img.ndim == 3:
        if img.shape[2] == 4:
            img = skimage.color.rgba2rgb(img)
    else:
        img = np.dstack([img, img, img])

    # Pad the shorter side up to the longer one. Splitting the difference as
    # (diff // 2, diff - diff // 2) keeps the result exactly square even when
    # the difference is odd; padding the same truncated half on both sides
    # would leave it one pixel short and the resize would distort the image.
    height, width = img.shape[0], img.shape[1]
    diff = abs(width - height)
    pad_before = diff // 2
    pad_after = diff - pad_before
    if width > height:
        img = np.pad(img, ((pad_before, pad_after), (0, 0), (0, 0)))
    elif height > width:
        img = np.pad(img, ((0, 0), (pad_before, pad_after), (0, 0)))

    # resizes the image to shape x shape
    img = skimage.transform.resize(img, (shape, shape), anti_aliasing=True)

    return np.array(img)

# The dataset I used for the mark classifier

In [16]:
# Count the samples available: every entry of every class folder under
# data/the_cleanest_data contributes one sample.
data_root = os.path.join("data", "the_cleanest_data")
n_samples = sum(
    len(os.listdir(os.path.join(data_root, class_dir)))
    for class_dir in tqdm(os.listdir(data_root))
)
n_samples

100%|██████████| 4/4 [00:00<00:00,  7.56it/s]


1496

In [17]:
#creating indices for the sets
# A fixed seed makes the train/test split reproducible across kernel restarts.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
shuffled_indices = split_rng.permutation(n_samples)
n_test = (n_samples // 10) * 2  # two tenths of the samples go to the test set
testset_inds = shuffled_indices[:n_test]
trainingset_inds = shuffled_indices[n_test:]
# Set lookup is O(1) per sample; `num in ndarray` scans the whole index array.
test_index_set = set(testset_inds.tolist())

#going through all files, preprocess it and put it in the right set
testset = []
trainset = []
num = 0 #num is a running index over every sample, in directory-listing order
output_size = 2**10

# Folder name -> integer class label stored as the last element of each row.
class_labels = {"Bar": 0, "Line": 1, "Point": 2, "Area": 3}
data_root = os.path.join("data","the_cleanest_data")

#this loop looks at the scraped examples
for file in os.listdir(data_root):
    # Fail loudly on an unexpected folder instead of silently reusing the
    # label of the previously processed class.
    if file not in class_labels:
        raise ValueError(
            f"unexpected class folder {file!r}; expected one of {sorted(class_labels)}"
        )
    label = class_labels[file]
    flat_label = np.array([label], dtype=np.uint8)

    for f in tqdm(os.listdir(os.path.join(data_root, file)), desc=file):
        image_path = os.path.join(data_root, file, f)
        preprocessed_img = preprocessing(image_path, output_size)

        #converting the image to uint8 to save memory
        flat_img = (preprocessed_img * 255).astype(np.uint8).flatten()
        # Row layout: flattened pixels followed by the class label.
        img_label = np.concatenate([flat_img, flat_label])

        # Anything not drawn for the test set is training data, so a sample
        # added after n_samples was counted is kept rather than dropped.
        if num in test_index_set:
            testset.append(img_label)
        else:
            trainset.append(img_label)

        num += 1

ic(len(trainset),len(testset))

0


100%|██████████| 153/153 [01:38<00:00,  1.55it/s]


1


100%|██████████| 501/501 [05:27<00:00,  1.53it/s]


2


100%|██████████| 550/550 [05:46<00:00,  1.59it/s]


3


100%|██████████| 292/292 [03:02<00:00,  1.60it/s]
ic| len(trainset): 1198, len(testset): 298


(1198, 298)

In [5]:
#looking at the balance of the data
# Number of files per class folder. Counting with len() avoids iterating over
# every file just to increment a counter.
data_root = os.path.join("data","the_cleanest_data")
class_counts = {"Bar": 0, "Line": 0, "Point": 0, "Area": 0}
for file in os.listdir(data_root):
    # Entries that are not one of the four class folders are ignored.
    if file in class_counts:
        class_counts[file] = len(os.listdir(os.path.join(data_root, file)))

bar = class_counts["Bar"]
line = class_counts["Line"]
point = class_counts["Point"]
area = class_counts["Area"]

ic(bar,line,point,area)

100%|██████████| 153/153 [00:00<?, ?it/s]
100%|██████████| 501/501 [00:00<00:00, 502594.19it/s]
100%|██████████| 550/550 [00:00<00:00, 550302.29it/s]
100%|██████████| 292/292 [00:00<00:00, 291881.98it/s]
ic| bar: 501, line: 550, point: 292, area: 153


(501, 550, 292, 153)

In [18]:
#saving the dataset
# Each row is a flattened uint8 image followed by its class label.
train_arrays = np.array(trainset, dtype=np.uint8)
test_arrays = np.array(testset, dtype=np.uint8)

# Build the path with os.path.join: a literal backslash only acts as a
# directory separator on Windows, elsewhere it becomes part of the file name.
dataset_path = os.path.join("data", "1024_4labels_clean.h5py")
with h5py.File(dataset_path,"w") as f:
    f.create_dataset("testset", data=test_arrays)
    f.create_dataset("trainset", data=train_arrays)
print("everything completed")

everything completed
