In [1]:
import os
from PIL import Image, ImageOps
import pandas as pd
import numpy as np
import joblib
import cv2

In [2]:
PATH = "tmp_stian/" #Folders with images

In [3]:
def transform_images(path):
    """ 
    Transforms all provided images by mirroring and rotation. Requires os and PIL-package. Deletes original image files.

    Parameters: 
    path (string): Path to directory where all image folders that should be transformed are.
    """
    
    for directory in os.listdir(path):
        if directory[0] == ".": #Dont traverse folders such as .DS_Store
            continue
        for image in os.listdir(path+directory):
            if image.endswith(".png"):  
                img = Image.open(path+directory + "/" + image)
                mirrored_img = ImageOps.mirror(img)
                for i in [0, 45, 90, 135, 180, 225, 270, 315]:
                    img.rotate(i).save(path+directory + "/" +str(image[:-4]) + "_r" + str(i) + ".png")
                    mirrored_img.rotate(i).save(path+directory + "/" +str(image[:-4]) + "_r" + str(i) + "_m.png")
                if os.path.exists(path+directory + "/" + image):
                        os.remove(path+directory + "/" + image)

In [6]:
def create_dataset(path):
    """ 
    Creates a pandas dataframe containing vectorized 50x50 binary images with labels. Indexed by filepath + filename
    
    Parameters: 
    path (string): Path to directory where all image folders are
    """
    
    file_names = [] #Indexes
    files = [] #Vectorized images
    counts = [] #Label
    units = ["zero", "one", "two", "three", "four", "five"] 
    for directory in os.listdir(path):
        if directory[0] == ".":
            continue
        else:
            count = units.index(directory.split("_")[0]) #Convert number from string to integer
        for file in os.listdir(path+directory):
            filename = os.fsdecode(file)
            if filename.endswith(".png"):
                file_names.append(directory+"/"+filename)
                counts.append(count)
                f = np.ravel(cv2.imread(path + directory +"/"+ filename, 0))%254 #Vectorize and convert to 0 or 1 val
                files.append(f.tolist())

    return pd.DataFrame({"image": files, "count":counts}, index=file_names)

In [4]:
transform_images(PATH)

In [7]:
dataset = create_dataset(PATH)

In [8]:
dataset.head()

Unnamed: 0,image,count
one_1/341_r90_m.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
one_1/319_r180_m.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
one_1/342_r315_m.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
one_1/286_r225_m.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
one_1/340_r45.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1


In [9]:
len(dataset)

6528

In [10]:
joblib.dump(dataset, "stian_dataset_all") #Store the dataset as a python object

['stian_dataset_all']