In [21]:
from __future__ import print_function, division

import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from PIL import Image
import shutil
from skimage.filters import threshold_otsu, rank
from skimage.morphology import dilation, disk, erosion
from skimage import feature
import random

# Root directory that holds the input .npy files; the generated dataset folder is created beneath it.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
preceding_path = "/blue/eel5840/justin.rossiter"
# Name of the output folder (under preceding_path) that receives the train/ and val/ trees.
folder_name = "final_project_team_square_root"

plt.ion()   # interactive mode
# Reference: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

def generateImageAndLabel(split, i, idx):
    """Export one sample as a resized JPEG plus a one-line label file.

    The bounding box is measured on the raw slice ``dataset[:, :, idx]`` and
    expressed in normalized coordinates, so it stays valid after the image is
    resized to 320x320 for saving.

    Args:
        split: Sub-folder of the output tree to write into (e.g. "train" or "val").
        i: Class id; written as the first label field and used in the file name.
        idx: Index of the sample along the last axis of the global ``dataset``.
    """
    pixels = dataset[:, :, idx]
    box = getMuAndDimsThroughEdges(getImageBounding(pixels))

    # Image and label share the same "<class>_<index>" stem.
    stem = str(i) + "_" + str(idx)
    split_dir = os.path.join(preceding_path, folder_name, split)

    resized = Image.fromarray(pixels, mode = 'L').resize((320, 320))
    resized.save(os.path.join(split_dir, "images", stem + ".jpg"))

    # Label line layout: class x_center y_center width height (no trailing newline).
    fields = [str(i)] + [str(value) for value in box]
    with open(os.path.join(split_dir, "labels", stem + ".txt"), "w") as f:
        f.write(" ".join(fields))
        
def generateAugmentedData(iteration, split):
    """Build one multi-tile image from random samples of a split and write its labels.

    Between two and four samples are drawn (with replacement) from the global
    ``train_indices`` or ``val_indices``, pasted side by side into a single
    grayscale strip that is 300 pixels high, and saved as
    ``augmented_<iteration>.jpg``.  The matching ``augmented_<iteration>.txt``
    receives one ``class x_center y_center width height`` line per tile, with
    coordinates normalized by the size of the whole strip.

    Args:
        iteration: Number used in the output file names.
        split: ``"train"`` or ``"val"``; selects both the index pool and the
            output sub-folder.

    Raises:
        ValueError: If ``split`` is neither ``"train"`` nor ``"val"``.
    """
    if split == "train":
        candidate_indices = train_indices
    elif split == "val":
        candidate_indices = val_indices
    else:
        # An unknown split used to fall through and fail later on an unbound ``idx``.
        raise ValueError("split must be 'train' or 'val', got " + repr(split))

    tile = 300  # tile side in pixels; getEdges is applied to each 300x300 tile
    size = random.randint(2, 4)
    image = Image.new('L', (size * tile, tile))
    class_ids = []
    boxes = np.zeros((size, 4))  # one row per tile: x_mu, y_mu, width, height

    for slot in range(size):
        idx = random.choice(candidate_indices)
        image_temp_arr = dataset[:, :, idx]
        image.paste(Image.fromarray(image_temp_arr, mode = 'L'), (slot * tile, 0))

        # Keep class ids as plain ints so the label file reads "3 ..." like the
        # files written by generateImageAndLabel, not "3.0 ...".
        class_id = int(labels[idx])
        if class_id == -1:
            class_id = 10  # the raw label -1 is stored as class 10
        class_ids.append(class_id)

        left_edge, right_edge, top_edge, bottom_edge = getEdges(getImageBounding(image_temp_arr))
        # Shift the horizontal edges from tile coordinates into strip coordinates.
        left_edge += tile * slot
        right_edge += tile * slot
        boxes[slot] = getMuAndDims(left_edge, right_edge, top_edge, bottom_edge, tile * size, tile)

    split_dir = os.path.join(preceding_path, folder_name, split)
    image.save(os.path.join(split_dir, "images", "augmented_" + str(iteration) + ".jpg"))
    with open(os.path.join(split_dir, "labels", "augmented_" + str(iteration) + ".txt"), "w") as f:
        for class_id, box in zip(class_ids, boxes):
            f.write(" ".join([str(class_id)] + [str(value) for value in box]) + "\n")
        
def getImageBounding(image_grayscale, footprint=None):
    """Threshold a grayscale image with Otsu's method and return the inverted, closed mask.

    The image is binarized at its global Otsu threshold, then dilated and
    eroded (a morphological closing), and finally inverted.

    Args:
        image_grayscale: 2-D grayscale image array.
        footprint: Optional structuring element forwarded to both ``dilation``
            and ``erosion`` (for example ``disk(15)``).  ``None`` (the default)
            lets scikit-image pick its default footprint, which is what this
            function has always used: the ``disk(15)`` it used to build was
            never passed to the morphology calls.

    Returns:
        Array equal to ``1 - mask``: 1 where the closed mask is False and 0
        where it is True.
    """
    # Otsu thresholding
    threshold_global_otsu = threshold_otsu(image_grayscale)
    global_otsu = image_grayscale >= threshold_global_otsu
    # Dilation followed by erosion with the same footprint
    image_dilated = dilation(global_otsu, footprint)
    image_erosion = erosion(image_dilated, footprint)
    return 1 - image_erosion
        
def getMuAndDimsThroughEdges(image_bounding):
    """Return the normalized bounding box of the non-zero region of a mask.

    Args:
        image_bounding: 2-D mask array (rows x columns); non-zero entries mark
            the object.

    Returns:
        Tuple ``(x_mu, y_mu, width, height)``: box centre and size divided by
        the mask's own width and height.
    """
    left_edge, right_edge, top_edge, bottom_edge = getEdges(image_bounding)
    # Normalize by the mask's real size instead of repeating the literal 300/600
    # here; for 300x300 masks the result is unchanged.
    img_height, img_width = image_bounding.shape[:2]
    return getMuAndDims(left_edge, right_edge, top_edge, bottom_edge, img_width, img_height)

def getMuAndDims(left_edge, right_edge, top_edge, bottom_edge, img_width, img_height):
    """Convert pixel edges of a box into a normalized centre and size.

    Args:
        left_edge, right_edge: Horizontal pixel positions of the box sides.
        top_edge, bottom_edge: Vertical pixel positions of the box sides.
        img_width, img_height: Pixel size of the image used for normalization.

    Returns:
        Tuple ``(x_mu, y_mu, width, height)`` where the centre is the midpoint
        of the opposing edges and every value is divided by the corresponding
        image dimension.
    """
    span_x = right_edge - left_edge
    span_y = bottom_edge - top_edge
    # Midpoint of two edges, normalized: (a + b) / (2 * image size).
    centre_x = (right_edge + left_edge) / (2 * img_width)
    centre_y = (bottom_edge + top_edge) / (2 * img_height)
    return centre_x, centre_y, span_x / (img_width), span_y / (img_height)

def getEdges(image_bounding):
    """Locate the outermost non-empty columns and rows of a mask.

    A column (row) counts as occupied when the sum of its entries is positive.

    Args:
        image_bounding: 2-D mask array (rows x columns).

    Returns:
        Tuple ``(left_edge, right_edge, top_edge, bottom_edge)``: indices of the
        first/last occupied column and the first/last occupied row.  When no
        column (row) is occupied, the full extent is returned instead:
        ``0`` for left/top and the mask width/height for right/bottom
        (300 for a 300x300 mask).
    """
    mask = np.asarray(image_bounding)
    n_rows, n_cols = mask.shape[:2]

    # One vectorized reduction per axis replaces four Python-level loops over a
    # hard-coded range(300); the mask size is taken from the array itself.
    occupied_cols = np.where(mask.sum(axis=0) > 0)[0]
    occupied_rows = np.where(mask.sum(axis=1) > 0)[0]

    if len(occupied_cols) > 0:
        left_edge, right_edge = occupied_cols[0], occupied_cols[-1]
    else:
        left_edge, right_edge = 0, n_cols
    if len(occupied_rows) > 0:
        top_edge, bottom_edge = occupied_rows[0], occupied_rows[-1]
    else:
        top_edge, bottom_edge = 0, n_rows
    return left_edge, right_edge, top_edge, bottom_edge

In [22]:
# Fraction held out at each of the two splits below
# (test from all samples, then val from what remains).
test_size = 0.2
# Fixed seed so the split and the random mosaics are the same after a kernel restart.
SEED = 42
random.seed(SEED)
# Number of mosaic images generated for each split.
N_AUGMENTED_PER_SPLIT = 300

dataset = np.load(os.path.join(preceding_path, 'data_train.npy'))
labels = np.load(os.path.join(preceding_path, 't_train_corrected.npy'))
# Samples are indexed along the last axis; derive the count instead of hard-coding it.
n_samples = dataset.shape[-1]

# Start from empty train/ and val/ trees so stale files from earlier runs cannot linger.
for split_name in ("train", "val"):
    split_dir = os.path.join(preceding_path, folder_name, split_name)
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(os.path.join(split_dir, "images"))
    os.makedirs(os.path.join(split_dir, "labels"))

train_indices2, test_indices = train_test_split(range(n_samples), test_size=test_size, shuffle=True, random_state=SEED)
train_indices, val_indices = train_test_split(train_indices2, test_size=test_size, shuffle=True, random_state=SEED)

# Held-out test samples are stored in the same layout as the input (samples on the last axis).
# NOTE(review): these two files go to the current working directory, not preceding_path — confirm intended.
np.save('data_test.npy', dataset[:, test_indices])
np.save('t_test.npy', labels[test_indices])
# The generate* helpers read individual images as dataset[:, :, idx].
dataset = dataset.reshape((300, 300, n_samples))

# Set lookups keep the membership tests below O(1); train_indices / val_indices
# stay lists because generateAugmentedData samples from them by position.
train_lookup = set(train_indices)
val_lookup = set(val_indices)

# (raw label, class id) pairs: labels 0-9 keep their value, the raw label -1 becomes class 10.
label_to_class = [(digit, digit) for digit in range(10)] + [(-1, 10)]
for raw_label, class_id in label_to_class:
    for idx in np.where(labels == raw_label)[0]:
        if int(idx) in train_lookup:
            generateImageAndLabel("train", class_id, idx)
        elif int(idx) in val_lookup:
            generateImageAndLabel("val", class_id, idx)

for split_name in ("train", "val"):
    for iteration in range(N_AUGMENTED_PER_SPLIT):
        generateAugmentedData(iteration, split_name)