<b>Data pre-processing: removal of duplicate images, SMOTE balancing, and formatting images to 8x8x3</b>

In this code we compute a hash from each image's horizontal gradient, use these hashes to find duplicate images, and remove them as part of pre-processing.

[Reference Link](https://pyimagesearch.com/2017/11/27/image-hashing-opencv-python/)

In [1]:
#############  Remove Duplicates; Perform SMOTE to balance; Convert back to 8x8x3 images format; Save entire thing             


import pickle
import numpy as np
from imblearn.over_sampling import SMOTE
from numpy import load, savez_compressed
import gc
import cv2


def unpickle(file):
    """Load and return the object stored in the pickle file at path *file*.

    The file is opened in binary mode and decoded with ``encoding="latin1"``.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files from a trusted source.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding="latin1")

# Load, in order: the raw training images, the raw training labels, and the
# raw validation images. Each file is read through unpickle().
data_FULL_training, Label_FULL_training, data_Validation = (
    unpickle(path)
    for path in (
        r'../input/trimagi/training_x.dat',
        r'../input/trimagi/training_y.dat',
        r'../input/trimagi/validation_x.dat',
    )
)




def dhash(image, hashSize=8):
    """Return an integer difference hash of *image* based on its horizontal gradient.

    The image is resized to ``hashSize`` rows by ``hashSize + 1`` columns; each
    pixel is then compared with its left-hand neighbour, and every "brighter
    than the neighbour" comparison contributes one bit to the returned integer.

    NOTE: no grayscale conversion happens here -- the array is hashed as
    given, so callers are expected to pass the channel(s) they want compared.
    """
    # cv2.resize takes (width, height); the extra column gives hashSize
    # adjacent-column comparisons per row.
    shrunk = cv2.resize(image, (hashSize + 1, hashSize))

    left_neighbours = shrunk[:, :-1]
    right_pixels = shrunk[:, 1:]
    brighter = (right_pixels > left_neighbours).flatten()

    # Bit i of the hash is set when the i-th comparison (row-major order) is true.
    return sum(1 << position for position, is_set in enumerate(brighter) if is_set)

def Remove_Dupes(X_Data, Y_Data):
    """Drop every image whose difference hash collides with another image's hash.

    Parameters
    ----------
    X_Data : numpy.ndarray
        Images stacked along axis 0; the result is reshaped to
        ``(n_kept, X_Data.shape[1], X_Data.shape[2])``.
    Y_Data : numpy.ndarray
        Labels aligned index-for-index with ``X_Data``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(cleanedX, cleanedY)`` containing only the images (and their labels)
        whose hash is unique in ``X_Data``, in their original order.

    Notes
    -----
    When several images share a hash, ALL of them are discarded -- no
    representative copy is kept. This matches the behaviour the rest of the
    pipeline was built on.
    """
    # Group image indices by hash value.
    hashes = {}
    for index, image in enumerate(X_Data):
        hashes.setdefault(dhash(image), []).append(index)

    # Keep only indices whose hash was seen exactly once. Dict insertion order
    # means these come out in ascending index order.
    keep = np.array(
        [indices[0] for indices in hashes.values() if len(indices) == 1],
        dtype=np.intp,  # explicit dtype so an empty selection is still a valid index
    )

    # One vectorised selection instead of stacking one tiny array per image;
    # this also preserves the input dtype when nothing is kept.
    cleanedX = X_Data[keep]
    cleanedY = Y_Data[keep]

    cleanedY = np.reshape(cleanedY, (cleanedY.shape[0]))
    cleanedX = np.reshape(cleanedX, (cleanedX.shape[0], X_Data.shape[1], X_Data.shape[2]))
    return cleanedX, cleanedY

    
def DATA_Processing(data_list, class_list, validation_list, Remove_dupes=0):
    """Filter malformed images, optionally drop duplicates, and flatten channel 0.

    Parameters
    ----------
    data_list : sequence
        Training images; every entry must expose ``.shape``. Entries whose
        shape is not exactly ``(8, 8, 3)`` are discarded together with their
        label.
    class_list : sequence
        Labels aligned index-for-index with ``data_list``.
    validation_list : array-like
        Validation images. They are converted to a NumPy array and returned
        otherwise untouched (not filtered, not flattened).
    Remove_dupes : int, optional
        When equal to 1, the channel-0 images are passed through
        ``Remove_Dupes`` before flattening. Defaults to 0 (no de-duplication).

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, labels, validation)`` where ``data`` has shape ``(n, 64)``
        (channel 0 of each kept image, flattened row-major).

    Raises
    ------
    ValueError
        If ``data_list`` and ``class_list`` differ in length.

    Notes
    -----
    The inputs are not modified. Earlier revisions popped rejected entries
    from the caller's lists by ascending index, which shifted later positions
    (removing the wrong items) and never examined index 0.
    """
    if len(data_list) != len(class_list):
        raise ValueError(
            "data_list and class_list must have the same length "
            f"(got {len(data_list)} and {len(class_list)})"
        )

    expected_shape = (8, 8, 3)

    # Single pass: keep well-formed images with their labels, report the rest.
    kept_images = []
    kept_labels = []
    for index, (image, label) in enumerate(zip(data_list, class_list)):
        # Comparing the whole shape tuple also rejects items with the wrong
        # number of dimensions instead of raising IndexError on shape[2].
        if tuple(image.shape) != expected_shape:
            print("Removing value at index", index)
            continue
        kept_images.append(image)
        kept_labels.append(label)

    # The reshape is a no-op for non-empty input; it gives an empty result a
    # proper (0, 8, 8, 3) shape so the indexing below still works.
    data_list = np.array(kept_images).reshape((-1,) + expected_shape)
    class_list = np.array(kept_labels)
    validation_list = np.array(validation_list)

    # Keep only the first channel: (n, 8, 8, 3) -> (n, 8, 8).
    data_list = data_list[:, :, :, 0]

    if Remove_dupes == 1:
        # Discards images whose hash is shared with another image.
        data_list, class_list = Remove_Dupes(data_list, class_list)

    # Flatten each 8x8 image into a 64-value row vector.
    data_list = data_list.reshape(data_list.shape[0], data_list.shape[1] * data_list.shape[2])

    # Validation images are intentionally left in their original layout.
    return data_list, class_list, validation_list

**Process data, apply SMOTE, and reshape the data by adding the other two channels back**

In [3]:
# --- Pre-process: training images become 64-value row vectors (with
# --- de-duplication enabled); validation images come back as an array
# --- without being flattened.
X_original, Y_original, VAL_original = DATA_Processing(
    data_FULL_training,
    Label_FULL_training,
    data_Validation,
    1,
)
print("Training Dataset's Dimensions:" ,X_original.shape,"class Dim: ", Y_original.shape)
Y_original = np.array(Y_original).astype("uint8")

# --- Balance the classes by running SMOTE on the flattened 64-value vectors.
smote = SMOTE(sampling_strategy='auto')
X_original, Y_original = smote.fit_resample(X_original, Y_original)

print("Training Dataset's Dimensions After SMOTE:" ,X_original.shape,"class Dim: ", Y_original.shape)

# --- Back to image form: (n, 64) -> (n, 8, 8), then replicate the single
# --- channel three times along a new last axis to obtain (n, 8, 8, 3).
X_original = X_original.reshape(len(X_original), 8, 8)
X_original = np.repeat(X_original[..., np.newaxis], 3, axis=-1)
print("Training Dataset's Dimensions After Formatting:" ,X_original.shape,"class Dim: ", Y_original.shape)
print()

gc.collect()

Training Dataset's Dimensions: (1266339, 64) class Dim:  (1266339,)
Training Dataset's Dimensions After SMOTE: (1329152, 64) class Dim:  (1329152,)
Training Dataset's Dimensions After Formatting: (1329152, 8, 8, 3) class Dim:  (1329152,)



0

**Save Compressed Numpy arrays**

In [None]:
## Persist the processed arrays, one compressed .npz archive per array.
## Each array is passed positionally to savez_compressed.
np.savez_compressed('Processed_Y.npz', Y_original)      # training labels (duplicates removed, SMOTE applied, cast to uint8)
np.savez_compressed('Processed_X.npz', X_original)      # training images (duplicates removed, SMOTE applied, 8x8x3)
np.savez_compressed('Processed_Val.npz', VAL_original)  # validation images as returned by DATA_Processing