In [1]:
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt

# Root folder of the dataset; it is joined with each category name to find the images.
kaggle_dir = "kaggle dogs and cats"
# Category sub-folder names. A category's position in this list is used as its
# integer label, so "dog" is 0 and "cat" is 1.
kaggle_categories = ["dog", "cat"] 
# Side length of the square images produced by img_resize_square.
img_dimension = 80

In [2]:
def img_resize_square(resizing_array, dimension):
    """Center-crop an image to a square and resize it to dimension x dimension.

    Parameters
    ----------
    resizing_array : numpy.ndarray
        Image pixels with height on axis 0 and width on axis 1. Any further
        axes (e.g. colour channels) are left untouched by the crop.
    dimension : int
        Side length handed to ``cv2.resize`` for both output axes.

    Returns
    -------
    numpy.ndarray
        The result of ``cv2.resize`` applied to the centered square crop.

    Raises
    ------
    ValueError
        If ``resizing_array`` is ``None``, which is what ``cv2.imread``
        returns for a file it cannot read.
    """
    if resizing_array is None:
        raise ValueError("img_resize_square received None instead of an image array")

    # Only height and width take part in the crop. Taking the minimum over the
    # full shape would pick up the channel count (3) of a colour image and
    # shrink the crop to a 3x3 patch.
    height, width = resizing_array.shape[:2]
    smallest_dimension = min(height, width)

    # origin is at top left
    # find top left coordinate of the centered square
    height_start = (height - smallest_dimension) // 2
    width_start = (width - smallest_dimension) // 2

    # center crop the image
    cropped_array = resizing_array[
        height_start:height_start + smallest_dimension,
        width_start:width_start + smallest_dimension,
    ]

    # resize the image to the requested dimensions
    return cv2.resize(cropped_array, (dimension, dimension))

In [3]:
training_data = []

def create_training_data(data_dir, categories, dimension=None):
    """Load every image under ``data_dir/<category>`` into ``training_data``.

    Each readable file is loaded as a grayscale image, passed through
    ``img_resize_square`` and appended to the module-level ``training_data``
    list as ``[image_array, label]``, where ``label`` is the position of the
    category in ``categories``.

    Parameters
    ----------
    data_dir : str
        Folder that contains one sub-folder per category.
    categories : sequence of str
        Sub-folder names; the index of each name is used as its label.
    dimension : int, optional
        Side length passed to ``img_resize_square``. Defaults to the
        notebook-level ``img_dimension``.

    Notes
    -----
    Files that cannot be read or resized are skipped (best effort), but they
    are counted and reported in a one-line summary instead of being ignored
    silently.
    """
    if dimension is None:
        dimension = img_dimension

    skipped = []

    # goes through each category; its position in the list is the image label
    for img_label, category in enumerate(categories):
        # path to each category
        path = os.path.join(data_dir, category)

        # sorted() keeps the load order independent of the directory listing order
        for img in sorted(os.listdir(path)):
            img_path = os.path.join(path, img)

            try:
                # store the image pixel values into an array as a grayscale image
                img_array = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                # cv2.imread reports an unreadable file by returning None, not by raising
                if img_array is None:
                    raise ValueError("cv2.imread could not read the file")
                # resize the image to the desired dimensions
                img_array = img_resize_square(img_array, dimension)
            except Exception as e:
                # keep going, but remember what was dropped and why
                skipped.append((img_path, e))
                continue

            # add the image array and label to the training data list
            training_data.append([img_array, img_label])

    if skipped:
        first_path, first_error = skipped[0]
        print("Skipped {} file(s) that could not be loaded; first: {} ({})".format(
            len(skipped), first_path, first_error))

create_training_data(kaggle_dir, kaggle_categories)

In [4]:
# sanity check: number of [image, label] samples collected in training_data
print(len(training_data))

24946


In [5]:
import random 

# Fixed seed so that, given the same load order, a fresh run produces the same shuffle.
shuffle_seed = 42

# Shuffle in place so the categories, which were loaded one after the other,
# are interleaved. A dedicated generator is used so the global `random` state
# is left untouched.
random.Random(shuffle_seed).shuffle(training_data)

In [6]:
# peek at the labels of the first ten samples to confirm the categories are mixed
for _image, label in training_data[:10]:
    print(label)
    

0
0
1
1
0
0
1
0
0
1


In [7]:
# separate every [image, label] pair of the training data into two parallel lists
x = [sample[0] for sample in training_data]
y = [sample[1] for sample in training_data]

# stack the images into one array and append an explicit single-channel axis
training_images = np.asarray(x).reshape((-1, img_dimension, img_dimension, 1))
# the labels stay a plain Python list
training_labels = y

In [8]:
import pickle

# saving training image data using pickle
# (the with-block closes the file even if pickle.dump raises)
with open("training_images.pickle", "wb") as pickle_out:
    pickle.dump(training_images, pickle_out)

# saving training label data using pickle
with open("training_labels.pickle", "wb") as pickle_out:
    pickle.dump(training_labels, pickle_out)

In [9]:
# load the data and display a sample
pickle_in = open("training_labels.pickle", "rb")
loaded_training_labels = pickle.load(pickle_in)

loaded_training_labels[1]

0