In [28]:
import numpy as np
import random
import cv2
from tqdm import tqdm
import os
from sklearn.model_selection import train_test_split

NUM_OF_IMAGES = 12500
IMG_SIZE = 50
BASE_DIR = "./PetImages"

def label_image(type):
    """Return the two-element label list for an image class name.

    ``"Dog"`` maps to ``[0, 1]`` and ``"Cat"`` maps to ``[1, 0]``. A new
    list is built on every call, so callers may mutate the result freely.

    Raises:
        Exception: If ``type`` is neither ``"Dog"`` nor ``"Cat"``.
    """
    # Checked in order; tuples keep the table itself immutable.
    known_labels = (("Dog", (0, 1)), ("Cat", (1, 0)))
    for class_name, one_hot in known_labels:
        if type == class_name:
            return list(one_hot)
    raise Exception("Unknown Type")

def process_image(path):
    """Read the image at *path* and resize it to IMG_SIZE x IMG_SIZE.

    Args:
        path: Filesystem path of the image file to load.

    Returns:
        The resized image produced by ``cv2.resize``, or ``None`` when the
        file cannot be read or OpenCV fails while decoding/resizing it.
    """
    try:
        image = cv2.imread(path)
        # cv2.imread reports a missing or undecodable file by returning
        # None instead of raising, so test for it explicitly rather than
        # waiting for cv2.resize to fail on a None input.
        if image is None:
            return None
        return cv2.resize(image, (IMG_SIZE, IMG_SIZE))
    except cv2.error:
        # Only OpenCV failures mean "skip this image"; any other exception
        # is a programming error and is allowed to propagate.
        return None

def process_data(img_nums, type, should_label):
    """Load the readable images of one class and pair each with a tag.

    Args:
        img_nums: Iterable of image numbers; number ``n`` refers to the
            file ``n.jpg`` inside ``BASE_DIR/<type>``.
        type: Name of the class sub-directory; also passed to
            ``label_image`` when labelling is requested.
        should_label: When truthy, each image is paired with
            ``label_image(type)``; otherwise with its image number.

    Returns:
        A randomly ordered list of ``[image_array, tag]`` pairs. Images
        for which ``process_image`` returns ``None`` are left out.
    """
    folder = f'{BASE_DIR}/{type}'
    samples = []
    for image_id in tqdm(img_nums):
        pixels = process_image(os.path.join(folder, f'{image_id}.jpg'))
        if pixels is None:
            # Unreadable image: skip it.
            continue
        tag = label_image(type) if should_label else image_id
        samples.append([np.array(pixels), tag])
    random.shuffle(samples)
    return samples

# Build the image numbers 0 .. NUM_OF_IMAGES-1. The same range is used for
# both classes (assumes each class directory holds that many numbered
# .jpg files - TODO confirm).
indices = np.arange(NUM_OF_IMAGES)
# Hold out 1% of the cat image numbers for testing; the fixed random_state
# makes the split reproducible.
train_cats_indices, test_cats_indices = train_test_split(indices, test_size=0.01, random_state=42)
# Split the dog image numbers with identical arguments, so the dog split
# is expected to select the same image numbers as the cat split.
train_dogs_indices, test_dogs_indices = train_test_split(indices, test_size=0.01, random_state=42)

# Process training data: each entry is [image, label_image(<class>)]
cats_train_data = process_data(train_cats_indices, "Cat", True)
dogs_train_data = process_data(train_dogs_indices, "Dog", True)

# Convert to numpy object arrays (entries mix image arrays and label lists)
cats_train_data = np.asarray(cats_train_data, dtype="object")
dogs_train_data = np.asarray(dogs_train_data, dtype="object")

# Combine both classes and shuffle the train data in place
train_data = np.concatenate((cats_train_data, dogs_train_data), axis=0)
np.random.shuffle(train_data)

# Save train data
# NOTE: object arrays are stored via pickle, so reading the file back
# requires np.load(..., allow_pickle=True).
np.save('train_data.npy', train_data)

# Process testing data: each entry is [image, image number], no label.
# NOTE(review): cats and dogs are built from the same split arguments, so
# an image number alone does not identify the class - confirm this is
# intended.
cats_test_data = process_data(test_cats_indices, "Cat", False)
dogs_test_data = process_data(test_dogs_indices, "Dog", False)

# Convert to numpy object arrays
cats_test_data = np.asarray(cats_test_data, dtype="object")
dogs_test_data = np.asarray(dogs_test_data, dtype="object")

# Combine both classes and shuffle the test data in place
test_data = np.concatenate((cats_test_data, dogs_test_data), axis=0)
np.random.shuffle(test_data)

# Save test data (same pickle caveat as for the train file)
np.save('test_data.npy', test_data)

 12%|████████████████▏                                                                                                                        | 1461/12375 [00:01<00:09, 1209.24it/s]Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
 13%|█████████████████▌                                                                                                                       | 1589/12375 [00:01<00:08, 1226.53it/s]Corrupt JPEG data: 214 extraneous bytes before marker 0xd9
 15%|████████████████████▎                                                                                                                    | 1835/12375 [00:01<00:08, 1221.73it/s]Corrupt JPEG data: 1153 extraneous bytes before marker 0xd9
 21%|████████████████████████████▊                                                                                                            | 2604/12375 [00:02<00:07, 1263.25it/s]Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
 62%|███████████████████████████████████