# Load the data and check whether the classes are balanced

In [1]:
%matplotlib inline
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import pickle

In [19]:
# Directories (relative to the notebook) holding the labelled and unlabelled pictures
train_dir, test_dir = 'train', 'test1'

In [20]:
# File names found in each directory, in whatever order os.listdir returns them
train_pics, test_pics = os.listdir(train_dir), os.listdir(test_dir)

In [4]:
# Total number of entries found in the training directory
len(train_pics)

25000

In [5]:
# Per-class counters for the training pictures, both starting at zero
dogs_train = cats_train = 0

In [6]:
# Count the pictures per class from the file names.
# The totals are recomputed from scratch so that re-running this cell
# cannot add onto counts left over from a previous execution.
# A name containing 'dog' counts as a dog; otherwise one containing 'cat' counts as a cat.
dogs_train = sum('dog' in pic for pic in train_pics)
cats_train = sum('dog' not in pic and 'cat' in pic for pic in train_pics)

In [7]:
# Show both class counts side by side to check that the training set is balanced
print(f'Dogs: {dogs_train}; Cats: {cats_train}')

Dogs: 12500; Cats: 12500


# Preprocess the data

Labels: Dogs = 0; Cats = 1

In [8]:
def preprocess_train(folder, size):
    """Load the labelled pictures in `folder` and split them into train/test arrays.

    Every file is read with OpenCV, resized to `size` x `size` pixels,
    divided by 255 and stored as float32. The label comes from the file
    name: 1 if it contains 'cat', otherwise 0 if it contains 'dog'.

    Files whose name contains neither 'cat' nor 'dog', and files OpenCV
    cannot decode, are skipped and reported, so features and labels always
    stay aligned.

    Parameters
    ----------
    folder : str
        Directory containing the training pictures.
    size : int
        Edge length in pixels of the square images that are produced.

    Returns
    -------
    X_train_array, X_test_array, y_train_array, y_test_array : np.ndarray
        70/30 split (random_state=42, shuffled) of the scaled images and
        their labels.
    """
    # os.listdir order is arbitrary; sorting makes the seeded split reproducible
    pictures = sorted(os.listdir(folder))

    features = []
    labels = []
    skipped = []

    for pic in pictures:
        # Decide the label first so a file without one never adds a feature
        if 'cat' in pic:
            label = 1
        elif 'dog' in pic:
            label = 0
        else:
            skipped.append(pic)
            continue

        img = cv2.imread(os.path.join(folder, pic))
        if img is None:  # cv2.imread signals an unreadable file with None, not an exception
            skipped.append(pic)
            continue

        img_resized = cv2.resize(img, (size, size))  # resize the image to the same size
        img_scaled = (np.asarray(img_resized) / 255.0).astype('float32')

        # Store the scaled image (the unscaled array was appended before, so
        # the scaling step had no effect on the returned data)
        features.append(img_scaled)
        labels.append(label)

    if skipped:
        print(f'Skipped {len(skipped)} unusable file(s), e.g. {skipped[:10]}')

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42, shuffle=True
    )

    return np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test)

In [9]:
# Build the train/test arrays from the labelled pictures, resized to 200x200 pixels.
# Note: X_test / y_test here are a 30% hold-out of the *training* folder, not the pictures in test_dir.
X_train, X_test, y_train, y_test = preprocess_train(folder=train_dir, size=200)

In [21]:
def preprocess_test(folder, size):
    """Load the unlabelled pictures in `folder` into a single float32 array.

    Every file is read with OpenCV, resized to `size` x `size` pixels,
    divided by 255 and stored as float32. Rows of the result follow the
    order returned by os.listdir(folder).

    Parameters
    ----------
    folder : str
        Directory containing the pictures.
    size : int
        Edge length in pixels of the square images that are produced.

    Returns
    -------
    np.ndarray
        The scaled images stacked along the first axis.

    Raises
    ------
    ValueError
        If a file in `folder` cannot be decoded as an image. The file is not
        skipped silently, because dropping a row would shift every later row
        relative to the directory listing.
    """
    features = []

    for pic in os.listdir(folder):
        img_path = os.path.join(folder, pic)
        img = cv2.imread(img_path)
        if img is None:  # cv2.imread signals an unreadable file with None, not an exception
            raise ValueError(f'Could not read image file: {img_path}')

        img_resized = cv2.resize(img, (size, size))  # resize the image to the same size
        img_scaled = (np.asarray(img_resized) / 255.0).astype('float32')

        # Store the scaled image (the unscaled array was appended before, so
        # the scaling step had no effect on the returned data)
        features.append(img_scaled)

    return np.array(features)

In [22]:
# Load the unlabelled pictures from test_dir, resized to 200x200 pixels like the training data
testdata = preprocess_test(folder=test_dir, size=200)

# Save the data

In [16]:
def save_train(path_train_scaled='/home/olli/Projects/Exercises/Dogs_Cats/Train_Scaled',
               overwrite=False):
    """Pickle the train/test split into `path_train_scaled`.

    Reads the notebook-level variables X_train, X_test, y_train and y_test
    and writes each one to '<name>.pickle' inside the target directory. The
    directory (including missing parents) is created when necessary. At the
    end the directory listing is printed.

    Parameters
    ----------
    path_train_scaled : str, optional
        Target directory. Defaults to the location used originally.
    overwrite : bool, optional
        When False (default) a file that already exists is left untouched,
        even if the in-memory data has changed since it was written. Pass
        True to replace existing files.
    """
    os.makedirs(path_train_scaled, exist_ok=True)

    datasets = [
        ('X_train.pickle', X_train),
        ('X_test.pickle', X_test),
        ('y_train.pickle', y_train),
        ('y_test.pickle', y_test),
    ]

    for filename, data in datasets:
        path = os.path.join(path_train_scaled, filename)

        if os.path.exists(path) and not overwrite:  # data already there
            continue

        with open(path, 'wb') as f:
            # Protocol 4 can serialise objects larger than 4 GiB, which the
            # image arrays can reach; older default protocols cannot.
            pickle.dump(data, f, protocol=4)

    print(os.listdir(path_train_scaled))

In [17]:
# Write the train/test split to disk and print the resulting directory listing
save_train()

['X_test', 'y_test', 'X_train', 'y_train']


In [36]:
def save_test(path_test_scaled='/home/olli/Projects/Exercises/Dogs_Cats/Test_Scaled',
              overwrite=False):
    """Pickle the notebook-level `testdata` array into `path_test_scaled`.

    The directory (including missing parents) is created when necessary and
    the data is written to 'testdata.pickle' inside it. At the end the
    directory listing is printed.

    Previously the file was written only when the directory had just been
    created, so an already existing (e.g. empty) directory meant nothing was
    saved. The check is now made on the pickle file itself.

    Parameters
    ----------
    path_test_scaled : str, optional
        Target directory. Defaults to the location used originally.
    overwrite : bool, optional
        When False (default) an existing 'testdata.pickle' is left untouched.
        Pass True to replace it.
    """
    os.makedirs(path_test_scaled, exist_ok=True)

    test_path = os.path.join(path_test_scaled, 'testdata.pickle')

    if overwrite or not os.path.exists(test_path):
        with open(test_path, 'wb') as f:
            # Protocol 4 can serialise objects larger than 4 GiB, which the
            # image array can reach; older default protocols cannot.
            pickle.dump(testdata, f, protocol=4)

    print(os.listdir(path_test_scaled))

In [37]:
# Write the preprocessed test pictures to disk and print the resulting directory listing
save_test()

['testdata.pickle']
