## Convolutional Neural Network - Data preparation 
> We are going to download the Kaggle `cats` vs `dogs` dataset and then prepare the data for a convolutional neural network that classifies images of cats and dogs.

### Import Helper Libraries

In [1]:
import pandas as pd
import numpy as np
import imageio
import cv2
from tqdm import tqdm
import os
from zipfile import ZipFile

### Extracting the Zip File

In [None]:
# Unpack the outer Kaggle archive first, then the training archive.
# Both are extracted into the current working directory.
#
# Members are extracted one at a time so that tqdm has something to iterate
# over: wrapping `z.extractall()` in tqdm only wraps its return value (None)
# after all the work is already done, so no progress is ever displayed.
for archive_name in ('dogs-vs-cats.zip', 'train.zip'):
    with ZipFile(archive_name, 'r') as archive:
        for member in tqdm(archive.infolist(), desc=f"Extracting {archive_name}"):
            archive.extract(member)
    print("Extracted")

In [None]:
# Unpack the test archive into the current working directory.
# Iterating over the members lets tqdm report real progress; calling
# `tqdm(z.extractall())` would only wrap the None returned once extraction
# has already finished.
with ZipFile('test1.zip', 'r') as archive:
    for member in tqdm(archive.infolist(), desc="Extracting test1.zip"):
        archive.extract(member)

print("Extracted")

### Grouping our images

In [2]:
# Pair every file name with an integer label: 1 when the part of the name
# before the first '.' is 'dog', 0 for every other name.
train_categories = []
test_categories = []

for directory, bucket in (("train", train_categories), ("test1", test_categories)):
    bucket.extend(
        [entry, 1 if entry.split('.')[0] == 'dog' else 0]
        for entry in os.listdir(directory)
    )
print("Done")

Done


In [3]:
# Preview the first ten [filename, label] pairs built from the "train" folder.
train_categories[:10]

[['cat.0.jpg', 0],
 ['cat.1.jpg', 0],
 ['cat.10.jpg', 0],
 ['cat.100.jpg', 0],
 ['cat.1000.jpg', 0],
 ['cat.10000.jpg', 0],
 ['cat.10001.jpg', 0],
 ['cat.10002.jpg', 0],
 ['cat.10003.jpg', 0],
 ['cat.10004.jpg', 0]]

> Now we have our file names. We want to convert these images into numpy arrays and resize them to have the same shape.

In [4]:
from matplotlib import pyplot as plt

In [5]:
# Build the training set as a list of [grayscale 50x50 image, one-hot label]
# pairs. The label is row 0 of a 2x2 identity matrix for 'cat' files and
# row 1 for 'dog' files.
train_data = []
class_index = {'cat': 0, 'dog': 1}

for filename, _ in train_categories:
    # Classify by file-name prefix *before* touching the file, so anything
    # that is neither a cat nor a dog image is skipped instead of being fed
    # to OpenCV (where a non-image file would crash the colour conversion).
    prefix = filename.split('.')[0]
    if prefix not in class_index:
        continue

    image = cv2.imread(os.path.join('train', filename))
    if image is None:
        # cv2.imread returns None rather than raising when it cannot decode a file.
        print(f"Skipping unreadable image: {filename}")
        continue

    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Scale the image down so every sample has the same 50x50 shape.
    image = cv2.resize(image, (50, 50))

    train_data.append([image, np.eye(2)[class_index[prefix]]])

In [6]:
### Shuffle the data
# Every sample is an [image, label] pair whose two members have different
# shapes, so the array has to hold Python objects. Asking for dtype=object
# explicitly keeps this working on recent NumPy releases, which refuse to
# build such "ragged" arrays implicitly.
train_data_shuffled = np.array(train_data, dtype=object)
# Shuffles the rows (samples) in place.
np.random.shuffle(train_data_shuffled)

In [11]:
# Look at the first shuffled sample (a one-row slice of the object array).
train_data_shuffled[:1]

array([[array([[227, 225, 232, ..., 196, 196, 197],
       [231, 235, 227, ..., 196, 196, 196],
       [226, 220, 219, ..., 196, 196, 196],
       ...,
       [160, 152, 150, ...,  64,  40,  44],
       [155, 156, 158, ...,  34,  36,  46],
       [159, 162, 156, ...,  31,  44,  54]], dtype=uint8),
        array([1., 0.])]], dtype=object)

#### Saving the training data as a NumPy array

In [13]:
# Write the shuffled training samples to disk in NumPy's .npy format.
np.save(file='dog_cats_train.npy', arr=train_data_shuffled)
print("Saved!!")

Saved!!


In [15]:
# Read the file back to check that it round-trips.
# allow_pickle=True is required because the saved array has dtype=object.
# NOTE: unpickling can execute arbitrary code, so only load .npy files that
# come from a trusted source (here, the file written by this notebook).
data = np.load('dog_cats_train.npy', allow_pickle=True)
data[:1]

array([[array([[227, 225, 232, ..., 196, 196, 197],
       [231, 235, 227, ..., 196, 196, 196],
       [226, 220, 219, ..., 196, 196, 196],
       ...,
       [160, 152, 150, ...,  64,  40,  44],
       [155, 156, 158, ...,  34,  36,  46],
       [159, 162, 156, ...,  31,  44,  54]], dtype=uint8),
        array([1., 0.])]], dtype=object)

In [10]:
# Build the test set as a list of [grayscale 50x50 image, one-hot label]
# pairs, using the same convention as the training set: identity row 0 for
# 'cat' files and row 1 for 'dog' files.
test_data = []
class_index = {'cat': 0, 'dog': 1}

for filename, _ in test_categories:
    # Classify by file-name prefix before reading the file, so files that are
    # neither 'cat.*' nor 'dog.*' are skipped without being decoded.
    prefix = filename.split('.')[0]
    if prefix not in class_index:
        continue

    image = cv2.imread(os.path.join('test1', filename))
    if image is None:
        # cv2.imread returns None rather than raising when it cannot decode a file.
        print(f"Skipping unreadable image: {filename}")
        continue

    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.resize(image, (50, 50))

    test_data.append([image, np.eye(2)[class_index[prefix]]])

# NOTE(review): only files named 'cat.*' or 'dog.*' are kept. The Kaggle
# test archive presumably uses bare numeric names (e.g. '1.jpg') - confirm -
# in which case nothing matches and the saved test set would be empty.
if not test_data:
    print("Warning: no 'cat.*' or 'dog.*' files found in 'test1'; test_data is empty.")

In [16]:
### Shuffle the data
# Every sample is an [image, label] pair whose two members have different
# shapes, so the array has to hold Python objects. Asking for dtype=object
# explicitly keeps this working on recent NumPy releases, which refuse to
# build such "ragged" arrays implicitly.
test_data_shuffled = np.array(test_data, dtype=object)
# Shuffles the rows (samples) in place.
np.random.shuffle(test_data_shuffled)

In [17]:
# Write the shuffled test samples to disk in NumPy's .npy format.
np.save(file='dog_cats_test.npy', arr=test_data_shuffled)
print("Saved!!")

Saved!!
