# Dogs vs. Cats analysis with Convolutional Neural Network

#### The Dogs/Cats set is separated into a training set and a test set. The training set is used here to build the CNN.

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator,load_img
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, Activation, BatchNormalization

#### We first define the image properties (size and RGB color channels).

In [2]:
# Image geometry used throughout the notebook: 120x120 pixels, 3 colour channels (RGB).
img_height, img_width = 120, 120
channels = 3
img_size = (img_height, img_width)  # (height, width), passed as target_size to the generators

#### Build a data frame of the training image file names, labelling each one as cat (0) or dog (1) from its file-name prefix.

In [3]:
def create_img_df(train_dir, img_height=img_height, img_width=img_width):
    """Build a DataFrame of image file names and their class labels.

    The class is read from the file-name prefix (the text before the first
    '.'): 'cat' -> 0 and 'dog' -> 1. Entries with any other prefix are
    skipped instead of being labelled as dogs, which is also how
    ``create_img_df_trunc`` treats them.

    Parameters
    ----------
    train_dir : iterable of str
        File names to label.
    img_height, img_width : int
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns 'files' (file name) and 'label' (0 for cat, 1 for dog).
    """
    label_by_prefix = {'cat': 0, 'dog': 1}
    files = []
    labels = []
    for img_path in train_dir:
        label = label_by_prefix.get(img_path.split('.')[0])
        if label is None:
            # Neither a cat nor a dog image (e.g. a hidden or system file).
            continue
        files.append(img_path)
        labels.append(label)
    return pd.DataFrame({'files': files, 'label': labels})

#### Keep only the first 1000 cat and first 1000 dog images to make training faster.

In [4]:
def create_img_df_trunc(train_dir, img_height=img_height, img_width=img_width,
                        max_per_class=1000):
    """Build a labelled DataFrame from at most ``max_per_class`` images per class.

    File names are taken in the order they appear in ``train_dir``. The class
    is read from the file-name prefix (the text before the first '.'):
    'cat' -> 0 and 'dog' -> 1. Entries with any other prefix are ignored.

    Parameters
    ----------
    train_dir : iterable of str
        File names to label.
    img_height, img_width : int
        Not used by this function; kept so existing calls keep working.
    max_per_class : int, default 1000
        Maximum number of cat files and of dog files to keep.

    Returns
    -------
    pandas.DataFrame
        Columns 'files' (file name) and 'label' (0 for cat, 1 for dog).
    """
    label_by_prefix = {'cat': 0, 'dog': 1}
    counts = {prefix: 0 for prefix in label_by_prefix}
    kept_files = []
    labels = []
    for img_path in train_dir:
        # Stop scanning as soon as both classes are full; there is nothing
        # left to collect from the remaining entries.
        if all(count >= max_per_class for count in counts.values()):
            break
        prefix = img_path.split('.')[0]
        if prefix not in counts or counts[prefix] >= max_per_class:
            continue
        kept_files.append(img_path)
        labels.append(label_by_prefix[prefix])
        counts[prefix] += 1
    return pd.DataFrame({'files': kept_files, 'label': labels})

#### Both methods were tried and their accuracy compared: `create_img_df` (all images) is more accurate, but training takes far longer.

In [5]:
# os.listdir() returns entries in arbitrary order. Sorting the names makes the
# truncated subset (the first files found per class) the same on every run.
train_dir = sorted(os.listdir('./train'))
img_df = create_img_df_trunc(train_dir)
print(img_df)

              files  label
0         cat.0.jpg      0
1         cat.1.jpg      0
2        cat.10.jpg      0
3       cat.100.jpg      0
4      cat.1000.jpg      0
...             ...    ...
1995  dog.10893.jpg      1
1996  dog.10894.jpg      1
1997  dog.10895.jpg      1
1998  dog.10896.jpg      1
1999  dog.10897.jpg      1

[2000 rows x 2 columns]


#### Now, we construct the CNN model and compile it. It consists of 3 convolutional layers that extract features from an image.

In [6]:
# Each image belongs to one of two classes: cat or dog.
num_classes = 2
droupout = 0.35  # dropout rate applied after each block

cat_dog_model = Sequential()

# Three feature-extraction blocks: Conv2D -> BatchNorm -> MaxPooling -> Dropout.
for block_index, n_filters in enumerate((32, 64, 64)):
    # Only the first layer declares the input shape. Keras' default image
    # layout is (height, width, channels), the same order as `img_size`,
    # so the height comes first.
    first_layer_kwargs = (
        {'input_shape': (img_height, img_width, channels)} if block_index == 0 else {}
    )
    cat_dog_model.add(Conv2D(n_filters, kernel_size=(3, 3), activation='relu',
                             **first_layer_kwargs))
    cat_dog_model.add(BatchNormalization())
    cat_dog_model.add(MaxPooling2D(pool_size=(2, 2)))
    cat_dog_model.add(Dropout(droupout))

# Classification head: flatten the feature maps, one hidden dense layer, then
# a softmax over the classes.
cat_dog_model.add(Flatten())
cat_dog_model.add(Dense(256, activation='relu'))
cat_dog_model.add(BatchNormalization())
cat_dog_model.add(Dropout(droupout))
cat_dog_model.add(Dense(num_classes, activation='softmax'))

# categorical_crossentropy pairs with the softmax output and the one-hot labels
# produced by generators created with class_mode='categorical'.
cat_dog_model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

#### Set up callbacks: stop training early and reduce the learning rate when the validation metrics stop improving, which helps limit overfitting.

In [7]:
# Stop training once the validation loss (EarlyStopping's default monitored
# quantity) has gone 5 consecutive epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Halve the learning rate whenever the validation accuracy has not improved
# for 2 consecutive epochs, but never lower it below 1e-4.
LR_reduction = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=2, min_lr=1e-4)

callbacks = [early_stopping, LR_reduction]

#### Split data frame into training and validation data and prepare data generators.

In [8]:
# class_mode='categorical' expects string class names, so map the numeric
# labels to names. replace() leaves already-converted labels untouched, so
# re-running this cell is harmless.
img_df['label'] = img_df['label'].replace({0:'cat', 1:'dog'})

# 80/20 split, stratified so both subsets keep the same cat/dog ratio.
train_df, valid_df = train_test_split(
    img_df,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
    stratify=img_df['label']
)

batch_size = 32

# Arguments shared by the training and validation iterators.
flow_options = dict(
    directory='./train',
    x_col='files', y_col='label',
    target_size=img_size,
    class_mode='categorical',
    batch_size=batch_size
)

# Training images are rescaled to [0, 1] and randomly augmented.
train_data_generator = ImageDataGenerator(
    rotation_range=42,
    rescale=1./255,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1,
    fill_mode='nearest'
)
train_generator = train_data_generator.flow_from_dataframe(train_df, **flow_options)

# Validation images are only rescaled. flow_from_dataframe shuffles by default;
# a fixed order keeps the evaluated batches identical from epoch to epoch.
valid_data_generator = ImageDataGenerator(rescale=1./255)
validation_generator = valid_data_generator.flow_from_dataframe(
    valid_df, shuffle=False, **flow_options
)

total_train = train_df.shape[0]
total_valid = valid_df.shape[0]

Found 1600 validated image filenames belonging to 2 classes.
Found 400 validated image filenames belonging to 2 classes.


#### Fit the model.

In [9]:
# len(generator) is the number of batches needed to cover the data once,
# including a final partial batch. Floor-dividing the sample count by the batch
# size would skip that partial batch (e.g. 400 // 32 = 12 batches covers only
# 384 of 400 validation images).
hist = cat_dog_model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=8,
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    callbacks=callbacks
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


#### Saving the model weights.

In [10]:
# Write the trained weights to an .h5 file.
weights_path = 'cat_dog_model_8epoch_full.h5'
cat_dog_model.save_weights(weights_path)

In [11]:
# os.listdir() returns entries in arbitrary order; sorting gives test_df a
# stable row order across runs.
test_dir = sorted(os.listdir('./test1'))
test_df = pd.DataFrame({'files': test_dir})
num_samples = len(test_df)

In [12]:
# Test images get the same rescaling as the training data and no labels
# (class_mode=None), so the iterator yields image batches only.
test_data_generator = ImageDataGenerator(rescale=1./255)
test_generator = test_data_generator.flow_from_dataframe(
    test_df,
    directory='./test1',
    x_col='files', y_col=None,
    target_size=img_size,
    class_mode=None,
    batch_size=batch_size,
    # flow_from_dataframe shuffles by default. Predictions are later written
    # back to test_df row by row, so the batches must follow test_df's order.
    shuffle=False
)

Found 12500 validated image filenames.


#### Predict class probabilities for the test images and write the submission file.

In [13]:
# np.ceil returns a float; cast to int so `steps` is an integer batch count
# that covers every test image, including the final partial batch.
predict = cat_dog_model.predict(test_generator, steps=int(np.ceil(num_samples / batch_size)))

In [17]:
# Index of the highest-scoring class for each image.
# NOTE(review): presumably 0 = cat and 1 = dog — confirm with train_generator.class_indices.
test_df['label'] = np.argmax(predict, axis=-1)

# The image id is the file name up to the first '.'.
ids = test_df['files'].str.split('.').str[0].rename('id')

# Submission layout: 'id' first, then the remaining columns, without the file names.
submission = test_df.drop(columns='files')
submission.insert(0, 'id', ids)
submission.to_csv('submission.csv', index=False)