In [None]:
# Imports
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import files
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import Xception
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Prompt to upload Kaggle credentials (kaggle.json) to download data from Kaggle.
# files.upload() returns a {filename: file contents} mapping. Bind it to a name
# instead of leaving it as the cell's last expression; otherwise the API key
# inside kaggle.json would be echoed into the notebook output.
uploaded_files = files.upload()
if 'kaggle.json' not in uploaded_files:
    # Only the filenames are reported, never the contents.
    raise FileNotFoundError(
        "Expected an upload named 'kaggle.json'; got: " + ", ".join(uploaded_files)
    )

In [None]:
# Downloading dataset from Kaggle

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# chmod 600 -> read + write permissions
!chmod 600 ~/.kaggle/kaggle.json

!mkdir dog_dataset
%cd dog_dataset
!kaggle datasets download catherinehorng/dogbreedidfromcomp
%cd ..

In [None]:
# Unzip data and remove zip folder
!unzip -q dog_dataset/dogbreedidfromcomp.zip -d dog_dataset
!rm dog_dataset/dogbreedidfromcomp.zip

In [None]:
# Load the label table and turn every id into its image filename (<id>.jpg)
labels = (
    pd.read_csv('dog_dataset/labels.csv')
    .assign(id=lambda frame: frame["id"] + ".jpg")
)

In [None]:
# OPTIONAL: sanity-check whether the breed distribution is balanced or skewed.
# This is a widely used Kaggle dataset, so it should be fine; the plot is just a quick confirmation.

sns.set_theme()
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(x='breed', data=labels, ax=ax)
ax.set_title('Dog Breed Distribution')
# Breed names only fit on the x-axis when drawn vertically
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Create train, validation, and test splits with ~80 10 10 ratio

# seed for reproducible results
RANDOM_STATE = 42 # Answer to life, the universe, and everything

# Stratify on breed so every class keeps the same proportion in each split and
# no breed can end up missing from the validation or test set.
# (Stratification needs at least 2 samples of every breed per split call.)
train_val_df, test_df = train_test_split(
    labels,
    test_size=0.1,
    stratify=labels['breed'],
    random_state=RANDOM_STATE,
)
# 0.11 of the remaining 90% is ~10% of the full dataset
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.11,
    stratify=train_val_df['breed'],
    random_state=RANDOM_STATE,
)

In [None]:
# Make sure the sizes are compact enough, Colab free tier does NOT give me a GPU to analyze images at 4K

# Image.open keeps the underlying file open until the image is closed, so read
# each size inside a context manager instead of leaking one handle per image.
image_sizes = []
for img_id in train_df['id']:
    with Image.open(f'dog_dataset/train/{img_id}') as img:
        image_sizes.append(img.size)  # PIL reports size as (width, height)

widths = [size[0] for size in image_sizes]
heights = [size[1] for size in image_sizes]

w = np.array(widths)
h = np.array(heights)

figure, axis = plt.subplots(1, 2)

axis[0].hist(w)
axis[0].set_title('Widths Distribution')
axis[0].set_xlabel('width (px)')
axis[0].set_ylabel('number of images')

axis[1].hist(h)
axis[1].set_title('Heights Distribution')
axis[1].set_xlabel('height (px)')
plt.show()

In [None]:
# Square resolution every image is resized to; chosen from the width/height
# histograms in the previous cell
SIZE = (350, 350)

# One output class per distinct breed in the label table
NUM_CLASSES = len(labels['breed'].unique())

# Initial hyperparameters
EPOCHS = 50
BATCH_SIZE = 32
DROPOUT_RATE = 0.7
LEARNING_RATE = 1e-3

In [None]:
# Scale pixels the way the pretrained Xception weights expect: preprocess_input
# maps raw [0, 255] values to [-1, 1]. A plain `rescale` cannot do this, because
# rescale is only a multiplier and cannot apply the required shift.
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

In [None]:
# Data generators

# Fix the label order once so all three generators share a single
# breed -> index mapping instead of each inferring one from its own dataframe.
CLASS_NAMES = sorted(labels['breed'].unique())


def make_generator(datagen, dataframe, shuffle):
    """Build a batch iterator over the images listed in `dataframe`.

    Args:
        datagen: ImageDataGenerator that preprocesses each image.
        dataframe: rows with an 'id' (filename) and a 'breed' (label) column.
        shuffle: whether to reshuffle the rows every epoch.

    Returns:
        An iterator yielding (images, one-hot labels) batches of BATCH_SIZE
        images resized to SIZE.
    """
    return datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory='dog_dataset/train',
        x_col='id',
        y_col='breed',
        target_size=SIZE,
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        classes=CLASS_NAMES,
        shuffle=shuffle,
        seed=RANDOM_STATE,
    )


# Only the training data is shuffled; validation and test keep a fixed order so
# their batches are identical from one epoch / run to the next.
train_generator = make_generator(train_datagen, train_df, shuffle=True)
test_generator = make_generator(test_datagen, test_df, shuffle=False)
val_generator = make_generator(val_datagen, val_df, shuffle=False)

In [None]:
# Base model: ImageNet-pretrained Xception without its classification head,
# fed RGB images of SIZE
input_tensor = Input(shape=(*SIZE, 3))
base_model = Xception(
    include_top=False,
    weights='imagenet',
    input_tensor=input_tensor,
)
# Freeze the pretrained layers so only the layers added on top are trained
base_model.trainable = False

In [None]:
# Classification head: pool the base model's feature maps, apply dropout,
# then a softmax layer with one unit per breed
pooled = GlobalAveragePooling2D()(base_model.output)
regularized = Dropout(DROPOUT_RATE)(pooled)
output = Dense(NUM_CLASSES, activation='softmax')(regularized)

# Assemble and compile the full model
model = Model(inputs=input_tensor, outputs=output)
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'],
)

In [None]:
# Callbacks
# Stop once val_loss has not improved for 5 epochs. restore_best_weights puts the
# weights of the best epoch back on the in-memory model, so the cells that later
# evaluate and save `model` do not use weights from several epochs past the best one.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
# Independently keep the best-val_loss model on disk as 'model.keras'
model_checkpoint = ModelCheckpoint('model.keras', monitor='val_loss', save_best_only=True, verbose=1)

In [None]:
# Train model
# steps_per_epoch / validation_steps are left unset so each epoch runs over the
# full length of its generator. Floor-dividing samples by BATCH_SIZE dropped the
# last partial batch, so val_loss (which drives both callbacks) was computed on a
# truncated validation set.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=EPOCHS,
    callbacks=[early_stopping, model_checkpoint],
)

# Learning curves: one figure per metric, training vs. validation
for metric in ('accuracy', 'loss'):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history[f'val_{metric}'], label='validation')
    ax.set(title=f'model {metric}', xlabel='epoch', ylabel=metric)
    ax.legend(loc='upper left')
    plt.show()

In [None]:
# Measure the model on the held-out test split.
# evaluate() returns the loss followed by the compiled metric (accuracy).
score = model.evaluate(test_generator)
test_loss, test_accuracy = score[0], score[1]
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

In [None]:
# Persist the in-memory model as 'final_model.keras'. This is a separate file from
# 'model.keras', which the ModelCheckpoint callback writes during training.
model.save('final_model.keras')