## Dog Breed Classification Notebook
### Abe Eyman Casey & Sameer Patel
##### This notebook takes many hours to run end to end (model fitting alone took over 7 hours of wall time in the recorded run). Do not re-run it unless absolutely necessary.

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from mpl_toolkits.axes_grid1 import ImageGrid
import keras
warnings.filterwarnings('ignore')

In [2]:
# Show the working directory; the relative 'data/...' paths used below resolve against it.
os.getcwd()

'/mnt/d/Denver/new_dog_breeder/classifier_app'

In [3]:
# Folder holding the training images; every directory entry is counted, whatever its type.
train_path = 'data/train'
train_size = len(os.listdir(train_path))
print("Number of pictures: %s" % train_size)

Number of pictures: 10222


In [4]:
# Load the label table (columns `id` and `breed`, as shown in the preview) and show its first rows.
labels_df = pd.read_csv('data/labels.csv')
labels_df.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [5]:
# Distinct breed names sorted by name, kept as a one-column frame with a fresh 0..N-1 index.
all_breeds = (
    labels_df[['breed']]
    .drop_duplicates()
    .sort_values(by="breed")
    .reset_index(drop=True)
)
print('Number of dog breeds: %s' % len(all_breeds.breed))

# Raise pandas' row limits so the whole breed list is rendered instead of a truncated view.
pd.set_option('display.min_rows', 120)
pd.set_option('display.max_rows', 120)
display(all_breeds)

Number of dog breeds: 120


Unnamed: 0,breed
0,affenpinscher
1,afghan_hound
2,african_hunting_dog
3,airedale
4,american_staffordshire_terrier
5,appenzeller
6,australian_terrier
7,basenji
8,basset
9,beagle


### Data PreProcessing

In [6]:
# Shared generator for the training and validation subsets created further down.
# NOTE(review): the validation subset is drawn from this same object, so the random
# horizontal flip is applied to validation images as well — confirm that is intended.
train_datagen = ImageDataGenerator(
    rescale=1./255.,        # scale pixel values by 1/255
    validation_split=0.02,  # fraction of rows reserved for subset="validation"
    horizontal_flip=True,   # random left/right flip augmentation
)

In [7]:
# Size every image is resized to by the generators below (passed as target_size).
img_size = (331, 331)


def append_ext(fn):
    """Return the image id ``fn`` with the ".jpg" extension appended."""
    return "".join((fn, ".jpg"))

# Work on a copy so labels_df keeps the raw ids (without a file extension).
all_df = labels_df.copy()
all_df['id'] = all_df['id'].apply(append_ext)

# 85% of the labelled images go to training; random_state pins the sample.
train_df = all_df.sample(frac = .85, random_state = 13)

# Hold out the remaining rows by index label. The previous isin-mask + dropna()
# approach would also have discarded any held-out row that contains a NaN.
test_df = all_df.drop(index=train_df.index)

# The two parts must partition the labelled data exactly.
assert len(train_df) + len(test_df) == len(all_df)

In [8]:
# (rows, columns) of the training part of the split.
train_df.shape

(8689, 2)

In [9]:
# (rows, columns) of the held-out test part of the split.
test_df.shape

(1533, 2)

In [11]:
### Splitting training into train/validation

In [10]:
# Training stream: the "training" share of train_df under train_datagen's validation split.
train_generator = train_datagen.flow_from_dataframe(
    # where the samples come from
    dataframe=train_df,
    directory=train_path,
    subset="training",
    # which columns hold the file name and the label
    x_col="id",
    y_col="breed",
    class_mode="categorical",
    # how images are decoded
    target_size=img_size,
    color_mode="rgb",
    # batch ordering
    shuffle=True,
    seed=13,
)

Found 8516 validated image filenames belonging to 120 classes.


In [11]:
# Pull one (images, labels) batch from the training generator as a quick smoke test.
# NOTE(review): train_x / train_y are not referenced again in the visible cells.
train_x, train_y = next(train_generator)

In [12]:
# Validation stream: the "validation" share of train_df under train_datagen's validation split.
valid_generator = train_datagen.flow_from_dataframe(
    # where the samples come from
    dataframe=train_df,
    directory=train_path,
    subset="validation",
    # which columns hold the file name and the label
    x_col="id",
    y_col="breed",
    class_mode="categorical",
    # how images are decoded
    target_size=img_size,
    color_mode="rgb",
    # batch ordering
    shuffle=True,
    seed=13,
)

Found 173 validated image filenames belonging to 120 classes.


In [13]:
# Held-out images only get the 1/255 rescale — no augmentation and no validation split.
test_datagen = ImageDataGenerator(rescale=1./255.)

# Unlabelled, unshuffled stream over test_df, so batches follow the row order of test_df.
test_generator = test_datagen.flow_from_dataframe(
    # where the samples come from
    dataframe=test_df,
    directory=train_path,
    # file-name column only; no labels are yielded
    x_col="id",
    y_col=None,
    class_mode=None,
    # how images are decoded
    target_size=img_size,
    color_mode="rgb",
    # batch ordering
    shuffle=False,
    seed=13,
)

Found 1533 validated image filenames.


In [14]:
# Network input shape: the generators' target size plus 3 colour channels ("rgb").
# Derived from img_size so the two values cannot drift apart.
shape = (*img_size, 3)

#### Loading NASNet Large
###### Takes about 10 minutes

In [15]:
# NASNetLarge backbone with ImageNet weights and without its classification head,
# so it outputs feature maps for the new breed classifier stacked on top of it.
pretrained_model = tf.keras.applications.NASNetLarge(
    input_shape=shape,
    include_top=False,
    weights='imagenet',
)

In [16]:
# Freeze the backbone: its weights are excluded from training, leaving only the layers added on top.
pretrained_model.trainable = False

In [17]:
# Size the output layer from the label set the training generator discovered
# instead of hard-coding the number of breeds.
num_classes = len(train_generator.class_indices)

# Frozen NASNet backbone -> global average pooling -> softmax over the breeds.
model = tf.keras.Sequential([
    pretrained_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [18]:
# SGD with momentum. `learning_rate` is the supported keyword; the old `lr` alias is
# deprecated and is rejected by newer Keras optimizers.
opt = tf.keras.optimizers.SGD(learning_rate=1e-3, momentum=0.9)

# Categorical cross-entropy matches the one-hot labels produced by class_mode="categorical".
model.compile(optimizer = opt ,
              loss="categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
NASNet (Functional)          (None, 11, 11, 4032)      84916818  
_________________________________________________________________
global_average_pooling2d (Gl (None, 4032)              0         
_________________________________________________________________
dense (Dense)                (None, 120)               483960    
Total params: 85,400,778
Trainable params: 483,960
Non-trainable params: 84,916,818
_________________________________________________________________


In [19]:
# Early stopping: give up after 2 epochs without an improvement of at least 0.001 and
# roll the model back to the best weights seen. No `monitor` is passed, so the callback's
# default quantity is watched (val_loss in Keras — confirm for the installed version).
early = tf.keras.callbacks.EarlyStopping(
    min_delta=0.001,
    patience=2,
    restore_best_weights=True,
)

In [20]:
# len(generator) is the number of batches in one full pass, including the final
# partial batch. Floor division (n // batch_size) dropped that batch, so part of
# the validation subset was never scored while fitting.
STEP_SIZE_TRAIN = len(train_generator)
STEP_SIZE_VALID = len(valid_generator)

### Model Fitting 
##### (This can take several hours — the recorded run below took about 7 hours of wall time.)

In [21]:
%%time
# Fit the model for up to 8 epochs; the `early` callback may end the run sooner.
# `history` keeps the per-epoch metrics returned by fit().
history = model.fit(train_generator,
                    steps_per_epoch = STEP_SIZE_TRAIN,
                    validation_data = valid_generator,
                    validation_steps = STEP_SIZE_VALID,
                    epochs = 8,
                    callbacks = [early])

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
CPU times: user 19h 17min 28s, sys: 13h 25min 32s, total: 1d 8h 43min
Wall time: 7h 13min 11s


In [22]:
# Print the architecture again after training (same layers and parameter counts as before fitting).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
NASNet (Functional)          (None, 11, 11, 4032)      84916818  
_________________________________________________________________
global_average_pooling2d (Gl (None, 4032)              0         
_________________________________________________________________
dense (Dense)                (None, 120)               483960    
Total params: 85,400,778
Trainable params: 483,960
Non-trainable params: 84,916,818
_________________________________________________________________


In [23]:
# Persist the trained model under data/updated_model so it can be reloaded without retraining.
tf.keras.models.save_model(model, filepath = "data/updated_model")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: data/updated_model/assets


In [24]:
%%time
# Reload the saved model from disk; this reloaded copy is the one used for the test predictions below.
reconstructed_model = tf.keras.models.load_model("data/updated_model")

CPU times: user 1min 35s, sys: 1.05 s, total: 1min 36s
Wall time: 1min 34s


#### Evaluation

In [25]:
%%time
# Score the in-memory model on the validation stream; evaluate() returns [loss, accuracy]
# in the order given to compile().
# NOTE(review): this evaluates `model`, while the test predictions further down come from
# `reconstructed_model` — confirm both are meant to be the same weights.
score = model.evaluate(valid_generator)
val_loss, val_accuracy = score
print("Accuracy: {:.2f}%".format(val_accuracy * 100))
print("Loss: ", val_loss)

Accuracy: 91.91%
Loss:  0.34446975588798523
CPU times: user 2min 53s, sys: 1min 56s, total: 4min 50s
Wall time: 1min 4s


### Predicting on test images

In [26]:
%%time
# Class probabilities for every held-out image. test_generator was built with shuffle=False,
# so the rows of `pred` follow the row order of test_df.
pred = reconstructed_model.predict(test_generator)

CPU times: user 25min 39s, sys: 18min 22s, total: 44min 2s
Wall time: 9min 49s


In [27]:
# Sanity check: one row per test image, one column per breed.
pred.shape

(1533, 120)

In [28]:
# Peek at the held-out rows (they still carry their original index labels from all_df).
test_df.head()

Unnamed: 0,id,breed
2,001cdf01b096e06d78e9e5112d419397.jpg,pekinese
21,008ba178d6dfc1a583617470d19c1673.jpg,otterhound
36,00fda6ecca54efbac26e907be4b0b78b.jpg,giant_schnauzer
39,010e87fdf252645a827e37470e65e842.jpg,groenendael
50,014c2b0cd8e3b517e649cecf8543b8fe.jpg,african_hunting_dog


In [29]:
# Label the probability columns with the class order the model was actually trained on
# (index -> breed as recorded by the training generator), rather than assuming that the
# separately sorted all_breeds frame happens to be in the same order.
class_names = sorted(train_generator.class_indices, key=train_generator.class_indices.get)
assert pred.shape[1] == len(class_names), "prediction width does not match the number of classes"

# Keep "breed" as the columns' name, as before.
predicted_df = pd.DataFrame(pred, columns=pd.Index(class_names, name="breed"))
predicted_df.head()

breed,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,beagle,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,0.004654,0.002905,0.002093,0.000734,0.00223,0.003215,0.001048,0.001772,0.00271,0.001775,...,0.005485,0.015114,0.002484,0.002059,0.00152,0.000981,0.004377,0.003805,0.001316,0.005994
1,0.000417,0.00067,0.001024,0.001159,0.000384,0.000477,0.000397,0.000333,0.000883,0.000663,...,0.000213,0.000259,0.001115,0.000734,0.000687,0.000723,0.000354,0.000434,0.001688,0.000428
2,0.002084,0.000717,0.000897,0.001417,0.000708,0.00096,0.000843,0.001704,0.000818,0.000735,...,0.000628,0.000334,0.001566,0.00102,0.001383,0.000628,0.000828,0.001346,0.001615,0.000514
3,0.00157,0.002137,0.001445,0.001017,0.001457,0.000967,0.001483,0.00118,0.000858,0.001791,...,0.004568,0.001462,0.001513,0.001309,0.000958,0.001141,0.001486,0.001474,0.002143,0.001652
4,0.000755,0.000578,0.898768,0.000955,0.000978,0.001004,0.000926,0.000676,0.000874,0.00067,...,0.001413,0.000547,0.001162,0.000731,0.000803,0.000919,0.000584,0.001011,0.000872,0.000912


In [30]:
# Most probable breed per image: the column label holding each row's maximum probability.
final_preds = predicted_df.idxmax(axis="columns")
# Preview rows 1 through 9 by position (row 0 is not part of this slice).
final_preds.iloc[1:10]

1                 otterhound
2            giant_schnauzer
3                groenendael
4        african_hunting_dog
5    wire-haired_fox_terrier
6                 schipperke
7                     kelpie
8                 pomeranian
9                entlebucher
dtype: object

In [31]:
# Predictions are positional (one per test image, in generator order). A length mismatch
# would otherwise be silently NaN-filled or misaligned by index-based assignment.
assert len(final_preds) == len(test_df), "prediction count does not match test_df"

# Build a new frame with a fresh 0..N-1 index (test_df itself is left untouched) and
# attach the predictions by position.
final_pred_df = test_df.reset_index(drop=True)
final_pred_df['predicted_breed'] = final_preds.to_numpy()
final_pred_df.head()

Unnamed: 0,id,breed,predicted_breed
0,001cdf01b096e06d78e9e5112d419397.jpg,pekinese,pekinese
1,008ba178d6dfc1a583617470d19c1673.jpg,otterhound,otterhound
2,00fda6ecca54efbac26e907be4b0b78b.jpg,giant_schnauzer,giant_schnauzer
3,010e87fdf252645a827e37470e65e842.jpg,groenendael,groenendael
4,014c2b0cd8e3b517e649cecf8543b8fe.jpg,african_hunting_dog,african_hunting_dog


In [32]:
# Row-wise comparison of the true and predicted breed, then tally both outcomes.
is_correct = final_pred_df["breed"] == final_pred_df["predicted_breed"]
total_attempted = len(final_pred_df)
correct_tot = int(is_correct.sum())
misclassified_tot = total_attempted - correct_tot
print(f"Correctly classified {correct_tot}/{total_attempted} dog breeds leaving {misclassified_tot} misclassified.")

Correctly classified 1440/1533 dog breeds leaving 93 misclassified.


In [33]:
# Persist the exact train/test split so it can be reused without re-sampling.
# to_csv does not create missing parent folders, so make sure the target exists first.
os.makedirs('data/preprocessed_saves', exist_ok=True)
train_df.to_csv('data/preprocessed_saves/our_train.csv')
test_df.to_csv('data/preprocessed_saves/our_test.csv')