## Dog Breed Classification Notebook
### Abe Eyman Casey & Sameer Patel
##### This notebook will take 5+ hours to run. Do not run unless absolutely necessary

In [65]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from mpl_toolkits.axes_grid1 import ImageGrid
import keras
warnings.filterwarnings('ignore')

In [2]:
os.getcwd()

'/mnt/d/Denver/final_quarter/capstone/DogBreedClassification'

In [3]:
train_path = 'data/train'
train_size = len(os.listdir(train_path))
print("Number of pictures: %s" % train_size)

Number of pictures: 10222


In [4]:
labels_df = pd.read_csv('data/labels.csv')
labels_df.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [5]:
all_breeds = labels_df.sort_values(by = "breed").filter(['breed']).drop_duplicates().reset_index(drop = True)
print('Number of dog breeds: %s' % len(all_breeds.breed))
pd.set_option('display.min_rows', 120)
pd.set_option('display.max_rows', 120)
display(all_breeds)

Number of dog breeds: 120


Unnamed: 0,breed
0,affenpinscher
1,afghan_hound
2,african_hunting_dog
3,airedale
4,american_staffordshire_terrier
5,appenzeller
6,australian_terrier
7,basenji
8,basset
9,beagle


### Data PreProcessing

In [7]:
train_datagen=ImageDataGenerator(rescale=1./255.,
                                  horizontal_flip = True,
                                  validation_split=0.02 
                                  )

In [8]:
img_size = (331, 331)
def append_ext(fn):
    return fn+".jpg"

all_df = labels_df.copy()
all_df['id'] = all_df['id'].apply(append_ext)
train_df = all_df.sample(frac = .8, random_state = 13)
test_df = all_df[~all_df.isin(train_df)].dropna()

In [9]:
train_df.shape

(8178, 2)

In [10]:
test_df.shape

(2044, 2)

In [11]:
### Splitting training into train/validation

In [12]:
train_generator=train_datagen.flow_from_dataframe(
    dataframe = train_df,
    directory = train_path,
    x_col = "id",
    y_col = "breed",
    subset = "training",
    seed = 13,
    shuffle = True,
    class_mode = "categorical",
    target_size = img_size,
    color_mode = "rgb" 
)

Found 8015 validated image filenames belonging to 120 classes.


In [13]:
train_x, train_y = next(train_generator)

In [16]:
valid_generator=train_datagen.flow_from_dataframe(
    dataframe = train_df,
    directory = train_path,
    x_col = "id",
    y_col = "breed",
    subset = "validation",
    seed = 13,
    shuffle = True,
    class_mode="categorical",
    target_size = img_size,
    color_mode = "rgb"
)

Found 163 validated image filenames belonging to 120 classes.


In [17]:
test_datagen=ImageDataGenerator(rescale=1./255.)
test_generator=test_datagen.flow_from_dataframe(
    dataframe = test_df,
    directory = train_path,
    x_col="id",
    y_col = None,
    seed = 13,
    shuffle = False,
    class_mode = None,
    target_size = img_size,
    color_mode="rgb"
)

Found 2044 validated image filenames.


In [18]:
shape = (331, 331, 3)

#### Loading NASNet Large
###### Takes about 10 minutes

In [19]:
pretrained_model = tf.keras.applications.NASNetLarge(
        weights='imagenet',
        include_top = False ,
        input_shape = shape
    )

In [20]:
pretrained_model.trainable = False

In [21]:
model = tf.keras.Sequential([ 
        pretrained_model,   
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(120, activation='softmax')
    ])

In [22]:
opt=tf.keras.optimizers.SGD(lr=1e-3, momentum=0.9)
model.compile(optimizer = opt ,
              loss="categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
NASNet (Functional)          (None, 11, 11, 4032)      84916818  
_________________________________________________________________
global_average_pooling2d (Gl (None, 4032)              0         
_________________________________________________________________
dense (Dense)                (None, 120)               483960    
Total params: 85,400,778
Trainable params: 483,960
Non-trainable params: 84,916,818
_________________________________________________________________


In [23]:
early = tf.keras.callbacks.EarlyStopping(patience=2,
                                         min_delta=0.001,
                                         restore_best_weights=True)

In [24]:
STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size

### Model Fitting 
##### (THIS COULD TAKE AN HOUR PLUS)

In [25]:
%%time
history = model.fit(train_generator,
                    steps_per_epoch = STEP_SIZE_TRAIN,
                    validation_data = valid_generator,
                    validation_steps = STEP_SIZE_VALID,
                    epochs = 5,
                    callbacks = [early])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 11h 31min 42s, sys: 9h 25min 34s, total: 20h 57min 16s
Wall time: 4h 42min 26s


In [63]:
model.save('data/trained_model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: data/trained_model/assets


In [66]:
reconstructed_model = keras.models.load_model("data/trained_model")

#### Evaluation

In [26]:
score = model.evaluate(valid_generator)
print("Accuracy: {:.2f}%".format(score[1] * 100)) 
print("Loss: ", score[0])

Accuracy: 92.64%
Loss:  0.5643866062164307


### Predicting on test images

In [27]:
%%time
pred = model.predict(test_generator)

CPU times: user 34min 45s, sys: 28min 9s, total: 1h 2min 55s
Wall time: 14min 7s


In [33]:
pred.shape

(2044, 120)

In [36]:
test_df.head()

Unnamed: 0,id,breed
2,001cdf01b096e06d78e9e5112d419397.jpg,pekinese
10,004396df1acd0f1247b740ca2b14616e.jpg,shetland_sheepdog
21,008ba178d6dfc1a583617470d19c1673.jpg,otterhound
36,00fda6ecca54efbac26e907be4b0b78b.jpg,giant_schnauzer
39,010e87fdf252645a827e37470e65e842.jpg,groenendael


In [69]:
predicted_df = pd.DataFrame(pred, columns = all_breeds.breed)
predicted_df.head()

breed,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,beagle,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,0.004197,0.006534,0.003837,0.002098,0.004739,0.003312,0.002676,0.005304,0.00197,0.002937,...,0.010476,0.016057,0.004408,0.002663,0.003131,0.002859,0.00977,0.004098,0.003049,0.011944
1,0.00226,0.002659,0.002142,0.003769,0.0029,0.002411,0.005332,0.004167,0.002633,0.003169,...,0.003907,0.003577,0.003075,0.003557,0.003635,0.00286,0.002474,0.003709,0.002692,0.002636
2,0.001054,0.003544,0.003084,0.003451,0.001087,0.001233,0.001211,0.000963,0.001427,0.001637,...,0.000912,0.000856,0.002033,0.003121,0.000987,0.000894,0.000782,0.000922,0.004867,0.000767
3,0.004214,0.002683,0.001646,0.004768,0.002115,0.003199,0.001861,0.005518,0.001515,0.001936,...,0.001959,0.001302,0.001647,0.001861,0.001505,0.0026,0.002262,0.002162,0.005202,0.001405
4,0.00494,0.003926,0.004086,0.003022,0.002438,0.001651,0.004846,0.004564,0.002899,0.002828,...,0.007762,0.003666,0.002154,0.003053,0.002312,0.002039,0.004106,0.002861,0.003046,0.002788


In [70]:
final_preds = predicted_df.idxmax(axis=1)
final_preds[1:10]

1          shetland_sheepdog
2                 otterhound
3            giant_schnauzer
4                groenendael
5        african_hunting_dog
6    wire-haired_fox_terrier
7                     basset
8                 schipperke
9                 bloodhound
dtype: object

In [78]:
final_pred_df = test_df.copy()
final_pred_df.reset_index(drop = True, inplace = True)
final_pred_df['predicted_breed'] = final_preds
final_pred_df.head()

Unnamed: 0,id,breed,predicted_breed
0,001cdf01b096e06d78e9e5112d419397.jpg,pekinese,pekinese
1,004396df1acd0f1247b740ca2b14616e.jpg,shetland_sheepdog,shetland_sheepdog
2,008ba178d6dfc1a583617470d19c1673.jpg,otterhound,otterhound
3,00fda6ecca54efbac26e907be4b0b78b.jpg,giant_schnauzer,giant_schnauzer
4,010e87fdf252645a827e37470e65e842.jpg,groenendael,groenendael


In [81]:
misclassified_tot = final_pred_df.query("breed != predicted_breed").shape[0]
correct_tot = final_pred_df.query("breed == predicted_breed").shape[0]
total_attempted = final_pred_df.shape[0]
print(f"Correctly classified {correct_tot}/{total_attempted} dog breeds leaving {misclassified_tot} misclassified.")

Correctly classified 1900/2044 dog breeds leaving 144 misclassified.


In [58]:
train_df.to_csv('data/preprocessed_saves/our_train.csv')
test_df.to_csv('data/preprocessed_saves/our_test.csv')