# Dog Breed Identification
## Farhad Navid 

### transfer learning with origenal data set

* Load data set
* Load weights from ImageNet features for pretrained VGG19 model "block4_pool".
* Run model
* Create train and test
* Run SVM model
* Record the performance. 

In [1]:
# initialization
import matplotlib.pyplot as plt 
import PIL
import pandas as pd
import os
import glob
import time
import numpy as np
import h5py
import tensorflow as tf
#import deepdish as dd

from array import *
from PIL import Image as Img

from tqdm import tqdm
from keras.datasets import mnist
from keras.models import Model, Sequential
from keras.layers import Input, Concatenate, Dense, Dropout, Flatten, Activation, GlobalMaxPooling2D
from keras.layers.convolutional import Conv2D, MaxPooling2D 

from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical
from keras import backend as K
from keras.optimizers import Adam

from keras.applications.vgg19 import VGG19
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input

from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score,confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import train_test_split


%matplotlib inline
K.set_image_dim_ordering( 'tf' )

Using TensorFlow backend.


This next block of code does represent the Directory structure of the data once the repository was selected.  In this instance the [**AWS**](https://www.AWS.Amazon.com/) (paid service) were utilized.  

In [2]:
# directory structure of the data

dpath_train = '/home/ubuntu/train'      # Data path to training data set
dpath_test  = '/home/ubuntu/test'       # Data path to test data set
dpath       = '/home/ubuntu'
label_f     = '/home/ubuntu/labels.csv' # Path to the label file 

### Load the pre-processed data

In [3]:
# now read to hdf5 
#The processed train data set file  that was created by input_prep file.
with h5py.File('train_data.hdf5','r') as f:
    X_train = f['X'][()] #the [()] means load all x_train data
    y_train = f['Y'][()] # Load all y_train data
print(X_train.shape,X_train.dtype,y_train.shape,y_train.dtype)

(10222, 224, 224, 3) uint8 (10222, 120) uint8


In [4]:
print(X_train.shape,X_train.dtype,y_train.shape,y_train.dtype)
y_train=y_train.astype('uint8')
#y_train[:5]

(10222, 224, 224, 3) uint8 (10222, 120) uint8


### y_train is one hot encoded and need to get it to integer form.

In [5]:
# converting the y_train from one hot encoding to integer
ytrain_1 = np.array(y_train)
ytrain1_rdy = [np.where(r==1)[0][0] for r in ytrain_1]

In [7]:
ytrain1_rdy[:1]

[19]

## Transfer Learning

In [8]:
# transfer learning
# Get the weights from imagenet for the VGG19 model
base_model = VGG19(weights='imagenet')
# now select the layer to get the features from in this case "block4_pool" was selected
model = Model(inputs=base_model.input, outputs=base_model.get_layer('block4_pool').output)
                    
# Create a zero Numpy array with the shape of np.zero((Xs_train.shape[0],(block4_pool_features))) 
# for this modle the (image#,14,14,512)

train_set = np.zeros((X_train.shape[0],14,14,512))

## This section of code was written to findout the dimmention of the Block4_pool_features.

# x = preprocess_input(X_train[1])  # get one sample of X_train
# print(x.shape)                    # Chk the shape 
# x = np.expand_dims(x,axis=0)      # Ad the image num to the list
# print(x.shape)                    # Check the shape

# block4_pool_features_org = model.predict(x)  # Create one entry to see the shape

# this loop will fill the train_set numpy array (each x_train runs through model with extracted VGG features)
for i in tqdm(range(X_train.shape[0])):
#    K.clear_session()
    x = preprocess_input(X_train[i])
    x = np.expand_dims(x,axis=0)
    block4_pool_features = model.predict(x)
    train_set[i]=block4_pool_features
    

100%|██████████| 10222/10222 [05:07<00:00, 33.21it/s]



### Create Train and Test data set for SVM model

In [9]:
# Create train and test data set
train_set = train_set.reshape(train_set.shape[0],-1)
X= train_set
y= ytrain1_rdy

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## SVM model
* **Model Fit**
* **Model Prediction**

In [10]:
# Run the svm 

t0=time.time()
clf = SVC()
clf.fit(X_train, y_train)
t1=time.time()
print(t1-t0,"seconds")

# predict 
t0=time.time()
pred_Train=clf.predict(X_train) 
pred_Test=clf.predict(X_test) 
t1=time.time()
print(t1-t0,"seconds")
# print(pred_Train[:10],pred_Test[:10])
# print(clf.predict([train_set[5]]))

22998.42781996727 seconds
12805.782129049301 seconds


In [11]:
print(cohen_kappa_score(y_train, pred_Train),'Train data set cohen kappa score')
print(cohen_kappa_score(y_test, pred_Test),'Test data set cohen kappa score')

confusion_matrix(y_train, pred_Train)

print(accuracy_score(y_train, pred_Train),'Accuracy Score Train Data set')
print(accuracy_score(y_test, pred_Test), 'Accuracy Score Test Data set')

print(classification_report(y_train, pred_Train))
print(classification_report(y_test, pred_Test))

print(metrics.f1_score(y_train, pred_Train,average='weighted'),'matrics f1 score train data set')
#22998.42781996727 seconds
#12805.782129049301 seconds

0.9994517860065777 Train data set cohen kappa score
0.0019502601948914178 Test data set cohen kappa score
0.9994564626589847 Accuracy Score Train Data set
0.007820136852394917 Accuracy Score Test Data set
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        71
          1       1.00      1.00      1.00       107
          2       1.00      1.00      1.00        75
          3       1.00      1.00      1.00        95
          4       1.00      1.00      1.00        69
          5       1.00      1.00      1.00        70
          6       1.00      1.00      1.00        90
          7       1.00      1.00      1.00       102
          8       1.00      1.00      1.00        76
          9       1.00      1.00      1.00        95
         10       1.00      1.00      1.00        71
         11       1.00      1.00      1.00        99
         12       1.00      1.00      1.00        71
         13       1.00      0.99      0.99        87

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [12]:
confusion_matrix(y_train, pred_Train)

array([[ 71,   0,   0, ...,   0,   0,   0],
       [  0, 107,   0, ...,   0,   0,   0],
       [  0,   0,  75, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,  86,   0,   0],
       [  0,   0,   0, ...,   0,  74,   0],
       [  0,   0,   0, ...,   0,   0,  77]])

In [14]:
print(confusion_matrix(y_test, pred_Test))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
