In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

import PIL
from tqdm import tqdm
from PIL import Image
from keras.preprocessing.image import ImageDataGenerator
from matplotlib import rcParams

from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from tensorflow import keras

import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.imagenet_utils import preprocess_input

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization, Input
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import AveragePooling2D, MaxPooling2D, Dropout
from tensorflow.keras.utils import to_categorical


In [None]:
# Mount Google Drive only when the notebook is actually running on Colab.
# Outside Colab the `google.colab` package cannot be imported, so the import is
# guarded to keep "Restart & Run All" working in other environments.
# IN_COLAB records which branch was taken.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')

In [None]:
# Project locations -- change `root` as required.
# Every other path is `root` followed by a fixed suffix.
root = '/content/drive/MyDrive/ML_Project'
train_csv, train_images, test_images = (
    root + suffix for suffix in ('/train.csv', '/train_images', '/test_images')
)

In [None]:
# Read the training metadata from the .csv and repair the species labelling errors.
train_df = pd.read_csv(train_csv)
train_df['image_path'] = train_images + '/' + train_df['image']

# (substring searched for in the species label, label written to matching rows).
# The fixes are applied one after another, in this order.
species_fixes = [
    ('beluga', 'beluga_whale'),
    ('globis', 'globis_whale'),
    ('dolpin', 'bottlenose_dolphin'),
    ('kiler', 'killer_whale'),
]
for fragment, corrected in species_fixes:
    matches = train_df['species'].str.contains(fragment)
    train_df.loc[matches, 'species'] = corrected

print(f'List of Unique Species:\n {train_df.species.unique()}')

print(f'\n Number of individual species, updated: {train_df.species.nunique()} \n')


In [None]:
def imageLoad(paths, target_size=(56, 80), dtype=np.float32):
  """Load the images at `paths` into a single 4-D array.

  Each image is opened as RGB, resized to `target_size`, converted to an
  array and passed through `preprocess_input` (called with its default
  arguments) before being stored.

  Parameters
  ----------
  paths : iterable of str
      Image file paths. The iterable is materialised into a list and walked
      positionally, so a pandas Series works whatever its index labels are.
  target_size : tuple of int, optional
      (height, width) every image is resized to. Defaults to (56, 80).
  dtype : numpy dtype, optional
      dtype of the returned array. Defaults to float32, which halves the
      memory of a float64 buffer.

  Returns
  -------
  numpy.ndarray
      Array of shape (len(paths), height, width, 3); row i corresponds to
      the i-th path.

  NOTE(review): with its default mode `preprocess_input` presumably applies
  ImageNet-style channel reordering and mean subtraction, so the values
  returned here are not raw 0-255 pixels -- confirm before rescaling.
  """
  # Positional iteration: `paths[i]` on a Series is a label lookup and breaks
  # (or silently reorders) when the index is not 0..n-1.
  paths = list(paths)
  height, width = target_size

  X_train = np.zeros((len(paths), height, width, 3), dtype=dtype)
  for i, path in enumerate(tqdm(paths)):
    #Open image from path
    img = image.load_img(path, target_size = target_size, color_mode = 'rgb')
    #Convert to array
    x = image.img_to_array(img)
    #Add a leading batch axis before preprocessing
    x = np.expand_dims(x, axis = 0)
    #Preprocess input for keras model
    x = preprocess_input(x)
    #Drop the batch axis again when storing the single image
    X_train[i] = x[0]
  return X_train


In [None]:
# Load the table of augmented images, keeping only the three columns used later.
augmented_columns = ['individual_id', 'species', 'image_path']
augImg_df = pd.read_csv(root + '/final_generated_train_images.csv')
augImg_df = augImg_df.loc[:, augmented_columns]
print(f'Number of individual IDs: {augImg_df.individual_id.nunique()}')


In [None]:
# Order the rows by individual ID, then keep at most three images for each ID.
train_df = (
    train_df
    .sort_values(by=['individual_id'])
    .reset_index(drop=True)
    .groupby(by=['individual_id'])
    .head(3)
    .reset_index(drop=True)
)


In [None]:
# Add the augmented images below the original ones.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# The original rows stay first, so they win when the per-ID cap is applied.
train_total_df = pd.concat([train_df, augImg_df]).reset_index(drop = True)

# Redundancy to make sure there are at most 3 images per ID
train_total_df = train_total_df.groupby(by=['individual_id']).head(3).reset_index(drop = True)

In [None]:
# Count how many images each individual ID has and tabulate the result.
id_counts = train_total_df['individual_id'].value_counts()
ID = pd.DataFrame(id_counts)

# Two-column summary: the ID and the number of images found for it.
ID_df = pd.DataFrame(columns=['individual_id', 'image_freq'])
ID_df['individual_id'] = ID.index
ID_df['image_freq'] = ID.to_numpy().astype(int)

print(ID_df)

In [None]:
# Sanity check: the number of distinct individual IDs should be 15587.
n_unique_ids = train_total_df.individual_id.nunique()
print(n_unique_ids)

In [None]:
pd.set_option('display.max_colwidth', None)
# Split off a validation set of 1 image per ID.
# The held-out rows are selected first and remembered by their ORIGINAL row
# labels. Comparing against the index of the already re-indexed validation
# frame would drop the first N rows of train_total_df instead of the rows
# that were actually held out, leaking validation images into training.
valid_rows = train_total_df.groupby('individual_id').head(1)
valid_set = valid_rows.sort_values(by = ['individual_id']).reset_index(drop = True)
print(valid_set)

# Everything that was not held out becomes the training set.
train_df2 = (
    train_total_df
    .drop(index = valid_rows.index)
    .sort_values(by = ['individual_id'])
    .reset_index(drop = True)
)
print(train_df2)

# The two splits must partition the full table: no row lost, none duplicated.
assert len(valid_set) + len(train_df2) == len(train_total_df)


In [None]:
# Read every training image from disk into one array.
train_image_paths = train_df2.image_path
Xtrain = imageLoad(train_image_paths)

In [None]:
# Read every validation image from disk into one array.
valid_image_paths = valid_set.image_path
Valid_X = imageLoad(valid_image_paths)

In [None]:
# Divide both image arrays by 255, in place.
# NOTE(review): imageLoad already runs preprocess_input on every image, so the
# result is presumably not confined to the 0-1 range -- confirm the intended scaling.
for pixel_array in (Xtrain, Valid_X):
    pixel_array /= 255

In [None]:
#Function for OHE labels
def labels(y):
    """One-hot encode a 1-D collection of class labels.

    The distinct labels are sorted and each label is mapped to the position
    of its class in that sorted order. This is the same encoding the previous
    LabelEncoder + OneHotEncoder pipeline produced. It is done with NumPy
    alone, so it does not rely on the `sparse` keyword of OneHotEncoder,
    which newer scikit-learn releases no longer accept.

    Parameters
    ----------
    y : array-like
        Labels (list, NumPy array or pandas Series).

    Returns
    -------
    numpy.ndarray
        float64 array of shape (n_samples, n_classes). Row i has a single 1.0
        in the column of the class of y[i]; every other entry is 0.0.
    """
    values = np.asarray(y).ravel()
    # `classes` is sorted; `codes[i]` is the column index for values[i].
    classes, codes = np.unique(values, return_inverse=True)
    codes = codes.ravel()

    # Fill a zero matrix directly rather than indexing an identity matrix, so
    # no (n_classes x n_classes) temporary is allocated.
    onehot = np.zeros((values.size, classes.size))
    onehot[np.arange(values.size), codes] = 1.0
    return onehot

In [None]:
#OHE target data for both training and validation

ytrain = train_df2['individual_id']
valid_y = valid_set['individual_id']

# Encode both splits with ONE call so they share a single class -> column
# mapping. Encoding them separately only lines up when both splits happen to
# contain exactly the same set of IDs; otherwise the columns (and even the
# number of columns) differ between Ytrain and Valid_Y.
all_encoded = labels(pd.concat([ytrain, valid_y], ignore_index = True))

# The first len(ytrain) rows belong to the training split, the rest to validation.
Ytrain = all_encoded[:len(ytrain)]
Valid_Y = all_encoded[len(ytrain):]

In [None]:
#MODEL V2
model = Sequential()

# One output unit per distinct individual ID in train_df.
n_classes = train_df['individual_id'].nunique()

# The network, listed in the order the layers are stacked.
cnn_layers = [
    # Block 1: 16 filters of 3x3, batch normalization, 2x2 max-pooling
    Conv2D(16, 3, activation = 'relu', input_shape = (56, 80, 3)),
    BatchNormalization(axis = 3),
    MaxPooling2D(pool_size = (2,2)),

    # Block 2: 32 filters of 3x3, batch normalization, 2x2 max-pooling
    Conv2D(32, 3, activation='relu'),
    BatchNormalization(axis = 3),
    MaxPooling2D(pool_size = (2,2)),

    # Block 3: 64 filters of 3x3, 2x2 average-pooling, dropout of 0.4
    Conv2D(64, 3, activation='relu'),
    AveragePooling2D(pool_size = (2,2)),
    Dropout(0.4),

    # Flatten the feature map to 1-D so fully connected layers can follow
    Flatten(),

    # Fully connected layer: 256 hidden units with ReLU, then dropout of 0.4
    Dense(256, activation="relu"),
    Dropout(0.4),

    # Output layer: softmax over the individual IDs
    Dense(n_classes, activation='softmax'),
]
for layer in cnn_layers:
    model.add(layer)

# Print the architecture, then compile the model
model.summary()
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])

In [None]:
# Checkpoint callback that writes the model under the project root during training.
# NOTE(review): `monitor` presumably only has an effect together with a
# "save best only" style option, which is not set here -- confirm.
checkpoint_path = root+"/model_checkpoint"
callbacks = [ModelCheckpoint(checkpoint_path, monitor='acc')]

In [None]:
# Train the model. These settings are initial values and have not been tuned.
fit_options = dict(
    epochs = 40,
    batch_size = 128,
    verbose = 1,
    callbacks = callbacks,
    validation_data = (Valid_X, Valid_Y),
)
history = model.fit(x = Xtrain, y = Ytrain, **fit_options)

In [None]:
# Accuracy per epoch for the training and validation sets
acc_fig, acc_ax = plt.subplots(dpi=1200)
for metric_name in ('acc', 'val_acc'):
    acc_ax.plot(history.history[metric_name])
acc_ax.set_title('Model Accuracy')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.legend(['train', 'val'], loc='upper left')
#plt.savefig('Accuracy_plot_1.png',bbox_inches='tight')

In [None]:
# Loss per epoch for the training and validation sets
loss_fig, loss_ax = plt.subplots(dpi=1200)
for metric_name in ('loss', 'val_loss'):
    loss_ax.plot(history.history[metric_name])
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.legend(['train', 'val'], loc='upper left')
#plt.savefig('Loss_plot_1.png',bbox_inches='tight')

In [None]:
# Persist the trained model under the project's models directory.
final_model_path = root + '/models/Model_CNN_Validation'
model.save(final_model_path)