In [7]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

import os
# Show what is available under the notebook's input directory.
input_entries = os.listdir("../input")
print(input_entries)

In [11]:
# The relative path below follows the Kaggle Notebook input layout.
train_csv_path = '../input/humpback-whale-identification/train.csv'
train = pd.read_csv(train_csv_path)

In [12]:
# Preview the first five rows of the training table.
train.head(5)

In [13]:
# Summarise the label column.
id_summary = train['Id'].describe()
id_summary

In [33]:
# The 'Id' column holds the training labels.
y_train = train.loc[:, 'Id']

## Image Preprocessing

In [52]:
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input

def prepareImages(train, shape, path,
                  base_dir="../input/humpback-whale-identification",
                  target_size=(100, 100)):
    """Load, resize and preprocess every image listed in ``train['Image']``.

    Parameters
    ----------
    train : mapping with an ``'Image'`` entry (e.g. a DataFrame)
        Iterating ``train['Image']`` must yield image file names.
    shape : int
        Number of rows to allocate in the output array, i.e. the number of
        images (despite the name, this is a count, not an array shape).
    path : str
        Sub-directory of ``base_dir`` that contains the files, e.g. ``'train'``.
    base_dir : str, optional
        Root directory of the dataset. Defaults to the location the notebook
        has always read from.
    target_size : tuple of int, optional
        ``(height, width)`` every image is resized to. Only the first two
        entries are used, so a legacy ``(h, w, channels)`` tuple still works.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(shape, height, width, 3)``. Row ``i`` holds the i-th
        listed image after ``preprocess_input``; rows beyond the number of
        listed images are left as zeros.
    """
    height, width = target_size[:2]
    X_train = np.zeros((shape, height, width, 3))

    for count, fig in enumerate(train['Image']):
        # load_img takes target_size as (height, width); the channel count is
        # not part of it, so pass exactly two values.
        img = image.load_img(base_dir + "/" + path + "/" + fig,
                             target_size=(height, width))
        x = image.img_to_array(img)
        x = preprocess_input(x)

        X_train[count] = x
        # Progress message every 500 images (1-based counter in the output).
        if count % 500 == 0:
            print("Processing image: ", count+1, ", ", fig)

    return X_train

In [53]:
# Load every training image, then divide the whole array by 255 in place.
n_train_images = len(train)
X_train = prepareImages(train, n_train_images, 'train')
X_train /= 255

In [54]:
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

# Encode the whale Ids as integers, then one-hot them for the softmax output.
# The labels are read straight from `train` instead of from `y_train`, so this
# cell can be re-run without feeding the one-hot matrix back into the encoder.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(train['Id'])

# Take the class count from the fitted encoder rather than hard-coding it.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(y_train_encoded, num_classes=num_classes)

In [38]:
# Dimensions of the label array.
np.shape(y_train)

## Model Building

In [56]:
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dropout, Flatten, MaxPooling2D, Conv2D, Dense
from keras.layers.normalization import BatchNormalization

In [57]:
# The network is assembled from four groups of layers, listed in the exact
# order in which they are stacked.

# Group 1: two 5x5 convolutions at stride 1, then 2x2 max-pooling.
conv_stage_1 = [
    Conv2D(32, (5, 5), strides=(1, 1), padding='same', activation='relu',
           input_shape=(100, 100, 3)),
    Conv2D(32, (5, 5), strides=(1, 1), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),
]

# Group 2: two 3x3 convolutions at stride 2, then 2x2 max-pooling.
conv_stage_2 = [
    Conv2D(32, (3, 3), strides=(2, 2), padding='same', activation='relu'),
    Conv2D(32, (3, 3), strides=(2, 2), padding='same', activation='relu'),
    MaxPooling2D((2, 2), strides=(2, 2)),
]

# Group 3: two 3x3 convolutions with 64 filters, then 2x2 max-pooling.
conv_stage_3 = [
    Conv2D(64, (3, 3), strides=(1, 1), padding='same', activation='relu'),
    Conv2D(64, (3, 3), strides=(1, 1), padding='same', activation='relu'),
    MaxPooling2D((2, 2), strides=(2, 2)),
]

# Group 4: dropout, flatten, two dense layers and a softmax output with one
# unit per column of the one-hot label matrix.
classifier_head = [
    Dropout(0.2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dense(y_train.shape[1], activation='softmax'),
]

model = Sequential(conv_stage_1 + conv_stage_2 + conv_stage_3 + classifier_head)

In [58]:
# Display the model summary to sanity-check the architecture before training.
model.summary()

In [59]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
### Tracking the model's loss and accuracy during training

In [60]:
# Training hyper-parameters: number of passes over the data and mini-batch size.
epochs, batchsize = 100, 1024

In [61]:
# Fit on the training arrays; `history` keeps the metrics recorded per epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=batchsize,
    epochs=epochs,
    verbose=2,
)

In [63]:
# Training-loss curve, one point per epoch.
train_loss_per_epoch = history.history['loss']

fig, ax = plt.subplots()
ax.plot(train_loss_per_epoch, color='r', label="Train Loss")
ax.set_title("Train Loss")
ax.set_xlabel("Number of Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [64]:
# Training-accuracy curve, one point per epoch.
# Keras < 2.3 records this metric under 'acc'; newer releases report it under
# the name passed to compile() ('accuracy'), so use whichever key exists.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

plt.plot(history.history[acc_key], color='g', label="Train Accuracy")
plt.title("Train Accuracy")
plt.xlabel("Number of Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Report the loss recorded for the last training epoch.
final_train_loss = history.history['loss'][-1]
print('Train loss of the model: ', final_train_loss)

### Train loss of the model:  0.7547608720598656

In [65]:
# Report the accuracy recorded for the last training epoch.
# Keras < 2.3 records this metric under 'acc'; newer releases report it under
# the name passed to compile() ('accuracy'), so use whichever key exists.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
print('Train accuracy of the model: ', history.history[acc_key][-1])

### Train accuracy of the model:  0.7667284407270081

In [68]:

# os.listdir returns entries in arbitrary order; sort them so the test rows
# (and anything derived from them) come out in the same order on every run.
test = sorted(os.listdir("../input/humpback-whale-identification/test/"))
print(len(test))

In [69]:
# One row per test image file; 'Id' starts out as an empty string.
test_data = pd.DataFrame(test, columns=['Image']).assign(Id='')

In [70]:
# Load every test image, then divide the whole array by 255 in place
# (the same steps that are applied to the training images).
n_test_images = len(test_data)
X_test = prepareImages(test_data, n_test_images, "test")
X_test /= 255

In [73]:
# X_test is already a NumPy array (prepareImages builds it with np.zeros), so
# pass it directly; wrapping it in np.array() would only make a second full
# copy of the data in memory.
predictions = model.predict(X_test, verbose=1)