# Behold.ai Project
Multi-Label Image Classification

Data from RSNA Intracranial Hemorrhage Detection via Kaggle

In [1]:
#Mount Google Drive folder where data is stored
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
!ls
%cd gdrive/My Drive/Behold Project

Mounted at /content/gdrive
gdrive	sample_data
/content/gdrive/My Drive/Behold Project


# Data Preprocessing

In [63]:
#Import necessary libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.preprocessing import image
from os import listdir
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

#Function for loading dataset
def _load_image(path):
  '''Loads one image file into an array and divides the pixel values by 255.'''
  return image.img_to_array(image.load_img(path)) / 255


def load_dataset(dataset_dir, is_train=True, csv_filename=''):
  '''Loads every image of a dataset into one numpy array, with pixel values divided by 255.

  If is_train is True, the images to load are taken from the 'ID' column of the
  .csv file (each ID is expected to exist as '<ID>.png' in dataset_dir) and the
  target values are read from the same file. If is_train is False, every file
  found in dataset_dir is loaded and the filenames are returned instead, so
  predictions can be matched back to images.

  Args = dataset_dir(str)
        Where the dataset is stored (train or test)
       = is_train(boolean, True by default)
        Boolean switch to determine if a .csv file is read for given y values
       = csv_filename(str)
        Filename of the .csv holding the image IDs and target values
        (only used when is_train is True)

  Returns = if is_train is true:
            X (dataset of images as np array)
            y (np array of all .csv columns except the first two)
          = if is_train is false:
            X (dataset of images as np array)
            filenames (sorted list of filenames from dataset, same order as X)
  '''
  if is_train:
    data = pd.read_csv(csv_filename)
    # Iterate over the ID values themselves rather than positional labels, so
    # this does not depend on the dataframe having a default 0..n-1 index.
    image_paths = [dataset_dir + '/' + image_id + '.png' for image_id in data['ID']]
    X = np.array([_load_image(path) for path in tqdm(image_paths)])
    # The first two columns are not targets; everything after them is.
    y = np.array(data.drop(data.columns[[0, 1]], axis=1))
    return X, y

  # listdir gives no ordering guarantee; sorting keeps the test-set order
  # (and therefore the saved predictions) reproducible between runs.
  filenames = sorted(listdir(dataset_dir))
  X = np.array([_load_image(dataset_dir + '/' + name) for name in tqdm(filenames)])
  return X, filenames

In [62]:
#Read the labelled training images and report the array shapes as a sanity check
X, y = load_dataset(
    'train_images',
    is_train=True,
    csv_filename='behold_coding_challenge_train.csv',
)
for loaded_array in (X, y):
  print(loaded_array.shape)

#Read the unlabelled test images together with their filenames
test_set, filenames = load_dataset('test_images', is_train=False)
print(test_set.shape)

#Hold out 20% of the labelled data as a validation set (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

100%|██████████| 8542/8542 [00:11<00:00, 739.09it/s]
  2%|▏         | 67/4019 [00:00<00:06, 650.53it/s]

(8542, 128, 128, 3)
(8542, 3)


100%|██████████| 4019/4019 [00:04<00:00, 889.00it/s]


(4019, 128, 128, 3)


# Model Building and Training

In [64]:
#Simple CNN: three convolution blocks (conv -> max pooling -> dropout)
#followed by a three-layer dense head ending in one sigmoid unit per label
model = Sequential([
    #Convolution block 1 (also fixes the expected input size)
    Conv2D(filters=16, kernel_size=(5, 5), activation='relu', input_shape=(128, 128, 3)),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    #Convolution block 2
    Conv2D(filters=32, kernel_size=(5, 5), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    #Convolution block 3
    Conv2D(filters=64, kernel_size=(5, 5), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    #Dense head
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    #Sigmoid (not softmax) so each of the 3 labels gets an independent probability
    Dense(3, activation='sigmoid'),
])

model.summary()

#Binary cross-entropy treats every label as its own yes/no problem (multi-label)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_24 (Conv2D)           (None, 124, 124, 16)      1216      
_________________________________________________________________
max_pooling2d_19 (MaxPooling (None, 62, 62, 16)        0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 62, 62, 16)        0         
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 58, 58, 32)        12832     
_________________________________________________________________
max_pooling2d_20 (MaxPooling (None, 29, 29, 32)        0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 29, 29, 32)        0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 26912)            

In [65]:
#Fit the model on the training split, scoring against the validation split after every epoch
model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=20,
    validation_data=(X_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f2608358fd0>

# Model Evaluation

In [60]:
#Run some predictions on the test set
def model_predictor(test_set, filenames, is_bool=True, predictor=None):
  '''Runs the model on the test set and saves the per-label predictions to a .csv file.

  Each output row holds the filename followed by one column per label
  (epidural, intraparenchymal, subarachnoid).

  Args = test_set(np array)
        Test set of images
       = filenames(list of str)
        List of filenames from the test set so we can see which image has which prediction
       = is_bool(boolean, True by default)
        If True, predictions are thresholded at 0.5 and saved as 0/1 integers
        to "results_binary.csv"; if False, the raw model outputs are saved as
        floats to "results_probabilities.csv"
       = predictor(object with a .predict method, optional)
        Model used for the predictions. If None, the notebook-level `model`
        variable is used.

  Returns = Doesn't return anything, just saves the predictions as a .csv
  '''
  if predictor is None:
    # Fall back to the model trained earlier in the notebook.
    predictor = model

  pred = predictor.predict(test_set)
  if is_bool:
    # Strictly greater than 0.5 counts as a positive label.
    predictions = (pred > 0.5).astype(int)
  else:
    predictions = pred.astype(float)

  #Sort out columns: filenames first, then one column per label
  columns = ["epidural", "intraparenchymal", "subarachnoid"]
  results = pd.DataFrame(predictions, columns=columns)
  results.insert(0, "Filenames", filenames)

  output_name = "results_binary.csv" if is_bool else "results_probabilities.csv"
  results.to_csv(output_name, index=False)

#Save the test-set predictions twice: first as 0/1 labels, then as raw probabilities
for save_as_binary in (True, False):
  model_predictor(test_set, filenames, is_bool=save_as_binary)