# Ensemble of Convolutional Neural Networks for classification of masses and calcifications in mammogram images

Fabio D'Onofrio

matricola : 556505

Università degli studi di Pisa

Progetto del corso di Computational Intelligence

This notebook includes *two sections*.

The *first section* sets the Drive directory in which the mammogram images (both training set and test set) are stored, already preprocessed and patched, as numpy arrays with float values in the range 0-65535, together with their labels (0 corresponds to masses, while 1 indicates calcifications). In this section the required modules are also imported and some functions are defined so that they can be used in the second section.

In the *second section* the predictions of the best trained models are combined through a standard average and also through a weighted average that uses the validation accuracies as weights.

# SECTION 1 : Set the environment and download the data

In [0]:
#@title Set the working directory
from google.colab import drive
# Absolute path at which Google Drive is mounted inside the Colab VM.
drive_mount_point = '/content/drive/'
drive.mount(drive_mount_point, force_remount=True)

# Project folder on the mounted Drive. It is derived from the mount point so
# that it stays valid even if the working directory is not '/content'.
data_dir = drive_mount_point + 'My Drive/Computational Intelligence 2019/FinalProject/'

Mounted at /content/drive/


In [0]:
#@title Import modules
#@markdown --- **os** for operating system dependent functionality
import os
#@markdown --- **json** to use JSON data-interchange format
import json
#@markdown --- **numpy** to handle multidimensional data
import numpy as np
#@markdown --- **matplotlib** for figures
import matplotlib.pyplot as plt
#@markdown --- **keras** deep learning library
from   keras import models
from   keras import layers
from   keras.callbacks import EarlyStopping
from   keras.callbacks import ModelCheckpoint
from   keras.preprocessing.image import ImageDataGenerator
from   keras.models import load_model
from   keras import optimizers as opt
#@markdown --- **random** to generate pseudo-random numbers
import random as rn
from sklearn.metrics import log_loss

Using TensorFlow backend.


In [0]:
#@title Class data_setting 
class data_setting :
    """Holds the training/test images and labels and prepares them for the CNNs.

    Attributes
    ----------
    train_data, test_data : numpy arrays of images, one image per entry along
        the first axis.
    train_labels, test_labels : numpy arrays with one label per image.
    """
#@markdown ---Member variables: **train_data, train_labels, test_data, test_labels** 
    def __init__(self,train_data,train_labels,test_data,test_labels):
        """Store the four provided numpy arrays without copying or modifying them."""
        self.train_data   = train_data
        self.train_labels = train_labels
        self.test_data    = test_data
        self.test_labels  = test_labels
#@markdown ---Method **shuffleDataset()** shuffles the data set
    def shuffleDataset(self):
        """Shuffle the training images and their labels with the same permutation.

        The permutation is drawn with the `random` module (`rn`), so it is
        controlled by `rn.seed`. The test set is left untouched.
        """
        # Shuffle indices rather than (image, label) pairs: stacking images and
        # scalar labels into one array would produce a ragged array.
        order = list(range(len(self.train_data)))
        rn.shuffle(order)

        self.train_data   = np.asarray(self.train_data)[order]
        self.train_labels = np.asarray(self.train_labels)[order]
#@markdown ---Method **data_manipulation()** reshapes and normalizes training and test sets
    def data_manipulation(self, bits_per_pixel=16) :
        """Append a channel axis to the images and rescale them to [0, 1].

        Parameters
        ----------
        bits_per_pixel : int, optional
            Bit depth of the stored pixel values; images are divided by
            2**bits_per_pixel - 1 (65535 for the default of 16).

        Notes
        -----
        The training set is NOT shuffled here; call `shuffleDataset()`
        explicitly if a shuffle is wanted. The method is not idempotent: each
        call adds a new trailing axis and rescales again.
        """
        max_pixel_value = 2**bits_per_pixel - 1

        # Trailing axis of size 1 = single (grey-scale) channel.
        self.train_data = self.train_data.reshape(np.shape(self.train_data)+(1,))
        self.train_data = self.train_data.astype('float32')/max_pixel_value

        self.test_data  = self.test_data.reshape(np.shape(self.test_data)+(1,))
        self.test_data  = self.test_data.astype('float32')/max_pixel_value
                
##############################################################################
        

In [0]:
#@title Other functions used to search for the best model

#@markdown --- **build_dict()** loads all previous trained CNN's dictionaries and stores them in a single variable    
def build_dict(models_dir,models_folder_name) :
  """Collect the saved dictionaries of all previously trained models.

  Every sub-directory of `models_dir` whose name contains
  `models_folder_name` is scanned for files whose name contains
  'Model_dict_'; each such file is parsed as JSON.

  Parameters
  ----------
  models_dir : str
      Directory that contains the model folders.
  models_folder_name : str
      Substring that a folder name must contain to be scanned.

  Returns
  -------
  dict
      Maps 'mod_<folder suffix>_<file suffix>' to the loaded dictionary.
      Each dictionary gets an extra 'path' entry with 'model_path' (the
      folder, with a trailing '/') and 'model_number' (the part of the file
      name that follows 'Model_dict_').
  """
  dict_prefix = 'Model_dict_'
  models_dictionaries = {}

  for models_folder in os.listdir(models_dir) :

    folder_path = os.path.join(models_dir, models_folder)
    # Skip non-matching names and plain files that merely match the pattern.
    if models_folder_name not in models_folder or not os.path.isdir(folder_path) :
      continue

    for files in os.listdir(folder_path) :

      if dict_prefix not in files :
        continue

      # Context manager so the file handle is closed right after parsing.
      with open(os.path.join(folder_path, files)) as dict_file :
        mod_dict = json.load(dict_file)

      mod_key = 'mod_'+models_folder[models_folder.find('_')+1:] + '_' + files[files.find('_')+1:]
      # Everything after the 'Model_dict_' marker identifies the model.
      model_number = files[files.find(dict_prefix)+len(dict_prefix):]
      mod_dict['path'] = { 'model_path' : folder_path + '/' , 'model_number' : model_number }
      models_dictionaries[mod_key] = mod_dict

  return models_dictionaries
#@markdown ---  **best_acc_model()** returns the best trained CNNs
def best_acc_model(models_dictionaries , n ) :
  """Return the dictionaries of the `n` models with the highest test accuracy.

  Parameters
  ----------
  models_dictionaries : dict
      Maps a model key to its dictionary; each dictionary must provide
      ['mod_history']['test_acc'].
  n : int
      Number of models to return.

  Returns
  -------
  list
      At most `n` model dictionaries, ordered by increasing test accuracy
      (the best model is the last element). Empty when `n <= 0`.
  """
  # Without this guard the slice below would be [-0:], i.e. *every* model.
  if n <= 0 :
    return []

  ranked_keys = sorted(models_dictionaries,
                       key=lambda name: models_dictionaries[name]['mod_history']['test_acc'])

  return [models_dictionaries[name] for name in ranked_keys[-n:]]

#@markdown --- **load_best_models()** loads the n best CNNs
def load_best_models(folder , n) :
    """Load the Keras models of the `n` best trained CNNs.

    The candidate models are those returned by `build_dict(data_dir, folder)`
    and they are ranked with `best_acc_model`, so the returned list has the
    same order as `best_acc_model(build_dict(data_dir, folder), n)`.

    Parameters
    ----------
    folder : str
        Name (or name fragment) of the folders, inside `data_dir`, that hold
        the trained models.
    n : int
        Number of models to load.

    Returns
    -------
    list
        The loaded models, read from '<model_path>model_<model_number>.h5'.
    """
    models_dict = build_dict(data_dir,folder)
    best_models_h5 = []

    for mod in best_acc_model(models_dict , n ) :
        location = mod['path']
        # Use the folder recorded for this specific model: build_dict may have
        # matched several folders whose names contain `folder`.
        model_file = location['model_path']+'model_'+str(location['model_number'])+'.h5'
        best_models_h5.append(load_model(model_file))

    return best_models_h5

##############################################################################  

In [0]:
#@title Download data
############################################################################################################################################################
#@markdown --- **train_images_150** numpy array of shape (training batch size,width,height)=(2864,150,150)
#@markdown --- **train_labels** numpy array of shape (batch size,)=(2864,)
#@markdown --- **test_images_150** numpy array of shape (test set size,width,height)=(352,150,150)
#@markdown --- **test_labels** numpy array of shape (test set size,)=(352,)
# The four arrays are read from data_dir in this order:
#   150x150 training images (values in [0,2^16 -1]), training labels,
#   150x150 test images (values in [0,2^16 -1]), test labels.
train_images_150, train_labels, test_images_150, test_labels = (
    np.load(data_dir + array_file)
    for array_file in (
        'train_img_150.npy',
        'train_lab.npy',
        'public_test_image_150.npy',
        'public_test_label.npy',
    )
)


# SECTION 2 : Ensemble of Neural Networks

In [0]:
#@title Ensamble network will be composed of some of the best trained from scratch CNNs and the pre-trained CNN 

def _evaluate_ensemble(predictions, weights, labels) :
  """Combine the models' predictions with a weighted average and score the result.

  Parameters
  ----------
  predictions : list of numpy arrays, one per model, all with the same shape.
      Assumes each array holds one probability per test sample, e.g. shape
      (n_samples, 1) - TODO confirm against the models' output layers.
  weights : list of numbers, one per model (all ones for a plain average).
  labels : array of the true 0/1 labels.

  Returns
  -------
  (probabilities, hard_labels, loss, accuracy)
      The averaged probabilities, their 0/1 thresholding (list of ints),
      the binary cross entropy and the fraction of correct labels.
  """
  combined = 0
  for model_prediction, weight in zip(predictions, weights) :
    combined = combined + model_prediction*weight
  combined = combined/sum(weights)

  loss = log_loss(labels, combined)
  # A sample is labelled 1 only when its probability is strictly above 0.5
  # (same outcome as Python's round(), which rounds exactly 0.5 down to 0).
  hard_labels = (np.ravel(combined) > 0.5).astype(int).tolist()
  accuracy = np.sum(np.equal(hard_labels, labels))/len(labels)

  return combined, hard_labels, loss, accuracy


data_set = data_setting(train_images_150,train_labels,test_images_150,test_labels)
data_set.data_manipulation()
#@markdown Set the name of the folders in which best trained from scratch models have been stored
best_models_folder_name = 'Best_TrainedModels' #@param {type:"string"}

models_dictionaries = build_dict(data_dir,best_models_folder_name)

#@markdown Set how many of the best models trained from scratch must be used in the Ensemble Network
n_bests = 3 #@param {type : "integer" }

# NOTE(review): the models are selected by their *test* accuracy and the
# ensemble is scored on the same test set below, so the reported figures are
# likely optimistic - consider selecting on validation accuracy instead.
best_models_dict = best_acc_model(models_dictionaries , n_bests )

# Same ranking as best_models_dict, so dictionaries and models line up by index.
best_models_h5 = load_best_models(best_models_folder_name , n_bests)

#@markdown Set the name of the folder in which the best pre-trained CNN has been saved
pt_folder = 'final_PT' #@param {type: "string" }

pt_model = load_model(data_dir+pt_folder+'/best_model.h5')

#@markdown Set the name of the folder in which the CNN models of the Ensamble Network will be stored
ensamble_folder = 'Ensamble' #@param {type : "string" }
os.makedirs(data_dir+ensamble_folder+'/', exist_ok=True)
pt_model.save(data_dir+ensamble_folder+'/PT_model.h5')
with open(data_dir+pt_folder+'/Model_dict') as pt_dict_file :
  pt_model_dict = json.load(pt_dict_file)

# The pre-trained network is fed 3-channel images: replicate the single
# grey-scale channel three times along the last axis.
three_channels_test_data = np.repeat(data_set.test_data, 3, axis=-1)
pt_predictions = pt_model.predict(three_channels_test_data)

models_predictions = []
for model_index, mods in enumerate(best_models_h5, start=1) :
  # The file-name spelling is kept as-is so previously saved files keep their names.
  mods.save(data_dir+ensamble_folder+'/Scracth_Model_'+str(model_index)+'.h5')
  models_predictions.append(mods.predict(data_set.test_data))

# The pre-trained model is the last member of the ensemble.
models_predictions.append(pt_predictions)
ensemble_models_dict = best_models_dict + [pt_model_dict]

# Weight of each model = last validation accuracy recorded in its history.
best_models_val_acc = [ mod_dict['mod_history']['val_acc'][-1] for mod_dict in ensemble_models_dict ]

(weighted_average_prediction, weighted_average_labels,
 weighted_loss, weighted_accuracy) = _evaluate_ensemble(models_predictions, best_models_val_acc, test_labels)
print('Binary Cross Entropy loss of the weighted average predictions is : ' , weighted_loss)
print('Weighted average accuracy is : ' , weighted_accuracy)

# Plain average = weighted average with every weight equal to 1.
(average_prediction, average_labels,
 average_loss, accuracy) = _evaluate_ensemble(models_predictions, [1]*len(models_predictions), test_labels)
print('Binary Cross Entropy loss of the average predictions is : ' , average_loss)
print('Average accuracy is : ' ,accuracy)


Binary Cross Entropy loss of the weighted average predictions is :  0.2636623165245427
Weighted average accuracy is :  0.9034090909090909
Binary Cross Entropy loss of the average predictions is :  0.26416203314593906
Average accuracy is :  0.9005681818181818
