# Experimento 8: Ensamblaje de modelos.

En este experimento se estudiarán los beneficios de usar modelos ensamblados. La idea inicial surge de (Harangi et al., 2018) donde se propone un ensamblaje formado por AlexNet, VGGNet y GoogLeNet. 

Los resultados del estudio incentivan el uso de esta estrategia para conseguir precisión extra.

La hipótesis que se pretende validar en este estudio es:

- La mejora de la precisión derivada del ensamblaje de modelos justifica el incremento de parámetros.

## Librerías usadas.

In [1]:
import tensorflow as tf

# List the GPUs TensorFlow can see and let each one allocate memory on demand.
# Looping over the list (instead of indexing gpus[0]) keeps the cell runnable on
# CPU-only machines and applies the same setting to every detected GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import math 
from glob import glob
from matplotlib import pyplot as plt
import os
from tqdm import tqdm
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample

## Definición de rutas

In [3]:
# Data locations.
# data_dir is the folder that holds the metadata CSV; the relative path is
# resolved against the current working directory.
data_dir = os.path.dirname(os.path.realpath("../TFG/Datos/HAM10000_metadata.csv"))
csv_path = os.path.realpath(os.path.join(data_dir, "HAM10000_metadata.csv"))

# Global settings: input image height/width (pixels) and number of classes.
altura, longitud = 128, 128
clases = 7

# Show the resolved locations, one per line.
print(data_dir, csv_path, sep="\n")



/home/antgarnie/Escritorio/TFG/Datos
/home/antgarnie/Escritorio/TFG/Datos/HAM10000_metadata.csv


## Creación del marco de datos.

In [4]:
# Load the metadata table.
dataFrame = pd.read_csv(csv_path)

# Index every file found exactly one folder level below data_dir by its base
# name without extension, so it can be matched against the image_id column.
all_image_path = glob(os.path.join(data_dir, '*', '*'))
imageid_path_dict = dict(
    (os.path.splitext(os.path.basename(image_file))[0], image_file)
    for image_file in all_image_path
)

# Diagnosis code (dx column) -> readable lesion name.
lesion_type_dict = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis ',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

# Extra columns to make the frame easier to read:
#   path          -> file located for the image_id (None when no file matched)
#   cell_type     -> readable lesion name for the dx code
#   cell_type_idx -> integer category code of cell_type, used as the label
path_of = imageid_path_dict.get
name_of = lesion_type_dict.get
dataFrame['path'] = dataFrame['image_id'].map(path_of)
dataFrame['cell_type'] = dataFrame['dx'].map(name_of)
dataFrame['cell_type_idx'] = pd.Categorical(dataFrame['cell_type']).codes
dataFrame.head()


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/home/antgarnie/Escritorio/TFG/Datos/HAM10000_...,Benign keratosis,2
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/home/antgarnie/Escritorio/TFG/Datos/HAM10000_...,Benign keratosis,2
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/home/antgarnie/Escritorio/TFG/Datos/HAM10000_...,Benign keratosis,2
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/home/antgarnie/Escritorio/TFG/Datos/HAM10000_...,Benign keratosis,2
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/home/antgarnie/Escritorio/TFG/Datos/HAM10000_...,Benign keratosis,2


## Preparación de la red



In [5]:
def select_network(nn_base_arch):
    """Return a pretrained convolutional base from ``tf.keras.applications``.

    The network is created with ``weights='imagenet'``, ``include_top=False``
    and an input shape of ``(altura, longitud, 3)`` taken from the notebook
    globals.

    Parameters
    ----------
    nn_base_arch : str
        Short architecture name. One of 'VGG16', 'VGG19', 'MNv1', 'MNv2',
        'IV3', 'Xception', 'ENB4', 'ResNet50', 'ResNet152v2'.

    Returns
    -------
    The instantiated Keras model for the requested architecture.

    Raises
    ------
    ValueError
        If ``nn_base_arch`` is not one of the supported names.
    """
    # Short experiment name -> constructor name inside tf.keras.applications.
    # Names are resolved lazily so only the requested constructor is looked up.
    constructors = {
        # VGG family
        'VGG16': 'VGG16',
        'VGG19': 'VGG19',
        # MobileNet family
        'MNv1': 'MobileNet',
        'MNv2': 'MobileNetV2',
        # Author's note: needs inputs larger than 75 x 75
        'IV3': 'InceptionV3',
        # Author's note: needs inputs larger than 72 x 72
        'Xception': 'Xception',
        'ENB4': 'EfficientNetB4',
        'ResNet50': 'ResNet50',
        'ResNet152v2': 'ResNet152V2',
    }

    if nn_base_arch not in constructors:
        raise ValueError(
            "Unknown base architecture {!r}; expected one of: {}".format(
                nn_base_arch, ", ".join(constructors)
            )
        )

    constructor = getattr(tf.keras.applications, constructors[nn_base_arch])
    return constructor(weights='imagenet', include_top=False, input_shape=(altura, longitud, 3))

def build(nn):
    """Stack a small classification head on top of a convolutional base.

    The resulting Sequential model is: ``nn`` -> Flatten -> Dense(128) ->
    PReLU -> Dense(clases, softmax), where ``clases`` is the notebook-level
    number of classes. The model summary is printed as a side effect.

    Parameters
    ----------
    nn : Keras model/layer used as the first stage (e.g. the value returned by
        ``select_network``).

    Returns
    -------
    The assembled (uncompiled) ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        nn,
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128),
        tf.keras.layers.PReLU(),
        tf.keras.layers.Dense(clases, activation='softmax'),
    ])

    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line after the table).
    model.summary()

    return model

In [6]:
# First ensemble branch: InceptionV3 base wrapped with the head added by build().
nn_base_arch = 'IV3'
nn = select_network(nn_base_arch)
modelIV3 = build(nn)

# Second ensemble branch: VGG16 base with the same head.
# `nn_base_arch` and `nn` are reused, so after this cell they refer to VGG16.
nn_base_arch = 'VGG16'
nn = select_network(nn_base_arch)
modelVGG = build(nn)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inception_v3 (Model)         (None, 2, 2, 2048)        21802784  
_________________________________________________________________
flatten (Flatten)            (None, 8192)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               1048704   
_________________________________________________________________
p_re_lu (PReLU)              (None, 128)               128       
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 903       
Total params: 22,852,519
Trainable params: 22,818,087
Non-trainable params: 34,432
_________________________________________________________________
None
Model: "sequential_1"
_________________________________________________________________
Layer (type)

## Creación del modelo

In [18]:
def ensembled_models(modelIV3,modelVGG,clases = 7,input_shape=(128, 128, 3)):
    """Join two models into a single network that shares one image input.

    Both sub-models receive the same input tensor; their outputs are
    concatenated and passed through Flatten -> Dense(128) -> Dropout(0.5) ->
    Dense(128) -> Dense(clases, softmax).

    Parameters
    ----------
    modelIV3, modelVGG : callable Keras models applied to the shared input.
    clases : int
        Number of units of the final softmax layer.
    input_shape : tuple
        Shape of a single input image (without the batch dimension).

    Returns
    -------
    An uncompiled ``tf.keras.models.Model`` mapping the image to the softmax output.

    NOTE(review): no ``activation`` argument is passed to the two Dense(128)
    layers -- confirm this is intended.
    """
    layers = tf.keras.layers
    image = layers.Input(shape=input_shape)

    # Run both branches on the very same input tensor.
    iv3_out = modelIV3(image)
    vgg_out = modelVGG(image)

    # Change this merge layer depending on the case under study.
    merged = layers.concatenate([iv3_out, vgg_out])

    # Joint head on top of the merged branch outputs.
    hidden = layers.Flatten()(merged)
    hidden = layers.Dense(128)(hidden)
    hidden = layers.Dropout(0.5)(hidden)
    hidden = layers.Dense(128)(hidden)
    probabilities = layers.Dense(clases, activation='softmax')(hidden)

    return tf.keras.models.Model(inputs=image, outputs=probabilities)

In [60]:
# Assemble the two-branch ensemble from the models built earlier.
# NOTE(review): x_train, x_validation, x_test (and their labels) and BATCH_SIZE
# are defined in cells that appear further down in this notebook; those cells
# must have been executed before this one.
ensemble = ensembled_models(modelIV3, modelVGG, clases=7, input_shape=(128, 128, 3))

# Adam optimizer plus early stopping once val_loss has not improved for 3 epochs.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3)
ensemble.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# `model` is bound to the History object returned by fit(); the trained
# network itself stays reachable as `model.model` (the same object as `ensemble`).
model = ensemble.fit(
    x_train,
    y_train,
    validation_data=(x_validation, y_validation),
    epochs=20,
    batch_size=BATCH_SIZE,
    callbacks=[earlyStopping],
)
evaluation = model.model.evaluate(x_test, y_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


## Método de balanceo de datos

In [7]:
def balanced_dataset(df, n_samples=10, random_state=123):
    """Return a frame with the same number of rows for every class.

    For each distinct value of ``df['cell_type_idx']`` the matching rows are
    resampled *with replacement* to exactly ``n_samples`` rows, and the
    per-class samples are concatenated in order of first appearance. Every
    class gets the same fixed size; it is not matched to the majority class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cell_type_idx`` column.
    n_samples : int, default 10
        Number of rows drawn per class.
    random_state : int, default 123
        Seed passed to ``resample`` for every class, for reproducible draws.

    Returns
    -------
    pandas.DataFrame
        The concatenated per-class samples (original index labels are kept,
        so they may repeat). Empty if ``df`` has no classes.
    """
    per_class = [
        resample(df[df['cell_type_idx'] == cat],
                 replace=True,               # sample with replacement
                 n_samples=n_samples,        # same fixed size for every class
                 random_state=random_state)  # reproducible results
        for cat in df['cell_type_idx'].unique()
    ]

    if not per_class:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_class)

def load_img_data(size, df, balanced=False):
    """Read the images listed in ``df['path']`` as a float array scaled to [0, 1].

    Each image is read with ``cv2.imread``, resized to ``size`` x ``size``,
    converted to float32 and divided by 255. The shape of the stacked array is
    printed as a side effect.

    Parameters
    ----------
    size : int
        Target height and width in pixels.
    df : pandas.DataFrame
        Must contain ``path`` and ``cell_type_idx`` columns.
    balanced : bool, default False
        When True, ``df`` is first passed through ``balanced_dataset``.

    Returns
    -------
    (imgs, labels)
        ``imgs`` is the stacked image array (images along axis 0) and
        ``labels`` is ``df['cell_type_idx'].values`` for the same rows.

    Raises
    ------
    FileNotFoundError
        If a path is missing or the file cannot be decoded as an image.

    NOTE(review): images are kept in the channel order produced by
    ``cv2.imread``; confirm this matches what the pretrained bases expect.
    """
    if balanced:
        df = balanced_dataset(df)

    imgs = []
    for image_path in tqdm(list(df['path'])):
        # cv2.imread signals failure by returning None instead of raising, so
        # check explicitly and report which file caused the problem.
        img = cv2.imread(image_path) if isinstance(image_path, str) else None
        if img is None:
            raise FileNotFoundError("Could not read image: {!r}".format(image_path))

        img = cv2.resize(img, (size, size))
        imgs.append(img.astype(np.float32) / 255.)

    imgs = np.stack(imgs, axis=0)
    print(imgs.shape)

    return imgs, df['cell_type_idx'].values

## Cargamos los datos y creamos los casos a experimentar.

In [8]:
def load_general_data(random_state=None):
    """Load the balanced image set and split it into three partitions.

    The data returned by ``load_img_data(altura, dataFrame, balanced=True)`` is
    split twice with ``train_test_split``:

    1. 60% goes to the *source* partition, 40% is kept aside.
    2. The kept 40% is split again: 30% of it for training, 70% for testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default None
        Forwarded to both ``train_test_split`` calls. ``None`` keeps the
        non-deterministic shuffling; pass an int for reproducible splits.

    Returns
    -------
    (source_data, train_data, test_data)
        Each element is a two-item list ``[x, y]`` with images and labels.
    """
    imgs, target = load_img_data(altura, dataFrame, balanced=True)

    # First split: the larger (60%) part becomes the source partition.
    x_rest, x_source, y_rest, y_source = train_test_split(
        imgs, target, test_size=0.60, random_state=random_state)

    # Second split of the remaining 40%: 30% train / 70% test.
    x_train, x_test, y_train, y_test = train_test_split(
        x_rest, y_rest, test_size=0.70, random_state=random_state)

    source_data = [x_source, y_source]
    train_data = [x_train, y_train]
    test_data = [x_test, y_test]

    return source_data, train_data, test_data


def get_data_for_ex(source_data,train_data,test_data):
    """Unpack the three partitions into the arrays used by an experiment.

    Parameters
    ----------
    source_data, train_data, test_data : sequence ``[x, y]``
        Images and labels of each partition (as returned by
        ``load_general_data``). ``x`` and ``y`` must support ``len`` and slicing.

    Returns
    -------
    tuple
        ``(x_train, x_retrain, x_test, x_validation,
        y_train, y_retrain, y_test, y_validation)`` where

        * ``*_train`` come from ``source_data``,
        * ``*_retrain`` come from ``train_data``,
        * ``*_validation`` are the first 30% (rounded down) of ``test_data``,
        * ``*_test`` are all the remaining samples of ``test_data``.
    """
    x_train, y_train = source_data[0], source_data[1]
    x_retrain, y_retrain = train_data[0], train_data[1]

    # Number of leading test samples reserved for validation (30%, floored).
    percent = math.floor(len(test_data[0])/100*30)

    x_validation = test_data[0][0:percent]
    y_validation = test_data[1][0:percent]

    # Slice to the end: a "-1" upper bound would silently drop the last sample.
    x_test = test_data[0][percent:]
    y_test = test_data[1][percent:]

    return x_train,x_retrain,x_test,x_validation,y_train,y_retrain,y_test,y_validation


###############################################################################################################
# We define 7 experiments, each with a different optimizer, and set the number of iterations.                 #
# NOTE(review): only the Adam optimizer is configured in the cells shown here -- confirm the "7 experiments". #
###############################################################################################################

# Presumably the number of repetitions per experiment; not referenced in the cells shown here -- TODO confirm.
ITERATIONS_PER_EXP = 5
# Mini-batch size; passed as batch_size to fit() in the training cell.
BATCH_SIZE = 16
# Epoch count and learning rate defaults. NOTE(review): the training cell
# hard-codes the same values (20 and 0.0001) rather than reading these names.
EPOCHS = 20
LEARNING_RATE=0.0001


def set_hiper_to_exp(BATCH_SIZE,EPOCHS,LEARNING_RATE):
    """Bundle the hyper-parameters of an experiment with a fresh optimizer.

    Parameters
    ----------
    BATCH_SIZE, EPOCHS :
        Returned unchanged.
    LEARNING_RATE :
        Learning rate given to the Adam optimizer (created with ``amsgrad=True``).

    Returns
    -------
    tuple
        ``(BATCH_SIZE, EPOCHS, optimizer)``.
    """
    adam = tf.keras.optimizers.Adam(amsgrad=True, learning_rate=LEARNING_RATE)
    return (BATCH_SIZE, EPOCHS, adam)

In [8]:
# Load the balanced images and split them into source / train / test partitions.
source_data,train_data,test_data = load_general_data()
# Unpack the partitions into the notebook-level arrays used for training,
# validation and final evaluation.
x_train,x_retrain,x_test,x_validation,y_train,y_retrain,y_test,y_validation = get_data_for_ex(source_data,train_data,test_data)

100%|██████████| 70/70 [00:00<00:00, 146.00it/s]

(70, 128, 128, 3)



