# Referencias/Bibliografía

### Link: https://towardsdatascience.com/simple-image-data-augmentation-technics-to-mitigate-overfitting-in-computer-vision-2a6966f51af4
Técnicas de **data augmentation** para imágenes

### Link: https://albumentations.ai/docs/getting_started/image_augmentation/
Página oficial de la biblioteca **Albumentations**, utilizada para crear pipelines con operaciones aleatorias a realizar sobre las imágenes para generar un conjunto de datos más grande.

In [None]:
import numpy as np
import albumentations as A
import matplotlib.pyplot as plt

## Cargando las bases de datos

In [None]:
# Load, in this order, the training/validation inputs, their labels and the
# test inputs from the competition input directory.
x_train_valid, y_train_valid, x_test = (
    np.load(npy_path)
    for npy_path in (
        '../input/cnn-itba-2021-q2/X_train.npy',
        '../input/cnn-itba-2021-q2/y_train.npy',
        '../input/cnn-itba-2021-q2/X_test.npy',
    )
)

Veo algunas imágenes:


In [None]:
# Display a single training sample in a small figure, titled with its label.
sample_index = 37
plt.figure(figsize=(2, 2))
plt.imshow(x_train_valid[sample_index])
plt.title(str(y_train_valid[sample_index]))
plt.show()

# Preprocesamiento de entradas para ResNet50

In [None]:
from tensorflow.keras.applications.resnet import preprocess_input

In [None]:
# Run the ResNet input preprocessing on the train/validation pool first and
# on the test set second (the tuple is evaluated left to right).
x_train_valid, x_test = preprocess_input(x_train_valid), preprocess_input(x_test)

# Se separan conjuntos para entrenamiento y validación

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data for validation. The split is stratified
# on the labels and uses a fixed seed so it is reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_valid,
    y_train_valid,
    test_size=0.2,
    random_state=15,
    stratify=y_train_valid,
)

# Data Augmentation con Albumentations
Se aplica **data augmentation** utilizando la biblioteca Albumentations para aumentar el tamaño del conjunto de datos. El conjunto de validación permanece intacto para validar que la técnica tuvo buenos resultados sin contaminar los datos de dicho conjunto.

In [None]:
from tensorflow.keras.utils import Sequence

from albumentations import Compose, ToFloat, HorizontalFlip, VerticalFlip, Rotate, RandomSizedCrop, ShiftScaleRotate, GridDistortion
from albumentations import ElasticTransform, RandomBrightnessContrast

In [None]:

# This class was adapted from the Albumentations library documentation page.
class AugmentedSequence(Sequence):
    """Dataset generator that applies data augmentation on the fly.

    Each batch is produced by selecting samples of ``x`` and ``y`` through an
    internal index array and passing every selected ``x`` sample through the
    ``augmentation`` callable.
    """

    def __init__(self, x, y, batch_size, augmentation, shuffle=True):
        """Create the augmented sequence.

        @param x            Input samples; indexed with an integer index
                            array, so it must support NumPy fancy indexing.
        @param y            Targets aligned with ``x``; indexed the same way.
        @param batch_size   Number of samples per batch.
        @param augmentation Callable invoked as ``augmentation(image=sample)``
                            that returns a mapping with an ``'image'`` entry.
        @param shuffle      When truthy, the sample order is reshuffled every
                            time ``on_epoch_end`` runs.
        """
        # Let the Keras base class set up whatever state it needs.
        super().__init__()

        # Save internal parameters of the augmented sequence
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.augmentation = augmentation
        self.shuffle = shuffle

        # Build the initial index order (shuffled when requested).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches in one epoch.

        The count is rounded up so the trailing partial batch is served
        instead of being silently dropped, and so a dataset smaller than
        ``batch_size`` still yields one batch rather than none.
        """
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, index):
        """Return the augmented batch at position ``index``.

        @param index Batch number within the current epoch order.
        @return Tuple ``(inputs, targets)`` of NumPy arrays.
        """
        # Positions of the samples that belong to this batch.
        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]

        # Extract the input and output batch from the original dataset.
        batch_x = self.x[batch_indexes]
        batch_y = self.y[batch_indexes]

        # Augment every input sample independently; targets are untouched.
        augmented_x = np.array([
            self.augmentation(image=sample)['image'] for sample in batch_x
        ])
        return augmented_x, np.array(batch_y)

    def on_epoch_end(self):
        """Rebuild the sample order; reshuffle it when shuffling is enabled."""
        self.indexes = np.arange(len(self.x))
        # Truthiness check so flags such as 1 or np.True_ also enable shuffling.
        if self.shuffle:
            np.random.shuffle(self.indexes)


In [None]:
# Random augmentation pipeline: each transform is applied independently with
# the probability given by its own `p`.
augmentation_pipeline = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=30, p=0.5),
    HorizontalFlip(p=0.5),
    VerticalFlip(p=0.5),
    GridDistortion(p=0.2),
    ElasticTransform(p=0.2),
    RandomBrightnessContrast(p=0.2),
])

# Build the augmented data sequence over the training split, using batches of
# 40,000 images.
album_generator = AugmentedSequence(
    x=x_train,
    y=y_train,
    batch_size=40000,
    augmentation=augmentation_pipeline,
)

# Incluyo modelos

In [None]:
from tensorflow.keras.layers import Dense, Flatten, Activation, BatchNormalization, Dropout, Conv2D, MaxPooling2D, InputLayer, AveragePooling2D, Input, UpSampling2D, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow import keras

## ResNet 50

In [None]:
from tensorflow.keras.applications.resnet import ResNet50 

In [None]:
# Instantiate ResNet50 with ImageNet weights and without its top
# (include_top=False), for 256x256 RGB inputs.
# NOTE(review): `classes` is presumably ignored when include_top=False —
# confirm against the Keras applications documentation.
resnet_model = ResNet50 (weights="imagenet", include_top=False, input_shape=(256,256,3), classes=100)

In [None]:
# Print the summary of the ResNet50 backbone.
resnet_model.summary()

In [None]:
# Only the batch-normalization layers of the backbone remain trainable;
# every other layer is frozen.
for layer in resnet_model.layers:
    layer.trainable = isinstance(layer, BatchNormalization)

In [None]:
# Assemble the full classifier: upsampling -> ResNet50 backbone -> dense head.
model = Sequential()

# Three successive upsampling layers so the input size matches the one the
# ResNet backbone was built with.
# NOTE(review): assumes 32x32 inputs and the default 2x factor per layer
# (8x overall -> 256x256) — confirm against the dataset shape.
model.add(UpSampling2D())
model.add(UpSampling2D())
model.add(UpSampling2D())

# ResNet backbone
model.add(resnet_model)

# Global average pooling over the backbone feature maps, then normalization
model.add(GlobalAveragePooling2D())
model.add(BatchNormalization())

# Fully connected output network
# Layer 1
model.add(Dense(units=1024))
model.add(BatchNormalization())
model.add(Activation('elu'))
model.add(Dropout(0.4))

# Layer 2
model.add(Dense(units=1024))
model.add(BatchNormalization())
model.add(Activation('elu'))
model.add(Dropout(0.4))

# Softmax output layer (100 units)
model.add(Dense(units=100))
model.add(BatchNormalization())
model.add(Activation('softmax'))


# Compile the model; the sparse loss takes integer class labels.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy']
             )

In [None]:
# Checkpoint callback: keep on disk only the model with the best validation
# accuracy seen so far.
mc_callback = ModelCheckpoint('model_eff.hdf5',
                              monitor='val_accuracy',
                              save_best_only=True,
                              verbose=0,
                              mode='max'
                             )

# Train the model
epochs = 5
batch_size = 64
augmented_factor = 5  # number of freshly augmented copies to train on
for i in range(augmented_factor):
    # The loop runs exactly `augmented_factor` times, so that is the total.
    print(f'******** Iteration {i+1} of {augmented_factor} ********')
    # Indexing the generator re-runs the random augmentation pipeline, so each
    # iteration gets a newly augmented version of the same batch of samples.
    batch_x, batch_y = album_generator[0]
    model.fit(batch_x,
              batch_y,
              validation_data=(x_valid, y_valid),
              callbacks=[mc_callback],
              batch_size=batch_size,
              epochs=epochs
              )

In [None]:
# Reload the checkpointed model from disk and report its metrics.
model = keras.models.load_model('model_eff.hdf5')

# evaluate() yields a (loss, accuracy) pair here; only the accuracy is used.
_, train_acc = model.evaluate(x_train, y_train, verbose=0)
_, valid_acc = model.evaluate(x_valid, y_valid, verbose=0)

# Show the results rounded to three decimals.
train_acc_rounded = round(train_acc, 3)
valid_acc_rounded = round(valid_acc, 3)
print(f'[Accuracy] Train: {train_acc_rounded} Valid: {valid_acc_rounded}')

In [None]:
def generate_submission(predictions, filepath='submission.csv'):
    """Write the challenge submission CSV.

    The predictions are stored in a single ``label`` column and the
    DataFrame index is written as a column named ``Id``.

    @param predictions Predictions made by the model on the test dataset
    @param filepath Destination path of the generated file
    """
    submission = pd.DataFrame(predictions, columns=['label']).rename_axis('Id')
    submission.to_csv(filepath)

In [None]:
import pandas as pd

In [None]:
# Predict over the test set and keep, for each sample, the index of the
# highest-scoring output along the last axis.
class_scores = model.predict(x_test)
y_pred = class_scores.argmax(axis=-1)

# Write the submission file.
generate_submission(y_pred, filepath='submissiona2.csv')