In [1]:
# Imports
import utils
from utils import HELOC_NAME, ADULT_INCOME_NAME, HIGGS_NAME, COVERTYPE_NAME, CALIFORNIA_HOUSING_NAME, ARBOVIRUSES_NAME
import tensorflow as tf
import os
import pandas as pd
import cv2
import numpy as np
import concurrent.futures
from tensorflow.keras.utils import to_categorical

In [2]:
# Hyperparameters
# Dropout rate applied after every convolutional block in build_model.
DROPOUT_VALUE = 0.5

# Mini-batch size passed to model.fit, looked up by dataset name in train_with_dataset.
BATCH_SIZES = {
    HELOC_NAME: 64,
    ADULT_INCOME_NAME: 64, 
    HIGGS_NAME: 4096,
    COVERTYPE_NAME: 256,
    CALIFORNIA_HOUSING_NAME: 64, 
    ARBOVIRUSES_NAME: 64,
}
# Maximum number of training epochs; train_model additionally installs early stopping on val_loss.
EPOCHS = 250

# NOTE: the metric objects below are instantiated once, when this cell runs.
# build_model passes these same list objects to every model it compiles, so the
# metric instances are shared between all models built in this session.

# Metrics used when the model has a single sigmoid output (n_classes == 2).
METRICS_CLASSIFICATION_BINARY = [
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
    tf.keras.metrics.BinaryAccuracy()
]

# Metrics used when the model has a softmax output over more than two classes.
METRICS_CLASSIFICATION_MULTICLASS = [
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
    tf.keras.metrics.CategoricalAccuracy()
]

# Metrics used when the model has a single linear output (regression datasets).
METRICS_REGRESSION = [
    tf.keras.metrics.MeanSquaredError(),
    tf.keras.metrics.MeanAbsoluteError()
]

In [3]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import vgg16, vgg19, resnet50, mobilenet, inception_resnet_v2, densenet, inception_v3, xception, nasnet, ResNet152V2
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, BatchNormalization, InputLayer, LayerNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import SGD, Adam, Adadelta, Adamax
from tensorflow.keras import layers, models, Model
from tensorflow.keras.losses import MeanAbsoluteError, MeanAbsolutePercentageError
from tensorflow.keras.layers import Input, Activation,MaxPooling2D, concatenate, AveragePooling2D, Concatenate

def _conv_block(tensor, filters, kernel_size):
    """Apply Conv2D (ReLU, same padding) -> BatchNormalization -> ReLU -> Dropout."""
    # NOTE(review): ReLU is applied inside the convolution (before batch
    # normalization) and again afterwards. This mirrors the original
    # architecture and is kept so the resulting models are unchanged.
    tensor = Conv2D(filters, kernel_size, activation='relu', padding="same")(tensor)
    tensor = BatchNormalization()(tensor)
    tensor = Activation('relu')(tensor)
    return Dropout(DROPOUT_VALUE)(tensor)


def _build_tower(tensor, kernel_size, filter_counts=(16, 32, 64, 64)):
    """Chain one `_conv_block` per entry of `filter_counts`, all sharing `kernel_size`."""
    for filters in filter_counts:
        tensor = _conv_block(tensor, filters, kernel_size)
    return tensor


def build_model(input_shape, is_classification_dataset, n_classes=None):
    """Build and compile the two-tower CNN used for every dataset.

    The network feeds the same input into two convolutional towers (3x3 and
    5x5 kernels, four blocks each with 16/32/64/64 filters). No pooling is
    applied and all convolutions use "same" padding with default strides, so
    each tower keeps the spatial size of the input. The tower outputs are
    concatenated, flattened and passed through a dense head.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input sample, forwarded to `Input`.
    is_classification_dataset : bool
        Selects a classification head/loss (True) or a regression one (False).
    n_classes : int or None
        Number of classes; required (> 1) for classification, must be None
        for regression.

    Returns
    -------
    A compiled Keras `Model`:
      * regression: 1 linear unit, mean squared error loss;
      * binary classification (n_classes == 2): 1 sigmoid unit,
        binary cross-entropy loss;
      * multiclass: `n_classes` softmax units, categorical cross-entropy loss
        (so targets must be one-hot encoded).

    Raises
    ------
    AssertionError
        If `n_classes` is inconsistent with `is_classification_dataset`.
    """
    if is_classification_dataset:
        assert n_classes is not None
        assert n_classes > 1
    else:
        assert n_classes is None

    # Input tensor (kept under its own name instead of overwriting the
    # `input_shape` argument, which made the original code hard to follow).
    inputs = Input(input_shape)

    # Tower 1 (3x3 kernels) is built completely before tower 2 (5x5 kernels),
    # preserving the original layer creation order.
    tower_1 = _build_tower(inputs, (3, 3))
    tower_2 = _build_tower(inputs, (5, 5))

    # Join both towers along axis 1 (the first axis after the batch axis),
    # exactly as the original did, then flatten.
    merged = Concatenate(axis=1)([tower_1, tower_2])
    merged = Flatten()(merged)

    # Dense head: one ReLU layer followed by three sigmoid layers.
    out = Dense(256, activation='relu')(merged)
    for units in (128, 64, 32):
        out = Dense(units, activation='sigmoid')(out)

    # Task-specific output layer together with its matching loss and metrics.
    if not is_classification_dataset:
        out = Dense(1, activation="linear")(out)
        loss = "mean_squared_error"
        metrics = METRICS_REGRESSION
    elif n_classes == 2:
        out = Dense(1, activation='sigmoid')(out)
        loss = "binary_crossentropy"
        metrics = METRICS_CLASSIFICATION_BINARY
    else:
        out = Dense(n_classes, activation='softmax')(out)
        loss = "categorical_crossentropy"
        metrics = METRICS_CLASSIFICATION_MULTICLASS

    model = Model(inputs, out)
    model.compile(optimizer="adam", metrics=metrics, loss=loss)

    return model

In [4]:
def train_model(
    model,
    X_train,
    X_val,
    y_train,
    y_val,
    batch_size,
    epochs=None,
    patience=20,
    start_from_epoch=20,
):
    """Fit `model` in place with early stopping on the validation loss.

    Parameters
    ----------
    model : compiled Keras model; its weights are updated by this call.
    X_train, y_train : training inputs and targets.
    X_val, y_val : validation inputs and targets, monitored for early stopping.
    batch_size : int, mini-batch size passed to `fit`.
    epochs : int or None
        Maximum number of epochs. None (default) uses the notebook-level
        `EPOCHS` constant, read at call time.
    patience : int
        Epochs without `val_loss` improvement before training stops.
    start_from_epoch : int
        Number of warm-up epochs before early stopping starts monitoring.

    Returns
    -------
    The `History` object returned by `model.fit`. The original version
    discarded it, which made the training curves unrecoverable.
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=patience,
        # On stop, roll the model back to the weights of the best epoch.
        restore_best_weights=True,
        start_from_epoch=start_from_epoch,
    )

    return model.fit(
        x=X_train,
        y=y_train,
        validation_data=(X_val, y_val),
        batch_size=batch_size,
        epochs=EPOCHS if epochs is None else epochs,
        callbacks=[early_stopping],
    )

In [5]:
# Name of the images folder. NOTE(review): not referenced by the other visible
# cells (they call utils.get_images_path() instead) — confirm whether it is still needed.
IMAGES_FOLDER = "images"

# Sub-folder of the results directory where the trained CNNs are stored.
CNN_RESULTS_FOLDER = "cnn_models"
RESULT_FOLDER = os.path.join(utils.get_results_path(), CNN_RESULTS_FOLDER)
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is safe to
# re-run, has no check-then-create race, and also creates missing parents.
os.makedirs(RESULT_FOLDER, exist_ok=True)

In [6]:
def get_partial_load_image(image_method_folder):
    """Return a loader that reads images relative to `image_method_folder`.

    Parameters
    ----------
    image_method_folder : str
        Folder that the relative image paths are joined onto.

    Returns
    -------
    A function `load_image(img_path)` that joins `img_path` onto
    `image_method_folder`, reads the file with `cv2.imread` and returns the
    resulting array.

    The returned loader raises `FileNotFoundError` when the file cannot be
    read. `cv2.imread` itself does not raise in that case, it returns None,
    which previously slipped through and only failed later with an unrelated
    error when the images were stacked into an array.
    """
    def load_image(img_path):
        full_path = os.path.join(image_method_folder, img_path)
        image = cv2.imread(full_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {full_path}")
        return image

    return load_image

In [7]:
def _load_image_array(load_image, paths, executor):
    """Load `paths` in parallel with `load_image` and stack the results into one array."""
    images = list(executor.map(load_image, paths))
    # cv2.imread signals failure by returning None; fail loudly with the path
    # instead of letting np.array build an unusable object array.
    for path, image in zip(paths, images):
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")
    return np.array(images)


def train_with_dataset(dataset_name):
    """Train and save one CNN per pending image method for `dataset_name`.

    An image method is every sub-folder of `utils.get_images_path()` that
    contains `<dataset_name.lower()>/supervised.csv` (classification datasets)
    or `.../regression.csv` (regression datasets). A method is considered
    finished when the dataset's result folder already holds a file whose name
    without its extension equals the method name; finished methods are skipped,
    so the function can be re-run to resume an interrupted experiment.

    For each pending method the images listed in the CSV column "images" are
    loaded, split with the indices from `utils.get_indices_train_eval`, a model
    is built with `build_model`, trained with `train_model` and saved as
    `<method>.keras` in the dataset's result folder.

    Parameters
    ----------
    dataset_name : str
        One of the dataset name constants; must be a key of `BATCH_SIZES`.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If any listed image cannot be read.
    """
    dataset_folder = os.path.join(RESULT_FOLDER, dataset_name)
    os.makedirs(dataset_folder, exist_ok=True)

    is_classification_dataset = utils.is_dataset_classification(dataset_name)
    csv_name = "supervised.csv" if is_classification_dataset else "regression.csv"
    dataset_subfolder = dataset_name.lower()

    images_folder = utils.get_images_path()
    # Available image methods. isfile() simply yields False for entries that
    # are plain files or that lack this dataset's sub-folder, where the
    # original os.listdir() call raised and aborted the whole scan.
    available_images = {
        d for d in os.listdir(images_folder)
        if os.path.isfile(os.path.join(images_folder, d, dataset_subfolder, csv_name))
    }

    # Methods that already have a saved model. splitext() removes only the
    # final extension, so method names containing dots are matched correctly
    # (split(".")[0] truncated them and they were retrained on every run).
    finished_methods = {
        os.path.splitext(f)[0] for f in os.listdir(dataset_folder)
    }
    remaining_methods = available_images - finished_methods

    if not remaining_methods:
        return

    # Only the targets are needed: this experiment trains on images, not on
    # the raw tabular features.
    X, y = utils.get_X_y(dataset_name)
    del X

    # Train/validation split indices.
    # Assumes they can index both `y` and the CSV's "images" column, i.e. the
    # CSV rows are in the same order as the dataset rows — TODO confirm.
    indices_train, indices_val = utils.get_indices_train_eval(dataset_name)
    y_train = y[indices_train]
    y_val = y[indices_val]

    if is_classification_dataset:
        n_classes = np.unique(y).shape[0]
        assert n_classes > 1
    else:
        n_classes = None

    # One-hot encode targets for the softmax/categorical cross-entropy head.
    # NOTE(review): build_model picks the softmax head from `n_classes != 2`,
    # while this relies on the utils predicate — presumably they always agree.
    if utils.is_dataset_multiclass_classification(dataset_name):
        y_train = to_categorical(y_train, num_classes=n_classes)
        y_val = to_categorical(y_val, num_classes=n_classes)
    del y

    for image_method in sorted(remaining_methods):
        print(image_method)
        image_method_folder = os.path.join(images_folder, image_method, dataset_subfolder)

        # Relative image paths, one row per sample.
        image_paths_np = pd.read_csv(
            os.path.join(image_method_folder, csv_name)
        )["images"].to_numpy()
        train_paths = image_paths_np[indices_train]
        val_paths = image_paths_np[indices_val]
        del image_paths_np

        func_load_image = get_partial_load_image(image_method_folder)

        # Load train and validation images with a thread pool.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            X_train = _load_image_array(func_load_image, train_paths, executor)
            X_val = _load_image_array(func_load_image, val_paths, executor)

        # Build the model
        model = build_model(
            input_shape = X_train[0].shape,
            is_classification_dataset = is_classification_dataset,
            n_classes = n_classes
        )

        # Train the model
        train_model(
            model = model,
            X_train = X_train,
            X_val = X_val,
            y_train = y_train,
            y_val = y_val,
            batch_size = BATCH_SIZES[dataset_name]
        )

        # Save the model; the presence of this file is what marks the method
        # as finished on the next run.
        model.save(os.path.join(dataset_folder, f"{image_method}.keras"))
        print()

        # Drop the references before the next method's images are loaded, so
        # two full image sets are never held in memory at the same time.
        del model, X_train, X_val

## HELOC

In [8]:
# Train and save a CNN for every image method of HELOC that has no saved model yet.
train_with_dataset(HELOC_NAME)

## Adult Income

In [9]:
# Train and save a CNN for every image method of Adult Income that has no saved model yet.
train_with_dataset(ADULT_INCOME_NAME)

## California Housing

In [10]:
# Train and save a CNN for every image method of California Housing that has no saved model yet.
train_with_dataset(CALIFORNIA_HOUSING_NAME)

## Arboviruses

In [11]:
# Train and save a CNN for every image method of Arboviruses that has no saved model yet.
train_with_dataset(ARBOVIRUSES_NAME)

## Covertype

In [12]:
# Train and save a CNN for every image method of Covertype that has no saved model yet.
train_with_dataset(COVERTYPE_NAME)

## HIGGS

In [13]:
# NOTE(review): the HIGGS run is disabled (commented out); the reason is not
# recorded in this notebook — confirm whether that is intentional before re-enabling.
# train_with_dataset(HIGGS_NAME)