# Data Format

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Remove the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)

In [None]:
# Load the formatted dataset and discard the columns that the rest of the
# notebook does not use.
file = "../estudo_inicial/bases_de_dados/base_de_dados_formatada_rim.csv"

unused_columns = [
    "delta_t",
    "data_de_nascimento",
    "data_do_obito",
    "data_do_transplante",
    "tipo_de_doador",
    "regiao",
    "data_da_inscricao",
    "data_do_evento",
]

df = pd.read_csv(file).drop(columns=unused_columns)

# Print the remaining columns with their dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows of ``df``.
df.head()

In [None]:
# One-hot encode every categorical column listed in ``bin_cols``.
# NOTE: ``df_bin`` starts as an alias of ``df``; ``pd.concat`` below returns a
# new frame, so ``df`` itself is not modified by this cell.
df_bin = df

bin_cols = [
    "uf",
    "uf_origem",
    "sexo",
    "grupo_sanguineo",
    "cor",
]

label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse_output=False)

for col in bin_cols:
    # Integer-encode the column. After fitting, ``classes_[k]`` is the
    # original value that was mapped to integer code ``k``.
    integer_encoded = label_encoder.fit_transform(df[col])
    categories = label_encoder.classes_

    # Expand the integer codes into one indicator column per category; the
    # encoder orders its output columns by ascending code, i.e. in the same
    # order as ``categories``.
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded.reshape(-1, 1))

    # Name the new columns "<column>_<category>", with surrounding blanks
    # removed and inner blanks replaced by underscores.
    new_columns = [
        col + "_" + category.strip().replace(" ", "_") for category in categories
    ]

    # Reuse the frame's own index so that the column-wise concat lines the
    # rows up even when the index is not the default 0..n-1 range.
    df_one = pd.DataFrame(onehot_encoded, columns=new_columns, index=df_bin.index)

    # Replace the original categorical column by its indicator columns.
    df_bin = pd.concat([df_bin, df_one], axis=1).drop(columns=col)

df_bin.info()

# Data balancing

In [None]:
from sklearn.model_selection import train_test_split

def balance(X, y, ids, extra=53, random_state=None):
    """Undersample class 1 so that its size is close to that of class 0.

    Every class-0 sample is kept. From class 1, ``len(class 0) + extra``
    samples are drawn uniformly at random, without replacement.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : array-like
        Binary labels (0 / 1) of shape ``(n,)`` or ``(n, 1)``.
    ids : array-like
        Sample identifiers of shape ``(n,)`` or ``(n, 1)``.
    extra : int, default 53
        How many more class-1 than class-0 samples to keep. The default
        reproduces the value that used to be hard-coded.
    random_state : int or None, default None
        Seed for the class-1 draw. ``None`` uses NumPy's global random state.

    Returns
    -------
    X_bal : pandas.DataFrame
        Class-0 rows followed by the sampled class-1 rows (fresh 0..m-1 index).
    y_bal : numpy.ndarray
        1-D labels aligned with ``X_bal``.
    ids_bal : numpy.ndarray
        1-D identifiers aligned with ``X_bal``.

    Raises
    ------
    ValueError
        If the inputs have different lengths, or if the requested number of
        class-1 samples is negative or larger than the class itself.
    """
    # Flatten so that (n, 1) column vectors produce a plain 1-D row mask.
    y_flat = np.asarray(y).ravel()
    ids_flat = np.asarray(ids).ravel()

    if not (len(X) == len(y_flat) == len(ids_flat)):
        raise ValueError(
            "X, y and ids must have the same number of samples "
            "(got {}, {} and {})".format(len(X), len(y_flat), len(ids_flat))
        )

    mask_0 = y_flat == 0
    mask_1 = y_flat == 1

    X_0, X_1 = X[mask_0], X[mask_1]
    y_0, y_1 = y_flat[mask_0], y_flat[mask_1]
    ids_0, ids_1 = ids_flat[mask_0], ids_flat[mask_1]

    n_obitos = len(X_0)
    n_keep = n_obitos + extra

    if not 0 <= n_keep <= len(X_1):
        raise ValueError(
            "Cannot draw {} samples of label 1: only {} are available".format(
                n_keep, len(X_1)
            )
        )

    # Draw the class-1 subset directly: a random permutation truncated to
    # n_keep positions is a uniform sample without replacement.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    chosen = rng.permutation(len(X_1))[:n_keep]

    X_1_bal = X_1.iloc[chosen]
    y_1_bal = y_1[chosen]
    ids_1_bal = ids_1[chosen]

    X_bal = pd.DataFrame(np.vstack([X_0, X_1_bal]), columns=X.columns)
    y_bal = np.hstack((y_0, y_1_bal))
    ids_bal = np.hstack((ids_0, ids_1_bal))

    print("There are {} samples of label 0".format(n_obitos))
    print("There are {} samples of label 1".format(len(X_1_bal)))
    print("The number of balanced samples is {}".format(len(X_bal)))

    return X_bal, y_bal, ids_bal


# K-fold

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import pickle as pkl
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale ``idade`` with a min-max scaler, split the frame into features,
# target and identifiers, and balance the classes.
scaler = MinMaxScaler()
kf = StratifiedKFold(n_splits=5)

# NOTE(review): ``norm_df`` is the same object as ``df_bin`` (no copy is
# made), so the rescaled ``idade`` is also visible through ``df_bin``.
# NOTE(review): the scaler is fitted on every row before the folds are
# created, so test rows contribute to the min/max - confirm this is intended.
norm_df = df_bin

idade_column = df_bin["idade"].to_numpy().reshape(-1, 1)
norm_df["idade"] = scaler.fit_transform(idade_column)

# Features are every column except the target, ``obito_bin`` and the id.
X = norm_df.drop(columns=["transplante_bin", "obito_bin", "id"])
y = norm_df[["transplante_bin"]].to_numpy()
ids = norm_df[["id"]].to_numpy()

X_balanced, y_balanced, ids_balanced = balance(X, y, ids)

# Preview the balanced feature matrix.
X_balanced.head()

In [None]:


import os

# Persist every train/test split as ``folds/fold_<i>.pkl`` so that the grid
# search can reload exactly the same partitions.
os.makedirs("folds", exist_ok=True)  # open(..., "wb") does not create directories

# Only X and y are passed: StratifiedKFold ignores the ``groups`` argument,
# so handing it the ids had no effect on the split.
for i, (train_index, test_index) in enumerate(kf.split(X_balanced, y_balanced)):

    fold_dict = {
        "X_train": X_balanced.iloc[train_index, :],
        "X_test": X_balanced.iloc[test_index, :],
        "y_train": y_balanced[train_index],
        "y_test": y_balanced[test_index],
        "ids_train": ids_balanced[train_index],
        "ids_test": ids_balanced[test_index],
    }

    with open("folds/fold_{}.pkl".format(i), "wb") as f:
        pkl.dump(fold_dict, f)

# NN model creator

In [None]:
import os

# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native (C++) logging and has to
# be in the environment before ``import tensorflow`` runs; setting it after
# the import is too late to silence the start-up messages. If TensorFlow was
# already imported earlier in this session, restart the kernel for it to apply.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import tensorflow as tf

tf.get_logger().setLevel("INFO")
tf.autograph.set_verbosity(0)

from tensorflow.python.framework.ops import disable_eager_execution

# Must run before any TensorFlow operation or model is created.
disable_eager_execution()

print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import time
from keras import backend as K
import pickle as pkl

In [None]:
class MLP_builder:
    """Configurable multilayer perceptron with a single output unit.

    The constructor only stores the hyper-parameters. The Keras model itself
    is built, trained, used for prediction and discarded inside
    :meth:`fit_predict`.

    Parameters
    ----------
    layers : iterable of int
        Number of units of each hidden layer, in order.
    ac_function : str
        Activation used by every hidden layer.
    input_sequence_length : int
        Number of input features.
    batch : int
        Mini-batch size (cast with ``int`` before use).
    iterr : int
        Early-stopping patience, in epochs.
    learningRate : float
        Adam learning rate.
    beta1, beta2 : float
        Adam ``beta_1`` / ``beta_2`` coefficients.
    epocas : int
        Maximum number of training epochs.
    output_function : str, default "linear"
        Activation of the output unit.
    shuffle : bool, default True
        Shuffle flag used for both the train/validation split and ``fit``.
    """

    @classmethod
    def add_layer(cls, neurons, ac_function, mlp):
        """Stack a Dense layer and a 20 % Dropout on ``mlp``; return the new tensor."""
        mlp = tf.keras.layers.Dense(units=neurons, activation=ac_function)(mlp)
        mlp = tf.keras.layers.Dropout(0.2)(mlp)
        return mlp

    def __init__(
        self,
        layers,
        ac_function,
        input_sequence_length,
        batch,
        iterr,
        learningRate,
        beta1,
        beta2,
        epocas,
        output_function="linear",
        shuffle=True,
    ):
        self.layers = layers
        self.ac_function = ac_function
        self.input_sequence_length = input_sequence_length
        self.batch = batch
        self.iterr = iterr
        self.learningRate = learningRate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epocas = epocas
        self.shuffle = shuffle
        self.output_function = output_function

    def _build_model(self):
        """Create and compile the network described by the stored hyper-parameters."""
        input_layer = tf.keras.layers.Input(shape=(self.input_sequence_length,))

        # Chain the hidden layers. The tensor returned by ``add_layer`` has to
        # be fed into the next layer; dropping it leaves the output unit wired
        # straight to the input and the hidden layers unused.
        mlp = input_layer
        for neurons in self.layers:
            mlp = self.add_layer(neurons, self.ac_function, mlp)

        # Output layer with a single unit.
        output_layer = tf.keras.layers.Dense(1, activation=self.output_function)(mlp)

        mlp_model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

        opt = tf.keras.optimizers.Adam(
            learning_rate=self.learningRate, beta_1=self.beta1, beta_2=self.beta2
        )
        mlp_model.compile(loss="binary_crossentropy", optimizer=opt)

        return mlp_model

    @staticmethod
    def _save_learning_curve(history, struct_name):
        """Plot train/validation loss, save it under ``learning_curves/`` and show it."""
        plt.ioff()

        fig, ax = plt.subplots()
        ax.plot(history["loss"], linewidth=2, label="Train")
        ax.plot(history["val_loss"], linewidth=2, label="Validation")
        ax.legend(loc="upper right")
        ax.set_title("Model loss")
        ax.set_ylabel("Loss")
        ax.set_xlabel("Epoch")

        # Save before showing: with interactive backends the figure can be
        # torn down once the window opened by ``show`` is closed.
        fig.savefig(
            "learning_curves/model-loss__" + struct_name + "__.png",
            bbox_inches="tight",
        )

        plt.show(block=True)

        # ``del fig`` does not release the figure: pyplot keeps its own
        # reference until the figure is closed explicitly.
        plt.close(fig)

    def fit_predict(self, X_train, y_train, X_test, struct_name, val_split=0.1):
        """Train a fresh model and predict ``X_test``.

        Parameters
        ----------
        X_train, y_train
            Training features and targets. A fraction ``val_split`` of them is
            held out as validation data for early stopping.
        X_test
            Features to predict after training.
        struct_name : str
            Label inserted into the learning-curve file name,
            ``learning_curves/model-loss__<struct_name>__.png``.
        val_split : float, default 0.1
            Share of the training data used for validation.

        Returns
        -------
        output : list
            One prediction per row of ``X_test`` (first output column).
        fit_time : float
            Seconds spent in ``Model.fit``.
        predict_time : float
            Seconds spent in ``Model.predict``.
        """
        mlp_model = self._build_model()

        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=val_split, shuffle=self.shuffle, stratify=None
        )

        # Stop when the validation loss has not improved by at least 0.001
        # for ``iterr`` epochs, and roll back to the best weights seen.
        early_stopping_callback = tf.keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=self.iterr,
            verbose=0,
            min_delta=0.001,
            mode="min",
            restore_best_weights=True,
        )

        # ``workers`` / ``use_multiprocessing`` are not passed: they only
        # apply to generator / Sequence inputs and were removed in Keras 3.
        start_fit_time = time.time()

        history = mlp_model.fit(
            x=X_train,
            y=y_train,
            batch_size=int(self.batch),
            epochs=self.epocas,
            verbose=0,
            callbacks=[early_stopping_callback],
            validation_data=(X_val, y_val),
            shuffle=self.shuffle,
        ).history

        fit_time = time.time() - start_fit_time

        self._save_learning_curve(history, struct_name)

        start_predict_time = time.time()

        output = [value[0] for value in mlp_model.predict(X_test)]

        predict_time = time.time() - start_predict_time

        # Drop the graph/session state so that repeated calls do not
        # accumulate models in memory.
        K.clear_session()

        return output, fit_time, predict_time

# Grid-search

In [None]:
import glob
import itertools
import os
import pickle

import numpy as np
import pandas as pd


# ---------------------------------------------------------------------------
# Search space
# ---------------------------------------------------------------------------
hidden_layer_sizes = [
    (3, 3),
    (5, 5),
    (10, 10),
    (3, 3, 3),
    (5, 5, 5),
    (10, 10, 10),
]

# Candidates not searched at the moment: leaky_relu, softplus, softsign, elu.
activation = [
    "sigmoid",
    "tanh",
    "linear",
    "relu",
    "softmax",
]

# Candidates not searched at the moment: leaky_relu, softplus, softsign, elu.
# NOTE(review): softmax over a single output unit always evaluates to 1.0,
# so the "softmax" output configurations cannot discriminate between classes.
output_function_list = [
    "sigmoid",
    "tanh",
    "linear",
    "relu",
    "softmax",
]

batch_size = [200, 300, 400]
n_iter_no_change = [10]
learningRates = [
    0.001,
    0.0001,
    # 0.00001 is not searched at the moment.
]
beta_1_List = [
    0.99,
    0.9,
    0.85,
]
beta_2_List = [
    0.99,
    0.9,
    0.85,
]

# Maximum number of epochs for every configuration.
epocas = 150

# ---------------------------------------------------------------------------
# Inputs / outputs
# ---------------------------------------------------------------------------
# The result pickles and the learning-curve images are written with plain
# open()/savefig(), neither of which creates missing directories.
os.makedirs("grid_results", exist_ok=True)
os.makedirs("learning_curves", exist_ok=True)

# Configurations whose result file already exists are skipped, which makes
# the search resumable.
results_list = glob.glob("grid_results/*")
finished = set(results_list)

# glob returns paths in arbitrary order; sort so that fold index ``i`` is
# stable between runs and follows the file names.
folds = sorted(glob.glob("folds/*"))
if not folds:
    raise FileNotFoundError("No fold files found under 'folds/'")

# Load every fold once instead of re-reading the pickles for each of the
# hyper-parameter combinations. The files are produced by this notebook;
# never unpickle files from an untrusted source.
fold_data = []
for fold in folds:
    with open(fold, "rb") as f:
        fold_data.append(pickle.load(f))

# Network input width, taken from the data instead of a hard-coded constant.
n_features = fold_data[0]["X_train"].shape[1]

# ---------------------------------------------------------------------------
# Search
# ---------------------------------------------------------------------------
# itertools.product iterates with the last factor varying fastest, i.e. in
# the same order as the equivalent nested loops (layers innermost).
search_space = itertools.product(
    output_function_list,
    activation,
    batch_size,
    n_iter_no_change,
    learningRates,
    beta_1_List,
    beta_2_List,
    hidden_layer_sizes,
)

for (
    output_function,
    function,
    batch,
    iterr,
    learningRate,
    beta1,
    beta2,
    layers,
) in search_space:
    hp_str = "grid_results/sem_tp_doador_layers_{}__activation_function_{}__output_function_{}__batch_{}__iter_{}__LearnR_{}__Beta1_{}__Beta2_{}.pkl".format(
        layers,
        function,
        output_function,
        batch,
        iterr,
        learningRate,
        beta1,
        beta2,
    )

    if hp_str in finished:
        continue

    model = MLP_builder(
        layers,
        function,
        n_features,
        batch,
        iterr,
        learningRate,
        beta1,
        beta2,
        epocas=epocas,
        output_function=output_function,
    )

    struct_name = hp_str.split("/")[-1]

    # Per-configuration accumulators, filled fold by fold.
    global_fit_time = []
    global_predict_time = []
    fold_outputs = []
    fold_targets = []
    global_dict = {"id": [], "rse": []}

    for i, fold_dict in enumerate(fold_data):
        y_test = fold_dict["y_test"]

        output, fit_time, predict_time = model.fit_predict(
            fold_dict["X_train"],
            fold_dict["y_train"],
            fold_dict["X_test"],
            struct_name + "__fold_{}".format(i),
        )

        global_fit_time.append(fit_time)
        global_predict_time.append(predict_time)

        fold_outputs.append(np.asarray(output))
        fold_targets.append(np.asarray(y_test))

        # Per-sample residual (prediction minus target), keyed by sample id.
        global_dict["id"].extend(fold_dict["ids_test"])
        global_dict["rse"].extend(out - tg for out, tg in zip(output, y_test))

    result = {
        "y_predict": np.hstack(fold_outputs),
        "y_test": np.hstack(fold_targets),
        "global_dict": global_dict,
        "fit_time": [np.mean(global_fit_time)],
        "predict_time": [np.mean(global_predict_time)],
    }

    with open(hp_str, "wb") as handle:
        pickle.dump(result, handle)

In [None]:
# Sanity check: reload every stored fold and preview its training features.
for i, fold in enumerate(folds):
    with open(fold, "rb") as f:
        fold_dict = pickle.load(f)

    X_train = fold_dict["X_train"]
    y_train = fold_dict["y_train"]
    X_test = fold_dict["X_test"]
    y_test = fold_dict["y_test"]

    # A bare ``X_train.head()`` inside a loop is evaluated and thrown away
    # (only the last expression of a cell is displayed), so print it.
    print(fold)
    print(X_train.head())