In [None]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalAveragePooling1D, Dense, Dropout, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization, Activation
from tensorflow.keras.optimizers.schedules import ExponentialDecay

### Data Configuration

In [None]:
# Location of the survey workbook. The default is the original author's local copy;
# set the PLP_DATA_PATH environment variable to run the notebook on another machine.
import os

DATA_PATH = os.environ.get('PLP_DATA_PATH', 'C:/users/a.aybarf/Downloads/dfMLplp2024.xlsx')

# The first spreadsheet column is used as the DataFrame index.
df1 = pd.read_excel(DATA_PATH, index_col=0)

In [None]:
# Number of rows loaded from the workbook.
len(df1)

In [None]:
# Inspect the column names as loaded, before they are renamed.
df1.columns

In [None]:
# Discard the 'trabajo_mp' column before the remaining columns are renamed.
# Re-running this cell on the already-reduced frame raises a KeyError.
df1 = df1.drop(columns=['trabajo_mp'])

In [None]:
# Rename the columns positionally to English identifiers; the trailing comments
# give the original (Spanish) column name and its meaning where the author noted it.
df1.columns = ['plp',                 # Poverty Line Percentage (target variable)
    'urban_rural',         # 'rururb' = rural or urban
    'age',                 # 'edad'
    'sex',                 # 'sexo'
    'literacy',            # 'alfabetism'
    'food_insecurity',     # 'ins_ali' = food insecurity
    'hli',                 # hli
    'public_healthcare',   # social health insurance
    'medical_attention',   # 'atemed'
    'bank_card',           # 'tarjeta' = debit/credit/bank card
    'electricity',  # 'disp_elect' = electronic devices availability (TODO confirm: name suggests electricity availability)
    'total_residents',     # 'tot_resid'
    'region',              # 'región'
    'connectivity',        # 'conectividad' = internet or digital connectivity
    'water_drainage',      # 'agua_drenaje'
    'household_head_edu',  # 'neducativojefe' = educational level of household head
    'child_labor',         # 'trabajomenores'
    'children',            # 'niños'
    'household_occupation', # 'ocupacion_hogar' = economic activity of the household
    'consumption expenditure',  # NOTE: contains a space, unlike the other snake_case names
    'basic_energy_equipment',
    'housing_tenure'
   ]

In [None]:
# Confirm the renamed columns.
df1.columns

In [None]:
# Predictor columns: every renamed column except the target 'plp'.
feature_columns = [
    'urban_rural',
    'age',
    'sex',
    'literacy',
    'food_insecurity',
    'hli',
    'public_healthcare',
    'medical_attention',
    'bank_card',
    'electricity',
    'total_residents',
    'region',
    'connectivity',
    'water_drainage',
    'household_head_edu',
    'child_labor',
    'children',
    'household_occupation',
    'consumption expenditure',
    'basic_energy_equipment',
    'housing_tenure',
]
X = df1[feature_columns]

# The target is kept as a one-column DataFrame (double brackets), not a Series.
y = df1[['plp']]

### CNN Optimization During Construction

In [None]:
# Hold out 20% of the rows as the test set (fixed seed, so the split is repeatable).
X_train1, X_test1, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Give the training frames a fresh 0..n-1 index so the positional indices produced
# by the fold splitters can be used as labels with .loc later on.
X_train, y_train = X_train1.reset_index(drop=True), y_train.reset_index(drop=True)

# Keep the test predictors around in DataFrame and NumPy form before X_test1 is
# rebound to a tensor below.
X_test_original = X_test1
X_test_originalnumpy = X_test_original.to_numpy()

# Plain NumPy versions of the training and test data.
Xtrainfinal1, ytrainfinal1 = X_train.to_numpy(), y_train.to_numpy()
y_test1 = y_test.to_numpy()
X_test1 = X_test1.to_numpy()

# Conv1D consumes 3-D input (samples, steps, channels): append a single channel axis.
X_trainP = tf.reshape(Xtrainfinal1, (*Xtrainfinal1.shape, 1))
X_test1 = tf.reshape(X_test1, (*X_test1.shape, 1))

In [None]:
# Learning-rate schedule shared by every Adam optimizer in this notebook:
# starts at 0.01 and decays by a factor of 0.9 per 200 optimizer steps.
initial_learning_rate = 0.01
lr_schedule = ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=200,
    decay_rate=0.9,
)

In [None]:
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import AveragePooling1D

#### Filter Values Testing

In [None]:
from sklearn.model_selection import KFold

# 10-fold shuffled splitter used by the tuning cells below. Because random_state is
# fixed, every call to kfold.split(...) yields the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Filter configurations for the candidate models (Table 6): the filter counts of the
# three Conv1D layers paired with the unit counts of the two Dense layers.
filter_configs = [
    {'conv_filters': list(conv_filters), 'dense_units': list(dense_units)}
    for conv_filters, dense_units in (
        ((75, 50, 25), (25, 1)),
        ((50, 25, 10), (10, 1)),
        ((30, 15, 5), (10, 1)),
        ((20, 10, 5), (5, 1)),
        ((100, 50, 25), (20, 1)),
        ((120, 100, 120), (40, 1)),
        ((150, 100, 50), (25, 1)),
        ((20, 10, 5), (10, 1)),
    )
]

# Labelled views of the training arrays; they do not change between folds, so they
# are built once and sliced with the fold indices below.
dfX = pd.DataFrame(Xtrainfinal1, columns=X.columns)
dfy = pd.DataFrame(ytrainfinal1, columns=y.columns)

for model_idx, config in enumerate(filter_configs, start=1):
    print(f'\nTesting Model {model_idx} with filters: {config["conv_filters"]} and dense units: {config["dense_units"]}')

    def create_cnn_model(X_trainP):
        """Build and compile the CNN for the configuration currently in `config`.

        Args:
            X_trainP: 3-D training tensor; only its last two dimensions are read,
                to set the input shape of the first convolution.

        Returns:
            A compiled Sequential model with a single sigmoid output.
        """
        filters_1, filters_2, filters_3 = config['conv_filters']
        hidden_units, output_units = config['dense_units']
        model = Sequential([
            # Convolution and pooling layers
            Conv1D(filters=filters_1, kernel_size=3, activation='relu',
                   input_shape=(X_trainP.shape[1], X_trainP.shape[2]), padding='same'),
            Conv1D(filters=filters_2, kernel_size=3, activation='relu', padding='same'),
            MaxPooling1D(pool_size=2),
            Conv1D(filters=filters_3, kernel_size=3, activation='relu', padding='same'),
            GlobalAveragePooling1D(),
            # Dense layers
            Dense(hidden_units, activation='relu'),
            Dense(output_units, activation='sigmoid'),
        ])
        model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
        return model

    for i in range(10):  # 10 repetitions of the full cross-validation
        best_model, best_accuracy = None, 0.0
        counter, accuracyacumulada = 1, 0.0

        for train_idx, val_idx in kfold.split(Xtrainfinal1, ytrainfinal1):
            # Slice this fold's rows and add the channel axis expected by Conv1D.
            X_trainP, X_val = (dfX.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            y_trainP, y_val = (dfy.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            X_trainP = tf.reshape(X_trainP, (*X_trainP.shape, 1))
            X_val = tf.reshape(X_val, (*X_val.shape, 1))

            model = create_cnn_model(X_trainP)
            history = model.fit(
                X_trainP, y_trainP,
                epochs=100, batch_size=32,
                validation_data=(X_val, y_val), verbose=0,
            )

            # Hard labels from the sigmoid output, thresholded at 0.5.
            y_val_pred = (model.predict(X_val) > 0.5).astype(int)
            accuracy = accuracy_score(y_val, y_val_pred)

            counter += 1
            accuracyacumulada += accuracy

            # Remember the fold whose model scored best on its own validation split.
            if accuracy > best_accuracy:
                best_accuracy, best_model, best_history = accuracy, model, history

        print(f'{i+1}: Model {model_idx} - Best Validation Accuracy: {best_accuracy * 100:.2f}%')

        # Score the best fold's model on the held-out test set.
        y_test_pred = (best_model.predict(X_test1) > 0.5).astype(int)
        test_accuracy = accuracy_score(y_test1, y_test_pred)
        print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

    print('####################################################')

#### Kernel Size

In [None]:
# Kernel sizes to compare; every other hyper-parameter is held fixed.
kernellist = [2, 3, 5, 7, 11]

# Labelled views of the training arrays, built once and sliced per fold.
dfX = pd.DataFrame(Xtrainfinal1, columns=X.columns)
dfy = pd.DataFrame(ytrainfinal1, columns=y.columns)

for kernelsize in kernellist:
    print(f'Test for Kernel Size: {kernelsize}')

    def create_cnn_model(X_trainP):
        """Build and compile the 50/25/10-filter CNN using the current `kernelsize`.

        Args:
            X_trainP: 3-D training tensor; only its last two dimensions are read,
                to set the input shape of the first convolution.

        Returns:
            A compiled Sequential model with a single sigmoid output.
        """
        conv_kwargs = dict(kernel_size=kernelsize, activation='relu', padding='same')
        model = Sequential([
            # Convolution and pooling layers
            Conv1D(filters=50, input_shape=(X_trainP.shape[1], X_trainP.shape[2]), **conv_kwargs),
            Conv1D(filters=25, **conv_kwargs),
            MaxPooling1D(pool_size=2),
            Conv1D(filters=10, **conv_kwargs),
            GlobalAveragePooling1D(),
            # Dense layers
            Dense(10, activation='relu'),
            Dense(1, activation='sigmoid'),
        ])
        # Alternative loss noted by the author: sparse_categorical_crossentropy,
        # in case splitting y with one-hot encoding does not work.
        model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
        return model

    for i in range(10):
        best_model, best_accuracy = None, 0.0
        counter, accuracyacumulada = 1, 0.0

        # 10-fold cross-validation
        for train_idx, val_idx in kfold.split(Xtrainfinal1, ytrainfinal1):
            # Slice this fold's rows and add the channel axis expected by Conv1D.
            X_trainP, X_val = (dfX.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            y_trainP, y_val = (dfy.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            X_trainP = tf.reshape(X_trainP, (*X_trainP.shape, 1))
            X_val = tf.reshape(X_val, (*X_val.shape, 1))

            model = create_cnn_model(X_trainP)
            history = model.fit(
                X_trainP, y_trainP,
                epochs=100, batch_size=32,
                validation_data=(X_val, y_val), verbose=0,
            )

            # Validation accuracy of this fold (sigmoid output thresholded at 0.5).
            y_val_pred = (model.predict(X_val) > 0.5).astype(int)
            accuracy = accuracy_score(y_val, y_val_pred)

            counter += 1
            accuracyacumulada += accuracy

            # Remember the fold whose model scored best on its own validation split.
            if accuracy > best_accuracy:
                best_accuracy, best_model, best_history = accuracy, model, history

        print(f'{i+1}: {kernelsize}')
        print(f'Best Validation Accuracy: {best_accuracy * 100:.2f}%')

        # Score the best fold's model on the held-out test set.
        y_test_pred = (best_model.predict(X_test1) > 0.5).astype(int)
        test_accuracy = accuracy_score(y_test1, y_test_pred)
        print(f'Test Accuracy: {test_accuracy * 100:.2f}%')
    print('####################################################')

#### CNN Architecture

In [None]:
# Registry of candidate architectures: name -> factory taking the Conv1D input shape.
# The factories are lambdas so that the build_model_* functions, which are defined
# after this dict, are only looked up when a factory is actually called.
architectures = {
    'A1': lambda shape: build_model_A1(shape),
    'A2': lambda shape: build_model_A2(shape),
    'A3': lambda shape: build_model_A3(shape),
    'A4': lambda shape: build_model_A4(shape),
    # A5 and A6 share one layout and differ only in the dropout rate.
    'A5': lambda shape: build_model_A5(shape, 0.1),
    'A6': lambda shape: build_model_A5(shape, 0.2),
    'A7': lambda shape: build_model_A7(shape, 0.1),
    # A8-A10 share one builder and differ in the pooling / flattening switches.
    'A8': lambda shape: build_model_A8_A10(
        shape, 0.1, use_batchnorm=True, use_flatten=True, use_avgpool=False),
    'A9': lambda shape: build_model_A8_A10(
        shape, 0.1, use_batchnorm=True, use_flatten=False, use_avgpool=True),
    'A10': lambda shape: build_model_A8_A10(
        shape, 0.1, use_batchnorm=True, use_flatten=True, use_avgpool=True),
}

# Builder functions, one per candidate architecture.
def build_model_A1(input_shape):
    """Build and compile architecture A1.

    Layout: Conv1D(50) -> Conv1D(25) -> MaxPooling1D(2) -> Conv1D(10) ->
    GlobalAveragePooling1D -> Dense(10, relu) -> Dense(1, sigmoid). Every
    convolution uses kernel size 7, ReLU activation and 'same' padding.

    Args:
        input_shape: Shape handed to the first Conv1D layer as `input_shape`.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam driven by
        the notebook-level `lr_schedule`, accuracy metric).
    """
    model = Sequential()
    model.add(Conv1D(50, 7, activation='relu', padding='same', input_shape=input_shape))
    model.add(Conv1D(25, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(10, 7, activation='relu', padding='same'))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
    return model

def build_model_A2(input_shape):
    """Build and compile architecture A2 (four convolutions before pooling).

    Layout: Conv1D 50 -> 25 -> 50 -> 25 -> MaxPooling1D(2) -> Conv1D(10) ->
    GlobalAveragePooling1D -> Dense(10, relu) -> Dense(1, sigmoid). Every
    convolution uses kernel size 7, ReLU activation and 'same' padding.

    Args:
        input_shape: Shape handed to the first Conv1D layer as `input_shape`.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Conv1D(50, 7, activation='relu', padding='same', input_shape=input_shape))
    # Remaining convolutions of the pre-pooling stack.
    for n_filters in (25, 50, 25):
        model.add(Conv1D(n_filters, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(10, 7, activation='relu', padding='same'))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
    return model

def build_model_A3(input_shape):
    """Build and compile architecture A3 (three hidden dense layers).

    Layout: Conv1D(50) -> Conv1D(25) -> MaxPooling1D(2) -> Conv1D(50) ->
    GlobalAveragePooling1D -> Dense 30 -> 20 -> 10 (relu) -> Dense(1, sigmoid).
    Every convolution uses kernel size 7, ReLU activation and 'same' padding.

    Args:
        input_shape: Shape handed to the first Conv1D layer as `input_shape`.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Conv1D(50, 7, activation='relu', padding='same', input_shape=input_shape))
    model.add(Conv1D(25, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(50, 7, activation='relu', padding='same'))
    model.add(GlobalAveragePooling1D())
    # Dense head narrowing towards the single output unit.
    for n_units in (30, 20, 10):
        model.add(Dense(n_units, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
    return model

def build_model_A4(input_shape):
    """Build and compile architecture A4 (deep conv stack and deep dense head).

    Layout: Conv1D 50 -> 25 -> 50 -> 50 -> MaxPooling1D(2) -> Conv1D(25) ->
    GlobalAveragePooling1D -> Dense 30 -> 20 -> 10 (relu) -> Dense(1, sigmoid).
    Every convolution uses kernel size 7, ReLU activation and 'same' padding.

    Args:
        input_shape: Shape handed to the first Conv1D layer as `input_shape`.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Conv1D(50, 7, activation='relu', padding='same', input_shape=input_shape))
    # Remaining convolutions of the pre-pooling stack.
    for n_filters in (25, 50, 50):
        model.add(Conv1D(n_filters, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(25, 7, activation='relu', padding='same'))
    model.add(GlobalAveragePooling1D())
    # Dense head narrowing towards the single output unit.
    for n_units in (30, 20, 10):
        model.add(Dense(n_units, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
    return model

def build_model_A5(input_shape, dropout_rate):
    """Build and compile the A1 layout with dropout before the output layer.

    Layout: Conv1D(50) -> Conv1D(25) -> MaxPooling1D(2) -> Conv1D(10) ->
    GlobalAveragePooling1D -> Dense(10, relu) -> Dropout -> Dense(1, sigmoid).
    Every convolution uses kernel size 7, ReLU activation and 'same' padding.

    Args:
        input_shape: Shape handed to the first Conv1D layer as `input_shape`.
        dropout_rate: Rate passed to the Dropout layer.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Conv1D(50, 7, activation='relu', padding='same', input_shape=input_shape))
    model.add(Conv1D(25, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(10, 7, activation='relu', padding='same'))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
    return model

def build_model_A7(input_shape, dropout_rate):
    """Build and compile architecture A7 (batch normalisation before each ReLU).

    Each convolution and the hidden dense layer are created without an
    activation and followed by BatchNormalization and a separate ReLU layer.
    Layout: Conv1D(50) -> Conv1D(25) -> MaxPooling1D(2) -> Conv1D(10) ->
    GlobalAveragePooling1D -> Dense(10) -> Dropout -> Dense(1, sigmoid).

    Args:
        input_shape: Shape handed to the first Conv1D layer as `input_shape`.
        dropout_rate: Rate passed to the Dropout layer.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()

    model.add(Conv1D(50, 7, padding='same', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    model.add(Conv1D(25, 7, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    model.add(MaxPooling1D(2))

    model.add(Conv1D(10, 7, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    model.add(GlobalAveragePooling1D())

    model.add(Dense(10))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
    return model

def build_model_A8_A10(input_shape, dropout_rate, use_batchnorm=True, use_flatten=False, use_avgpool=False):
    """Build and compile the configurable layout behind architectures A8-A10.

    Layout: Conv1D(50) -> Conv1D(25) -> pooling -> Conv1D(10) -> flatten or
    global average pooling -> Dense(10) -> Dropout -> Dense(1, sigmoid). Each
    convolution and the hidden dense layer is followed by an optional
    BatchNormalization and then a ReLU layer.

    Args:
        input_shape: Shape handed to the first Conv1D layer as `input_shape`.
        dropout_rate: Rate passed to the Dropout layer.
        use_batchnorm: Insert BatchNormalization before every ReLU when True.
        use_flatten: Use Flatten when True, GlobalAveragePooling1D otherwise.
        use_avgpool: Use AveragePooling1D(2) when True, MaxPooling1D(2) otherwise.

    Returns:
        The compiled Sequential model.
    """
    def _norm_and_relu():
        """Return the (optional BatchNormalization, ReLU) layers for one stage."""
        stage = [BatchNormalization()] if use_batchnorm else []
        stage.append(Activation('relu'))
        return stage

    stack = [Conv1D(50, 7, padding='same', input_shape=input_shape), *_norm_and_relu()]
    stack += [Conv1D(25, 7, padding='same'), *_norm_and_relu()]
    stack.append(AveragePooling1D(pool_size=2) if use_avgpool else MaxPooling1D(pool_size=2))
    stack += [Conv1D(10, 7, padding='same'), *_norm_and_relu()]
    stack.append(Flatten() if use_flatten else GlobalAveragePooling1D())
    stack += [Dense(10), *_norm_and_relu()]
    stack.append(Dropout(dropout_rate))
    stack.append(Dense(1, activation='sigmoid'))

    model = Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
    return model

In [None]:
# Labelled views of the training arrays, built once and sliced per fold.
dfX = pd.DataFrame(Xtrainfinal1, columns=X.columns)
dfy = pd.DataFrame(ytrainfinal1, columns=y.columns)

# Compare the registered architectures: 10 repetitions of 10-fold CV each.
for arch_name, build_fn in architectures.items():
    print(f'\nTesting Architecture {arch_name}')

    for i in range(10):
        best_model, best_accuracy = None, 0.0
        accuracyacumulada = 0.0

        for train_idx, val_idx in kfold.split(Xtrainfinal1, ytrainfinal1):
            # Slice this fold's rows and add the channel axis expected by Conv1D.
            X_trainP, X_val = (dfX.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            y_trainP, y_val = (dfy.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            X_trainP = tf.reshape(X_trainP, (*X_trainP.shape, 1))
            X_val = tf.reshape(X_val, (*X_val.shape, 1))

            # The builders take the per-sample shape: (steps, channels).
            model = build_fn((X_trainP.shape[1], X_trainP.shape[2]))
            history = model.fit(
                X_trainP, y_trainP,
                epochs=100, batch_size=32,
                validation_data=(X_val, y_val), verbose=0,
            )

            # Validation accuracy of this fold (sigmoid output thresholded at 0.5).
            y_val_pred = model.predict(X_val)
            y_val_pred = (y_val_pred > 0.5).astype(int)
            accuracy = accuracy_score(y_val, y_val_pred)
            accuracyacumulada += accuracy

            # Remember the fold whose model scored best on its own validation split.
            if accuracy > best_accuracy:
                best_accuracy, best_model = accuracy, model

        print(f'{i+1}: {arch_name} - Best Validation Accuracy: {best_accuracy * 100:.2f}%')

        # Score the best fold's model on the held-out test set.
        y_test_pred = best_model.predict(X_test1)
        y_test_pred = (y_test_pred > 0.5).astype(int)
        test_accuracy = accuracy_score(y_test1, y_test_pred)
        print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

    print('####################################################')

### CNN Optimization During Training

In [None]:
# CNN used for the training-hyper-parameter sweeps (epochs, batch size).
def create_cnn_model(Xtrainfinal1):
    """Build and compile the CNN used for the epoch and batch-size sweeps.

    Layout: three Conv1D layers (50, 25, 10 filters; kernel size 7, ReLU,
    'same' padding), each followed by BatchNormalization, with MaxPooling1D(2)
    after the second; then Flatten -> Dense(10, relu) -> BatchNormalization ->
    Dropout(0.1) -> Dense(1, sigmoid).

    Args:
        Xtrainfinal1: 3-D training tensor; only its last two dimensions are
            read, to set the input shape of the first convolution.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam driven by
        the notebook-level `lr_schedule`, accuracy metric).
    """
    sample_shape = (Xtrainfinal1.shape[1], Xtrainfinal1.shape[2])
    model = Sequential([
        # Convolution and pooling layers
        Conv1D(filters=50, kernel_size=7, activation='relu', input_shape=sample_shape, padding='same'),
        BatchNormalization(),
        Conv1D(filters=25, kernel_size=7, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=10, kernel_size=7, activation='relu', padding='same'),
        BatchNormalization(),
        Flatten(),
        # Dense layers
        Dense(10, activation='relu'),
        BatchNormalization(),
        Dropout(0.1),  # dropout for regularization
        Dense(1, activation='sigmoid'),
    ])

    # Alternative loss noted by the author: sparse_categorical_crossentropy,
    # in case splitting y with one-hot encoding does not work.
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
    return model

#### Epoch Values Testing

In [None]:
# Labelled views of the training arrays, built once and sliced per fold.
dfX = pd.DataFrame(Xtrainfinal1, columns=X.columns)
dfy = pd.DataFrame(ytrainfinal1, columns=y.columns)

# Compare epoch counts; the batch size is held at 32.
for epovalue in (25, 50, 100, 150, 200):
    print(f'Test for Epoch: {epovalue}')
    for i in range(10):
        best_model, best_accuracy = None, 0.0
        counter, accuracyacumulada = 1, 0.0

        # 10-fold cross-validation
        for train_idx, val_idx in kfold.split(Xtrainfinal1, ytrainfinal1):
            # Slice this fold's rows and add the channel axis expected by Conv1D.
            X_trainP, X_val = (dfX.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            y_trainP, y_val = (dfy.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            X_trainP = tf.reshape(X_trainP, (*X_trainP.shape, 1))
            X_val = tf.reshape(X_val, (*X_val.shape, 1))

            model = create_cnn_model(X_trainP)
            history = model.fit(
                X_trainP, y_trainP,
                epochs=epovalue, batch_size=32,
                validation_data=(X_val, y_val), verbose=0,
            )

            # Validation accuracy of this fold (sigmoid output thresholded at 0.5).
            y_val_pred = (model.predict(X_val) > 0.5).astype(int)
            accuracy = accuracy_score(y_val, y_val_pred)

            counter += 1
            accuracyacumulada += accuracy

            # Remember the fold whose model scored best on its own validation split.
            if accuracy > best_accuracy:
                best_accuracy, best_model, best_history = accuracy, model, history

        print(f'{i+1}: {epovalue}')
        print(f'Best Validation Accuracy: {best_accuracy * 100:.2f}%')

        # Score the best fold's model on the held-out test set.
        y_test_pred = (best_model.predict(X_test1) > 0.5).astype(int)
        test_accuracy = accuracy_score(y_test1, y_test_pred)
        print(f'Test Accuracy: {test_accuracy * 100:.2f}%')
    print('####################################################')

#### Batch Size Values Testing

In [None]:
# Labelled views of the training arrays, built once and sliced per fold.
dfX = pd.DataFrame(Xtrainfinal1, columns=X.columns)
dfy = pd.DataFrame(ytrainfinal1, columns=y.columns)

# Compare batch sizes; the number of epochs is held at 25.
for batchvalue in (32, 64, 96, 128, 160):
    print(f'Test for Batch Size: {batchvalue}')
    for i in range(10):
        best_model, best_accuracy = None, 0.0
        counter, accuracyacumulada = 1, 0.0

        # 10-fold cross-validation
        for train_idx, val_idx in kfold.split(Xtrainfinal1, ytrainfinal1):
            # Slice this fold's rows and add the channel axis expected by Conv1D.
            X_trainP, X_val = (dfX.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            y_trainP, y_val = (dfy.loc[rows].to_numpy() for rows in (train_idx, val_idx))
            X_trainP = tf.reshape(X_trainP, (*X_trainP.shape, 1))
            X_val = tf.reshape(X_val, (*X_val.shape, 1))

            model = create_cnn_model(X_trainP)
            history = model.fit(
                X_trainP, y_trainP,
                epochs=25, batch_size=batchvalue,
                validation_data=(X_val, y_val), verbose=0,
            )

            # Validation accuracy of this fold (sigmoid output thresholded at 0.5).
            y_val_pred = (model.predict(X_val) > 0.5).astype(int)
            accuracy = accuracy_score(y_val, y_val_pred)

            counter += 1
            accuracyacumulada += accuracy

            # Remember the fold whose model scored best on its own validation split.
            if accuracy > best_accuracy:
                best_accuracy, best_model, best_history = accuracy, model, history

        print(f'{i+1}: {batchvalue}')
        print(f'Best Validation Accuracy: {best_accuracy * 100:.2f}%')

        # Score the best fold's model on the held-out test set.
        y_test_pred = (best_model.predict(X_test1) > 0.5).astype(int)
        test_accuracy = accuracy_score(y_test1, y_test_pred)
        print(f'Test Accuracy: {test_accuracy * 100:.2f}%')
    print('####################################################')

### Final CNN Model Evaluation

In [None]:
# Redo the 80/20 hold-out split for the final evaluation (same seed as before).
X_train1, X_test1, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Give the training frames a fresh 0..n-1 index so the positional fold indices can
# be used as labels with .loc in the cross-validation loop.
X_train, y_train = X_train1.reset_index(drop=True), y_train.reset_index(drop=True)

# Plain NumPy versions of the training and test data.
Xtrainfinal1, ytrainfinal1 = X_train.to_numpy(), y_train.to_numpy()
y_test1 = y_test.to_numpy()
X_test1 = X_test1.to_numpy()

# Conv1D consumes 3-D input (samples, steps, channels): append a single channel axis.
X_test1 = tf.reshape(X_test1, (*X_test1.shape, 1))

In [None]:
# Initialize StratifiedKFold for 10-fold cross-validation.
# This rebinds `kfold`, replacing the plain KFold splitter used by the tuning cells.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=45)

In [None]:
# Learning-rate schedule for the final model: starts at 0.01 and decays by a
# factor of 0.9 per 200 optimizer steps.
initial_learning_rate = 0.01
lr_schedule = ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=200,
    decay_rate=0.9,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, accuracy_score

In [None]:
# Final CNN architecture (this rebinds any earlier create_cnn_model definition).
def create_cnn_model(Xtrainfinal1):
    """Build and compile the final CNN.

    Layout: three Conv1D layers (50, 25, 10 filters; kernel size 7, ReLU,
    'same' padding), each followed by BatchNormalization, with MaxPooling1D(2)
    after the second; then Flatten -> Dense(10, relu) -> BatchNormalization ->
    Dropout(0.1) -> Dense(1, sigmoid).

    Args:
        Xtrainfinal1: 3-D training tensor; only its last two dimensions are
            read, to set the input shape of the first convolution.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam driven by
        the notebook-level `lr_schedule`, accuracy metric).
    """
    sample_shape = (Xtrainfinal1.shape[1], Xtrainfinal1.shape[2])
    model = Sequential([
        # Convolution and pooling layers
        Conv1D(filters=50, kernel_size=7, activation='relu', input_shape=sample_shape, padding='same'),
        BatchNormalization(),
        Conv1D(filters=25, kernel_size=7, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=10, kernel_size=7, activation='relu', padding='same'),
        BatchNormalization(),
        Flatten(),
        # Dense layers
        Dense(10, activation='relu'),
        BatchNormalization(),
        Dropout(0.1),  # dropout for regularization
        Dense(1, activation='sigmoid'),
    ])

    # Alternative loss noted by the author: sparse_categorical_crossentropy,
    # in case splitting y with one-hot encoding does not work.
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
    return model

# State tracked across folds: the best model so far, its validation accuracy, the
# 1-based number of the next fold, and the running sum of fold accuracies.
best_model = None
best_accuracy = 0.0
counter = 1
accuracyacumulada = 0.0

# Perform 10-fold cross-validation
for train_idx, val_idx in kfold.split(Xtrainfinal1, ytrainfinal1):

    dfX = pd.DataFrame(Xtrainfinal1, columns=X.columns)
    dfy = pd.DataFrame(ytrainfinal1, columns=y.columns)

    X_trainP, X_val = dfX.loc[train_idx], dfX.loc[val_idx]
    y_trainP, y_val = dfy.loc[train_idx], dfy.loc[val_idx]

    X_trainP = X_trainP.to_numpy()
    X_val = X_val.to_numpy()
    y_trainP = y_trainP.to_numpy()
    y_val = y_val.to_numpy()

    # Conv1D consumes 3-D input (samples, steps, channels): append a channel axis.
    X_trainP = tf.reshape(X_trainP, (X_trainP.shape[0], X_trainP.shape[1], 1))
    X_val = tf.reshape(X_val, (X_val.shape[0], X_val.shape[1], 1))

    model = create_cnn_model(X_trainP)
    history = model.fit(X_trainP, y_trainP, epochs=25, batch_size=128, validation_data=(X_val, y_val), verbose=0)

    # Evaluate the model on the validation set. The raw sigmoid scores are kept
    # because the ROC/AUC computation below needs scores, not hard labels.
    y_val_prob = model.predict(X_val)
    y_val_pred = (y_val_prob > 0.5).astype(int)
    accuracy = accuracy_score(y_val, y_val_pred)
    print(f'Validation Accuracy {counter}: {accuracy * 100:.2f}%')

    counter += 1
    accuracyacumulada += accuracy

    # Keep the model and metrics of the fold with the highest validation accuracy.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_history = history
        bestcm = confusion_matrix(y_val, y_val_pred)
        best_precision = precision_score(y_val, y_val_pred)
        bestrecall = recall_score(y_val, y_val_pred)
        # Specificity = TN / (TN + FP), read from the first row of the matrix.
        bestspecificity = bestcm[0, 0] / (bestcm[0, 0] + bestcm[0, 1])
        bestf1 = f1_score(y_val, y_val_pred)
        # AUC must be computed from the continuous scores; feeding thresholded 0/1
        # predictions collapses the ROC curve to a single operating point.
        fpr, tpr, _ = roc_curve(y_val.ravel(), y_val_prob.ravel())
        bestroc_auc = auc(fpr, tpr)


print(' ')
# `counter` started at 1 and was incremented once per fold, so counter - 1 is the
# number of folds actually evaluated.
print(f'Average Validation Accuracy: {(accuracyacumulada / (counter - 1)) * 100:.2f}%')

In [None]:
# Metrics of the best cross-validation fold. Accuracy, precision, recall and
# specificity are scaled to percentages; F1 and AUC are reported on their native
# 0-1 scale, so they are printed without a percent sign.
print(f'Best Validation Accuracy: {best_accuracy * 100:.2f}%')
print(f'Best Validation Precision: {best_precision * 100:.2f}%')
print(f'Best Validation Recall: {bestrecall * 100:.2f}%')
print(f'Best Validation Specificity: {bestspecificity * 100:.2f}%')
print(f'Best Validation F1: {bestf1:.2f}')
print(f'Best Validation AUC: {bestroc_auc:.2f}')

In [None]:
# Get the current learning rate from the optimizer of `model`, i.e. the model
# trained in the last cross-validation fold (not necessarily `best_model`).
# NOTE(review): the optimizer was built with an ExponentialDecay schedule, so depending
# on the installed Keras version this may yield the schedule object rather than a
# number — confirm.
current_learning_rate = tf.keras.backend.get_value(model.optimizer.learning_rate)
print("Current Learning Rate:", current_learning_rate)

In [None]:
import pydot
import graphviz
from tensorflow.keras.utils import plot_model

In [None]:
import matplotlib.pyplot as plt

# Training accuracy and training loss of the best fold, drawn on one shared axis.
fig, ax = plt.subplots(figsize=(12, 6))

for history_key, curve_label in (('accuracy', 'Train Accuracy'), ('loss', 'Train Loss')):
    ax.plot(best_history.history[history_key], label=curve_label)

ax.set_title('Model Accuracy and Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Value')
ax.legend(loc='upper left')

In [None]:
# Hard test-set labels: the best model's sigmoid outputs thresholded at 0.5.
y_test_pred = (best_model.predict(X_test1) > 0.5).astype(int)

# Share of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test1, y_test_pred)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels).
confusion = confusion_matrix(y_test1, y_test_pred)
confusion

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Draw the test-set confusion matrix as an annotated heat map.
sns.set(font_scale=1.2)
plt.figure(figsize=(6, 6))

column_ticks = ['Predicted 0', 'Predicted 1']
row_ticks = ['Actual 0', 'Actual 1']
sns.heatmap(
    confusion,
    annot=True,
    fmt="d",                  # integer counts in the cells
    cmap="Blues",
    cbar=False,
    annot_kws={"size": 15},
    xticklabels=column_ticks,
    yticklabels=row_ticks,
)

plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Test-set metrics computed from the hard predictions.
cm = confusion_matrix(y_test1, y_test_pred)
accuracy = accuracy_score(y_test1, y_test_pred)
precision = precision_score(y_test1, y_test_pred)
recall = recall_score(y_test1, y_test_pred)  # sensitivity
f1 = f1_score(y_test1, y_test_pred)

# Specificity = TN / (TN + FP); both counts sit in the first row of the matrix.
specificity = cm[0, 0] / cm[0, :2].sum()

# Report every metric with four decimals.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall (Sensitivity)", recall),
    ("Specificity", specificity),
    ("F1-score", f1),
):
    print(f"{metric_label}: {metric_value:.4f}")

In [None]:
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np

# Compute the AUC and draw the ROC curve.
def plot_roc_curve(y_true, y_scores, title="ROC Curve"):
    """Plot a ROC curve and return the area under it.

    Args:
        y_true: True binary labels.
        y_scores: Scores for the positive class, passed to `roc_curve`.
        title: Title shown above the plot.

    Returns:
        The area under the ROC curve as computed by `auc`.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
            label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc="lower right")
    ax.grid()
    plt.show()
    return roc_auc

# Validation ROC/AUC for the last cross-validation iteration.
# `X_val` / `y_val` are the validation split of the LAST fold, so they are scored
# with `model` (the network trained in that same fold). Scoring them with
# `best_model` would leak: unless the best fold happened to be the last one, these
# rows were part of best_model's training data.
y_val_scores = model.predict(X_val).ravel()  # predicted probabilities, validation split
auc_val = plot_roc_curve(y_val, y_val_scores, title="ROC Curve (Validation Set)")

# Test ROC/AUC for the model selected by cross-validation; the test set was never
# used for training.
y_test_scores = best_model.predict(X_test1).ravel()  # predicted probabilities, test set
auc_test = plot_roc_curve(y_test1, y_test_scores, title="ROC Curve (Test Set)")

print(f'Validation AUC: {auc_val:.2f}')
print(f'Test AUC: {auc_test:.2f}')