In [1]:
import sys
import os
# Get the directory path of the current script
current_script_directory = os.path.dirname(os.path.abspath(__file__))
# Construct the path to the src directory
src_directory = os.path.join(current_script_directory, "..", "src")
srcpro_directory = os.path.join(current_script_directory, "..", "src/propythia")

# Add the src directory to sys.path
sys.path.append(src_directory)
sys.path.append(srcpro_directory)

# Quickstart Protein encodings and DL
This jupyter notebook will demonstrate how to obtain protein encodings and use DL models to classify sequences. 
We will use the same dataset of antimicrobial peptides as the other protein notebooks. 

In [2]:
import pandas as pd
import numpy as np
from Bio import SeqIO

from propythia.protein.sequence import ReadSequence
from propythia.protein.encoding import Encoding

2023-08-25 12:15:48.826092: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
  warn(f"Failed to load image Python extension: {e}")


## 1. Get data

First, lets get the data. 

We will use the Antimicrobial peptides case study using Veltri study.

The collection of data is available at
https://www.dveltri.com/ascan/v2/news.html

D. Veltri, U. Kamath, A. Shehu, Deep learning improves antimicrobial
peptide recognition, Bioinformatics 34 (16) (2018) 2740{2747. doi:10.1093/bioinformatics/bty179.e

In [3]:
amps_file = './AMP_Scan2_Feb2020_Dataset/AMPS_02182020.fasta'
non_amps_file = './AMP_Scan2_Feb2020_Dataset/DECOYS_02182020.fasta'


sequences = SeqIO.parse(amps_file, "fasta")
data = []

for record in sequences:
    data.append([record.id, str(record.seq), 1])

sequences = SeqIO.parse(non_amps_file, "fasta")
for record in sequences:
    data.append([record.id, str(record.seq), 0])

df = pd.DataFrame(data, columns=["ID", "sequence", 'label'])
print(df.head())

        ID                                          sequence  label
0  AP02484                             GMASKAGSVLGKITKIALGAL      1
1  AP02630       NIGLFTSTCFSSQCFSSKCFTDTCFSSNCFTGRHQCGYTHGSC      1
2  AP01427                    GAIKDALKGAAKTVAVELLKKAQCKLEKTC      1
3  AP02983                             FFGRLKAVFRGARQGWKEHRY      1
4  AP01815  DFGCARGMIFVCMRRCARMYPGSTGYCQGFRCMCDTMIPIRRPPFIMG      1


In [4]:
# Get the maximum and minimum length of strings in the specified column
max_length = df["sequence"].str.len().max()
min_length = df["sequence"].str.len().min()

print(f"Maximum length: {max_length}")
print(f"Minimum length: {min_length}")

Maximum length: 183
Minimum length: 11


The module ReadSequence contains functions built to preprocess the protein sequences, by replacing or remove certain amino acids. 

The function "par_preprocessing" was designed to deal with pandas dataframes (it is required to specify the atribute dataset and the column of protein sequences) while the function "get_preprocessing" was designed to process only one sequence.

The preprocessing phase may be required to calculate certain descriptors features or encodings.

In [5]:
read_seqs = ReadSequence()
res = read_seqs.par_preprocessing(dataset= df, col = 'sequence', B ='N', Z = 'Q', U = 'C', O = 'K', J = 'I', X = '')
res

Unnamed: 0,ID,sequence,label
0,AP02484,GMASKAGSVLGKITKIALGAL,1
1,AP02630,NIGLFTSTCFSSQCFSSKCFTDTCFSSNCFTGRHQCGYTHGSC,1
2,AP01427,GAIKDALKGAAKTVAVELLKKAQCKLEKTC,1
3,AP02983,FFGRLKAVFRGARQGWKEHRY,1
4,AP01815,DFGCARGMIFVCMRRCARMYPGSTGYCQGFRCMCDTMIPIRRPPFIMG,1
...,...,...,...
4037,UniRef50_C5DJ44,SSGNVNEVPKQNAKHPMDSCQNLEQSAGTTSAEKEAIRALESQSSG...,0
4038,UniRef50_Q9XUP3,ESCNFAVFWKLVKGAYKPTTNPNEPFKVPGEVPKMIKPMVGFEDAV...,0
4039,UniRef50_Q9Y573,VAALNDCIYSVGGWNETQDALHTVEKYSFEEEKWVEVASMKVPRAG...,0
4040,UniRef50_Q54H44,PHTHTQKEVITSSVD,0


# 2. Encode sequences
The module Encoding performs the encoding of the protein sequences.

As the ProteinDescriptors it accepts as dataset a pandas dataframe, list of sequences and a sequence (string) as dataset. The parameter col is the name of the column to store the sequences, or the column where the sequences are present (pandas dataframe).

In [6]:
enconde_df = Encoding(dataset= df ,  col= 'sequence')
enconde_df.result

Unnamed: 0,ID,sequence,label
0,AP02484,GMASKAGSVLGKITKIALGAL,1
1,AP02630,NIGLFTSTCFSSQCFSSKCFTDTCFSSNCFTGRHQCGYTHGSC,1
2,AP01427,GAIKDALKGAAKTVAVELLKKAQCKLEKTC,1
3,AP02983,FFGRLKAVFRGARQGWKEHRY,1
4,AP01815,DFGCARGMIFVCMRRCARMYPGSTGYCQGFRCMCDTMIPIRRPPFIMG,1
...,...,...,...
4037,UniRef50_C5DJ44,SSGNVNEVPKQNAKHPMDSCQNLEQSAGTTSAEKEAIRALESQSSG...,0
4038,UniRef50_Q9XUP3,ESCNFAVFWKLVKGAYKPTTNPNEPFKVPGEVPKMIKPMVGFEDAV...,0
4039,UniRef50_Q9Y573,VAALNDCIYSVGGWNETQDALHTVEKYSFEEEKWVEVASMKVPRAG...,0
4040,UniRef50_Q54H44,PHTHTQKEVITSSVD,0


To perform the encoding operations the intended function must be called. 

It can be one-hot-encoded, NLF, Blosum, z_scale. It also can be performed a padding to all sequences in the dataframe.
Lets look at them 

In [7]:
hot_encoded = enconde_df.get_hot_encoded()
print(hot_encoded['One_hot_encoding'][0].shape)
print(hot_encoded['One_hot_encoding'])

2023-08-25 12:15:50.597914: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-25 12:15:50.597914: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-25 12:15:50.597914: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-25 12:15:50.597915: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
  warn(f"Failed to load image Python extension: {e}")
  warn(f"Failed to load image Python extension: {e}")
  warn(f"Failed to load image Python extension: {e}")
  warn(f"Failed to load image Python extension: {e}")
2023-08-25 12:15:52.635252: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-25 12:15:52.948726: I tensorflow/stream_execut

(21, 21)
0       [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,...
1       [[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
2       [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,...
3       [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
4       [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,...
                              ...                        
4037    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
4038    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,...
4039    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
4040    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
4041    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: One_hot_encoding, Length: 4042, dtype: object


  warn(f"Failed to load image Python extension: {e}")


In [8]:
nlf = enconde_df.get_nlf()
print(np.array(nlf['nlf'][0]).shape)
print(nlf['nlf'])

(21, 18)
0       [[1.32, 2.05, 0.6, 0.31, 0.61, 0.58, 0.0, 0.3,...
1       [[1.68, 0.3, 0.49, 0.15, 0.09, 0.59, 0.06, 0.0...
2       [[1.32, 2.05, 0.6, 0.31, 0.61, 0.58, 0.0, 0.3,...
3       [[2.37, 0.23, 0.09, 0.37, 0.19, 0.04, 0.03, 0....
4       [[0.81, 0.13, 1.36, 0.63, 0.15, 0.1, 0.45, 0.3...
                              ...                        
4037    [[1.47, 1.11, 0.27, 0.13, 0.15, 0.22, 0.09, 0....
4038    [[1.56, 0.48, 0.87, 0.02, 0.07, 0.13, 0.22, 0....
4039    [[1.33, 1.39, 0.15, 0.4, 0.04, 0.27, 0.07, 0.1...
4040    [[1.41, 0.27, 1.09, 0.77, 0.87, 0.33, 0.04, 0....
4041    [[1.29, 1.21, 0.25, 0.96, 0.18, 0.06, 0.04, 0....
Name: nlf, Length: 4042, dtype: object


In [9]:
blosum = enconde_df.get_blosum()
print(np.array(blosum['blosum'][0]).shape)
print(blosum['blosum'])

(21, 23)
0       [[0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2,...
1       [[-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, ...
2       [[0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2,...
3       [[-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3...
4       [[-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, ...
                              ...                        
4037    [[1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1,...
4038    [[-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2,...
4039    [[0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2,...
4040    [[-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, ...
4041    [[-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2...
Name: blosum, Length: 4042, dtype: object


In [10]:
zscale = enconde_df.get_zscale()
print(np.array(zscale['zscale'][0]).shape)
print(zscale['zscale'])

(21, 5)
0       [[2.05, -4.06, 0.36, -0.82, -0.38], [-2.85, -0...
1       [[3.05, 1.62, 1.04, -1.15, 1.61], [-3.89, -1.7...
2       [[2.05, -4.06, 0.36, -0.82, -0.38], [0.24, -2....
3       [[-4.22, 1.94, 1.06, 0.54, -0.62], [-4.22, 1.9...
4       [[3.98, 0.93, 1.93, -2.46, 0.75], [-4.22, 1.94...
                              ...                        
4037    [[2.39, -1.07, 1.15, -1.39, 0.67], [2.39, -1.0...
4038    [[3.11, 0.26, -0.11, -0.34, -0.25], [2.39, -1....
4039    [[-2.59, -2.64, -1.54, -0.85, -0.02], [0.24, -...
4040    [[-1.66, 0.27, 1.84, 0.7, 2.0], [2.47, 1.95, 0...
4041    [[-4.28, -1.3, -1.49, -0.72, 0.84], [0.75, -2....
Name: zscale, Length: 4042, dtype: object


## 3. one hot encoding + DL

The encodings will have give to each aminoacid a vector of 21 (hot encoded), 18 (nlf), 23 (blosum)  or 5 (z scale) dimensions. 
This means that each sequence will be represented as a vector of X * sequence length. 
In the lines above we print the first sequence that has 21 aminoacids. however, different sequences will have different lengths. 
Deep learning models cannot deal with different shape inputs. 
Therefore, one must assure that all sequences have the same lengths. This is achieved padding sequences with 0s until a defined length, or truncating sequences (cut) that have more than a defined length. 


The function get_pad_and_hot_encoding allows to perform the padding and the one hot encoding of the sequence at the same time. The one-hot-encoded sequence will have the shape of (length of the sequences, number of amino acids in the alphabet).

As the antimicrobial peptides in the dataset are small. Is not very computationally expensive to padd all sequences to the max len - 183

Scaling and feature selectin is not required.


In [11]:
enconde_df = Encoding(dataset= df ,  col= 'sequence')

res = enconde_df.get_pad_and_hot_encoding(seq_len=183)
res

Unnamed: 0,ID,sequence,label,pad_seques,One_hot_encoding
0,AP02484,GMASKAGSVLGKITKIALGAL,1,GMASKAGSVLGKITKIALGALXXXXXXXXXXXXXXXXXXXXXXXXX...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,..."
1,AP02630,NIGLFTSTCFSSQCFSSKCFTDTCFSSNCFTGRHQCGYTHGSC,1,NIGLFTSTCFSSQCFSSKCFTDTCFSSNCFTGRHQCGYTHGSCXXX...,"[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,AP01427,GAIKDALKGAAKTVAVELLKKAQCKLEKTC,1,GAIKDALKGAAKTVAVELLKKAQCKLEKTCXXXXXXXXXXXXXXXX...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,..."
3,AP02983,FFGRLKAVFRGARQGWKEHRY,1,FFGRLKAVFRGARQGWKEHRYXXXXXXXXXXXXXXXXXXXXXXXXX...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,AP01815,DFGCARGMIFVCMRRCARMYPGSTGYCQGFRCMCDTMIPIRRPPFIMG,1,DFGCARGMIFVCMRRCARMYPGSTGYCQGFRCMCDTMIPIRRPPFI...,"[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...
4037,UniRef50_C5DJ44,SSGNVNEVPKQNAKHPMDSCQNLEQSAGTTSAEKEAIRALESQSSG...,0,SSGNVNEVPKQNAKHPMDSCQNLEQSAGTTSAEKEAIRALESQSSG...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4038,UniRef50_Q9XUP3,ESCNFAVFWKLVKGAYKPTTNPNEPFKVPGEVPKMIKPMVGFEDAV...,0,ESCNFAVFWKLVKGAYKPTTNPNEPFKVPGEVPKMIKPMVGFEDAV...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,..."
4039,UniRef50_Q9Y573,VAALNDCIYSVGGWNETQDALHTVEKYSFEEEKWVEVASMKVPRAG...,0,VAALNDCIYSVGGWNETQDALHTVEKYSFEEEKWVEVASMKVPRAG...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4040,UniRef50_Q54H44,PHTHTQKEVITSSVD,0,PHTHTQKEVITSSVDXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


train test split

In [12]:
from sklearn.model_selection import train_test_split

expanded_arrays = res['One_hot_encoding'].apply(lambda x: np.array(x))
X = np.array(expanded_arrays.tolist())


# X = np.array(res['One_hot_encoding'].apply(lambda x: np.array(x[0])))
y = res['label']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
print('train_x', X_train.shape)
print('test_x', x_test.shape)

train_x (2708, 183, 21)
test_x (1334, 183, 21)


Define a DL model. 
Convolutional and RNN are good choices for this problem. 

Besides that, adding callbacks such as early stopping and modelCheckpoint may be very beneficial

We first will use the tensorfow library as example and then a DL model train with Propythia will be made

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Conv1D,Conv2D, Flatten, MaxPool1D,MaxPool2D, Dropout, Input,GRU
from tensorflow.keras.layers import Embedding, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Define callbacks
checkpoint = ModelCheckpoint(filepath='best_model.h5', save_best_only=True, monitor='val_accuracy', mode='max')
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)


# CNN based model

In [14]:
model = Sequential([
    Input(shape=(183, 21)),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=2),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
history = model.fit(X_train, y_train, batch_size=64, epochs=100, validation_split=0.2,callbacks=[checkpoint, early_stopping])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(x_test, y_test)

2023-08-25 12:15:58.477915: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-08-25 12:15:58.478424: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-08-25 12:15:58.521453: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-25 12:15:58.521579: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.545GHz coreCount: 68 deviceMemorySize: 10.75GiB deviceMemoryBandwidth: 573.69GiB/s
2023-08-25 12:15:58.521621: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

Epoch 1/100


2023-08-25 12:15:59.674957: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2023-08-25 12:15:59.814545: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.7
2023-08-25 12:16:00.501274: W tensorflow/stream_executor/gpu/asm_compiler.cc:63] Running ptxas --version returned 256
2023-08-25 12:16:00.589148: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: ptxas exited with non-zero error code 256, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


In [15]:
print('===================================')
print("Mean Training Accuracy:", np.mean(history.history['accuracy']))
print("Mean Validation Accuracy:", np.mean(history.history['val_accuracy']))
print("Mean Training Loss:", np.mean(history.history['loss']))
print("Mean Validation Loss:", np.mean(history.history['val_loss']))

print('===================================')

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


Mean Training Accuracy: 0.9534899614475392
Mean Validation Accuracy: 0.8823971549669901
Mean Training Loss: 0.10858457520522212
Mean Validation Loss: 0.48998429046736824
Test Loss: 0.3145
Test Accuracy: 0.8823


In [16]:
# a RNN - GRU based model

In [17]:
model = Sequential([
    Input(shape=(183, 21)),
    GRU(units=32, return_sequences=True),
    Flatten(),
    Dense(8, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, batch_size=64,epochs = 100,validation_split=0.2, callbacks=[checkpoint, early_stopping])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(x_test, y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


In [18]:

print('===================================')
print("Mean Training Accuracy:", np.mean(history.history['accuracy']))
print("Mean Validation Accuracy:", np.mean(history.history['val_accuracy']))
print("Mean Training Loss:", np.mean(history.history['loss']))
print("Mean Validation Loss:", np.mean(history.history['val_loss']))

print('===================================')

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Mean Training Accuracy: 0.9131225926034591
Mean Validation Accuracy: 0.8863143044359544
Mean Training Loss: 0.21726608758463578
Mean Validation Loss: 0.2910834103822708
Test Loss: 0.2889
Test Accuracy: 0.8951


In [19]:
# network with CONV and LSTM layers

In [20]:
# Define the model
model = Sequential([
    Input(shape=(183, 21)),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=2),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=2),
    LSTM(units=128, return_sequences=True),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, batch_size=128, epochs=100, validation_split=0.2, callbacks=[checkpoint, early_stopping])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(x_test, y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


In [21]:
print('===================================')
print("Mean Training Accuracy:", np.mean(history.history['accuracy']))
print("Mean Validation Accuracy:", np.mean(history.history['val_accuracy']))
print("Mean Training Loss:", np.mean(history.history['loss']))
print("Mean Validation Loss:", np.mean(history.history['val_loss']))

print('===================================')

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Mean Training Accuracy: 0.8972128489438225
Mean Validation Accuracy: 0.8510492987492505
Mean Training Loss: 0.2609128632089671
Mean Validation Loss: 0.4027911407106063
Test Loss: 0.3285
Test Accuracy: 0.8665


# lets try an RNN with other encoding models
Be awre that for a fair comparison the same train test split should be used. 
In this case, we dont have a function to do it automatically, so we will pad sequences until 18 manually

Z-scales

In [22]:
max_length = 183
padding_value = 'X'
df['padded_sequence'] = [seq + padding_value * (max_length - len(seq)) for seq in df['sequence']]

#open 
enconde_df = Encoding(dataset= df ,  col= 'padded_sequence')

zscale = enconde_df.get_zscale()

expanded_arrays =  zscale['zscale'].apply(lambda x: np.array(x))
X = np.array(expanded_arrays.tolist())

y = zscale['label']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
print('train_x', X_train.shape)
print('test_x', x_test.shape)

train_x (2708, 183, 5)
test_x (1334, 183, 5)


In [23]:
model = Sequential([
    Input(shape=(183, 5)),
    LSTM(units=32, return_sequences=True),
    Flatten(),
    Dense(8, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, batch_size=64,epochs = 100,validation_split=0.2, callbacks=[checkpoint, early_stopping])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(x_test, y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


In [24]:
print('===================================')
print("Mean Training Accuracy:", np.mean(history.history['accuracy']))
print("Mean Validation Accuracy:", np.mean(history.history['val_accuracy']))
print("Mean Training Loss:", np.mean(history.history['loss']))
print("Mean Validation Loss:", np.mean(history.history['val_loss']))

print('===================================')

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Mean Training Accuracy: 0.9202611808265958
Mean Validation Accuracy: 0.8773062740053449
Mean Training Loss: 0.2002831588366202
Mean Validation Loss: 0.3188035179461752
Test Loss: 0.3027
Test Accuracy: 0.8793


blosum. lets also try a LSTM 

In [25]:
#open 
enconde_df = Encoding(dataset= df ,  col= 'padded_sequence')

blosum = enconde_df.get_blosum()

expanded_arrays =  blosum['blosum'].apply(lambda x: np.array(x))
X = np.array(expanded_arrays.tolist())

y = blosum['label']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
print('train_x', X_train.shape)
print('test_x', x_test.shape)


# the input shape needs to be different
model = Sequential([
    Input(shape=(183, 23)),
    LSTM(units=64, return_sequences=True),
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train same model
# Train the model
history = model.fit(X_train, y_train, batch_size=64,epochs = 100,validation_split=0.2, callbacks=[checkpoint, early_stopping])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(x_test, y_test)


print('===================================')
print("Mean Training Accuracy:", np.mean(history.history['accuracy']))
print("Mean Validation Accuracy:", np.mean(history.history['val_accuracy']))
print("Mean Training Loss:", np.mean(history.history['loss']))
print("Mean Validation Loss:", np.mean(history.history['val_loss']))

print('===================================')

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

train_x (2708, 183, 23)
test_x (1334, 183, 23)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Mean Training Accuracy: 0.9430859397197592
Mean Validation Accuracy: 0.894134116583857
Mean Training Loss: 0.1441328083004417
Mean Validation Loss: 0.31810459340440816
Test Loss: 0.2737
Test Accuracy: 0.9033


In [26]:
#open 
enconde_df = Encoding(dataset= df ,  col= 'padded_sequence')

nlf = enconde_df.get_nlf()

expanded_arrays =  nlf['nlf'].apply(lambda x: np.array(x))
X = np.array(expanded_arrays.tolist())

y = nlf['label']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
print('train_x', X_train.shape)
print('test_x', x_test.shape)


# the input shape needs to be different
model = Sequential([
    Input(shape=(183, 18)),
    LSTM(units=32, return_sequences=True),
    Flatten(),
    Dense(8, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train same model
# Train the model
history = model.fit(X_train, y_train, batch_size=64,epochs = 100,validation_split=0.2, callbacks=[checkpoint, early_stopping])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(x_test, y_test)


print('===================================')
print("Mean Training Accuracy:", np.mean(history.history['accuracy']))
print("Mean Validation Accuracy:", np.mean(history.history['val_accuracy']))
print("Mean Training Loss:", np.mean(history.history['loss']))
print("Mean Validation Loss:", np.mean(history.history['val_loss']))

print('===================================')

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

train_x (2708, 183, 18)
test_x (1334, 183, 18)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Mean Training Accuracy: 0.8813924995752481
Mean Validation Accuracy: 0.8646749911399988
Mean Training Loss: 0.2707398980855942
Mean Validation Loss: 0.32246709958865094
Test Loss: 0.3199
Test Accuracy: 0.8778


These models can also be optimized to search for a better combination of parameters.
Different architectures and encoding schemes are possible. 
One should explore optimization algorithms. 


Below is an example using Propythia. 
We will also use the same Veltri use in the paper describing the data. The modle uses an embedding layer followed by CONV and LSTM. As the model receives an embedding layer one should give the sequences as integers. Here we define a function that assignes a int to eachAminoacid and also do padding using tensorflow. 

In [27]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences


def pad_sequence(sequences):
#     sequences = df['seq'].tolist()
    alphabet = "XARNDCEQGHILKMFPSTWYV"
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))
    int_to_char = dict((i, c) for i, c in enumerate(alphabet))
    # {'X': 0,
    #  'A': 1,
    #  'R': 2,
    #  'N': 3,
    #  'D': 4,...
    sequences_integer_ecoded = []
    for seq in sequences:
        integer_encoded = [char_to_int[char] for char in seq]
        sequences_integer_ecoded.append(integer_encoded)
    fps_x = pad_sequences(sequences_integer_ecoded, maxlen=183, padding='pre', value=0.0)   
    return fps_x



X = pad_sequence(df['sequence'])
y = res['label']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
print('train_x', X_train.shape)
print('test_x', x_test.shape)

train_x (2708, 183)
test_x (1334, 183)


In [28]:
def veltri_model(units=100):
    model = Sequential()
    model.add(Input(shape=(183,)))
    model.add(Embedding(input_dim=21, output_dim=128, input_length=200, mask_zero=True))
    model.add(Conv1D(
        filters=64,
        kernel_size=16,
        strides=1,
        padding='same',
        activation='relu'))
    model.add(MaxPool1D(pool_size=5, strides=1, padding='same'))
    model.add(LSTM(units=units,
                   dropout=0.1,
                   unroll=True,
                   return_sequences=False,
                   stateful=False))

    # Add Classification Dense, Compile model and make it ready for optimization
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [29]:
from propythia.ml.deep_ml import DeepML
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

dl=DeepML(X_train, y_train, x_test, y_test, number_classes=2, problem_type='binary',
          x_dval=None, y_dval=None, epochs=100, batch_size=64,
          path='', report_name=None, verbose=1,
         early_stopping_patience=30, reduce_lr_patience=20, reduce_lr_factor=0.2, reduce_lr_min=0.00001,
                 )

model = KerasClassifier(build_fn= veltri_model)

# run the model in Propythia
history = dl.run_model(model)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.39377, saving model to weights.hdf5
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.39377
Epoch 3/100

Epoch 00003: val_loss improved from 0.39377 to 0.35992, saving model to weights.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 0.35992 to 0.33479, saving model to weights.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 0.33479 to 0.31303, saving model to weights.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 0.31303 to 0.28701, saving model to weights.hdf5
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.28701
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.28701
Epoch 9/100

Epoch 00009: val_loss improved from 0.28701 to 0.28632, saving model to weights.hdf5
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.28632
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.28632
Epoch 12/100

Epoch 00012: val_loss did not improve from 0.28632
Epoch 13/100

Epoch 00013: va

<Figure size 640x480 with 0 Axes>

In [30]:
# evaluate on propythia
scores, report, cm, cm2 = dl.score_testset_classification()
scores

{'Accuracy': 0.8920539730134932,
 'MCC': 0.7841114710106724,
 'log_loss': 0.5132134758672886,
 'f1 score': 0.891891891891892,
 'roc_auc': 0.8920539730134934,
 'Precision': array([0.5       , 0.89323308, 1.        ]),
 'Recall': array([1.        , 0.89055472, 0.        ]),
 'fdr': 0.10676691729323308,
 'sn': 0.8905547226386806,
 'sp': 0.8935532233883059}

Lets demonstrate how a Cross validation model would run in ProPythia. Here the X and Y are the total X and Y (not splitted). The model and everything defined above in ml class is the same. We will define CV as 3 just for speed purposes. 

In [31]:
import sys
sys.path.append(r'/home/martinha/propythia/propythia/src/propythia/')
sys.path.append(r'/home/martinha/propythia/propythia/src/')


import pandas as pd
import numpy as np
from Bio import SeqIO

from propythia.protein.sequence import ReadSequence
from propythia.protein.encoding import Encoding

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Conv1D,Conv2D, Flatten, MaxPool1D,MaxPool2D, Dropout, Input,GRU
from tensorflow.keras.layers import Embedding, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Define callbacks
checkpoint = ModelCheckpoint(filepath='best_model.h5', save_best_only=True, monitor='val_accuracy', mode='max')
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)




amps_file = './AMP_Scan2_Feb2020_Dataset/AMPS_02182020.fasta'
non_amps_file = './AMP_Scan2_Feb2020_Dataset/DECOYS_02182020.fasta'


sequences = SeqIO.parse(amps_file, "fasta")
data = []

for record in sequences:
    data.append([record.id, str(record.seq), 1])

sequences = SeqIO.parse(non_amps_file, "fasta")
for record in sequences:
    data.append([record.id, str(record.seq), 0])

df = pd.DataFrame(data, columns=["ID", "sequence", 'label'])
print(df.head())

read_seqs = ReadSequence()
res = read_seqs.par_preprocessing(dataset= df, col = 'sequence', B ='N', Z = 'Q', U = 'C', O = 'K', J = 'I', X = '')


from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences


def pad_sequence(sequences):
#     sequences = df['seq'].tolist()
    alphabet = "XARNDCEQGHILKMFPSTWYV"
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))
    int_to_char = dict((i, c) for i, c in enumerate(alphabet))
    # {'X': 0,
    #  'A': 1,
    #  'R': 2,
    #  'N': 3,
    #  'D': 4,...
    sequences_integer_ecoded = []
    for seq in sequences:
        integer_encoded = [char_to_int[char] for char in seq]
        sequences_integer_ecoded.append(integer_encoded)
    fps_x = pad_sequences(sequences_integer_ecoded, maxlen=183, padding='pre', value=0.0)   
    return fps_x



X = pad_sequence(df['sequence'])
y = res['label']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
print('train_x', X_train.shape)
print('test_x', x_test.shape)

def veltri_model(units=100):
    model = Sequential()
    model.add(Input(shape=(183,)))
    model.add(Embedding(input_dim=21, output_dim=128, input_length=200, mask_zero=True))
    model.add(Conv1D(
        filters=64,
        kernel_size=16,
        strides=1,
        padding='same',
        activation='relu'))
    model.add(MaxPool1D(pool_size=5, strides=1, padding='same'))
    model.add(LSTM(units=units,
                   dropout=0.1,
                   unroll=True,
                   return_sequences=False,
                   stateful=False))

    # Add Classification Dense, Compile model and make it ready for optimization
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

        ID                                          sequence  label
0  AP02484                             GMASKAGSVLGKITKIALGAL      1
1  AP02630       NIGLFTSTCFSSQCFSSKCFTDTCFSSNCFTGRHQCGYTHGSC      1
2  AP01427                    GAIKDALKGAAKTVAVELLKKAQCKLEKTC      1
3  AP02983                             FFGRLKAVFRGARQGWKEHRY      1
4  AP01815  DFGCARGMIFVCMRRCARMYPGSTGYCQGFRCMCDTMIPIRRPPFIMG      1
train_x (2708, 183)
test_x (1334, 183)


In [33]:
# try a CV model
from propythia.ml.deep_ml import DeepML
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

dl=DeepML(X, y, x_test = None, y_test = None, number_classes=2, problem_type='binary',
          x_dval=None, y_dval=None, epochs=100, batch_size=64,
          path='', report_name=None, verbose=1,
         early_stopping_patience=30, reduce_lr_patience=20, reduce_lr_factor=0.2, reduce_lr_min=0.00001,
                 )

model = KerasClassifier(build_fn= veltri_model)


veltri_cv = dl.train_model_cv(x_cv = X, y_cv = y, cv=5, model=model)
veltri_cv



Fold  0
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.44123, saving model to weights.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.44123 to 0.26787, saving model to weights.hdf5
Epoch 3/100

Epoch 00003: val_loss did not improve from 0.26787
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.26787
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.26787
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.26787
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.26787
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.26787
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.26787
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.26787
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.26787
Epoch 12/100

Epoch 00012: val_loss did not improve from 0.26787
Epoch 13/100

Epoch 00013: val_loss did not improve from 0.26787
Epoch 14/100

Epoch 00014: val_loss improved from 0.26787 to 0.11624, saving model to weigh


Epoch 00042: val_loss did not improve from 0.11624
Epoch 43/100

Epoch 00043: val_loss did not improve from 0.11624
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.11624
Epoch 00044: early stopping

Fold  1
Epoch 1/100

Epoch 00001: val_loss did not improve from 0.11624
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.11624
Epoch 3/100

Epoch 00003: val_loss did not improve from 0.11624
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.11624
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.11624
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.11624
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.11624
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.11624
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.11624
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.11624
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.11624
Epoch 12/100

Epoch 00012: val_loss did not improve from 0.11624
Epoch 13/1


Epoch 00039: val_loss did not improve from 0.09354
Epoch 40/100

Epoch 00040: val_loss did not improve from 0.09354
Epoch 41/100

Epoch 00041: val_loss did not improve from 0.09354
Epoch 42/100

Epoch 00042: val_loss did not improve from 0.09354
Epoch 43/100

Epoch 00043: val_loss did not improve from 0.09354
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.09354
Epoch 45/100

Epoch 00045: val_loss did not improve from 0.09354
Epoch 00045: early stopping

Fold  2
Epoch 1/100

Epoch 00001: val_loss did not improve from 0.09354
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.09354
Epoch 3/100

Epoch 00003: val_loss did not improve from 0.09354
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.09354
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.09354
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.09354
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.09354
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.09354
Epoch 9/1


Epoch 00036: val_loss did not improve from 0.09354
Epoch 37/100

Epoch 00037: val_loss did not improve from 0.09354
Epoch 38/100

Epoch 00038: val_loss did not improve from 0.09354
Epoch 39/100

Epoch 00039: val_loss did not improve from 0.09354
Epoch 40/100

Epoch 00040: val_loss did not improve from 0.09354
Epoch 41/100

Epoch 00041: val_loss did not improve from 0.09354
Epoch 42/100

Epoch 00042: val_loss did not improve from 0.09354
Epoch 43/100

Epoch 00043: val_loss did not improve from 0.09354
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.09354
Epoch 45/100

Epoch 00045: val_loss did not improve from 0.09354
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.09354
Epoch 47/100

Epoch 00047: val_loss did not improve from 0.09354
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.09354
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.09354
Epoch 50/100

Epoch 00050: val_loss did not improve from 0.09354
Epoch 51/100

Epoch 00051: val_loss di


Epoch 00015: val_loss did not improve from 0.09354
Epoch 16/100

Epoch 00016: val_loss did not improve from 0.09354
Epoch 17/100

Epoch 00017: val_loss did not improve from 0.09354
Epoch 18/100

Epoch 00018: val_loss did not improve from 0.09354
Epoch 19/100

Epoch 00019: val_loss did not improve from 0.09354
Epoch 20/100

Epoch 00020: val_loss did not improve from 0.09354
Epoch 21/100

Epoch 00021: val_loss did not improve from 0.09354
Epoch 22/100

Epoch 00022: val_loss did not improve from 0.09354
Epoch 23/100

Epoch 00023: val_loss did not improve from 0.09354
Epoch 24/100

Epoch 00024: val_loss did not improve from 0.09354
Epoch 25/100

Epoch 00025: val_loss did not improve from 0.09354
Epoch 26/100

Epoch 00026: val_loss did not improve from 0.09354
Epoch 27/100

Epoch 00027: val_loss did not improve from 0.09354
Epoch 28/100

Epoch 00028: val_loss did not improve from 0.09354
Epoch 29/100

Epoch 00029: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.

Epoch 0


Epoch 00017: val_loss did not improve from 0.09354
Epoch 18/100

Epoch 00018: val_loss did not improve from 0.09354
Epoch 19/100

Epoch 00019: val_loss did not improve from 0.09354
Epoch 20/100

Epoch 00020: val_loss did not improve from 0.09354
Epoch 21/100

Epoch 00021: val_loss did not improve from 0.09354
Epoch 22/100

Epoch 00022: val_loss did not improve from 0.09354
Epoch 23/100

Epoch 00023: val_loss did not improve from 0.09354
Epoch 24/100

Epoch 00024: val_loss did not improve from 0.09354
Epoch 25/100

Epoch 00025: val_loss did not improve from 0.09354
Epoch 26/100

Epoch 00026: val_loss did not improve from 0.09354
Epoch 27/100

Epoch 00027: val_loss did not improve from 0.09354
Epoch 28/100

Epoch 00028: val_loss did not improve from 0.09354
Epoch 29/100

Epoch 00029: val_loss did not improve from 0.09354
Epoch 30/100

Epoch 00030: val_loss did not improve from 0.09354
Epoch 31/100

Epoch 00031: val_loss did not improve from 0.09354
Epoch 32/100

Epoch 00032: val_loss di

Unnamed: 0,Accuracy,MCC,log_loss,f1 score,roc_auc,Precision,Recall,fdr,sn,sp
0,0.906057,0.812113,0.531961,0.906173,0.906057,"[0.5006180469715699, 0.9061728395061729, 1.0]","[1.0, 0.9061728395061729, 0.0]",0.093827,0.906173,0.905941
1,0.908529,0.818281,0.531906,0.910843,0.908563,"[0.49938195302843014, 0.8873239436619719, 1.0]","[1.0, 0.9356435643564357, 0.0]",0.112676,0.935644,0.881481
2,0.902228,0.804458,0.571435,0.902107,0.902228,"[0.5, 0.9032258064516129, 1.0]","[1.0, 0.900990099009901, 0.0]",0.096774,0.90099,0.903465
3,0.893564,0.787602,0.321176,0.891688,0.893564,"[0.5, 0.9076923076923077, 1.0]","[1.0, 0.8762376237623762, 0.0]",0.092308,0.876238,0.910891
4,0.891089,0.783562,0.585931,0.894231,0.891089,"[0.5, 0.8691588785046729, 1.0]","[1.0, 0.9207920792079208, 0.0]",0.130841,0.920792,0.861386
mean,0.900293,0.801203,0.508482,0.901008,0.9003,,,0.105285,0.907967,0.892633
std,0.006853,0.013546,0.096068,0.007175,0.006861,,,0.014696,0.019946,0.018593
