# CNN predictor in Keras using BLOSUM

In this notebook we demonstrate how `peptidy` can be used to predict antimicrobial peptides (AMPs) with a convolutional neural network (CNN) built in Keras. The amino acid sequences are BLOSUM62-encoded with `peptidy`, and the resulting encodings are used as input to the CNN.



In [3]:
!pip install peptidy

Collecting peptidy
  Downloading peptidy-0.0.1-py3-none-any.whl.metadata (5.1 kB)
Downloading peptidy-0.0.1-py3-none-any.whl (21 kB)
Installing collected packages: peptidy
Successfully installed peptidy-0.0.1


In [4]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, Dropout, MaxPooling1D, Input

from peptidy.encoding import blosum62_encoding
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

### Load a dataframe with peptides

In [5]:
# Location of the sub-sampled AMP table (CSV hosted on GitHub)
csv_url = 'https://raw.githubusercontent.com/AryaVenkatesh2010/AryaAIProject/refs/heads/main/subsample_AMP.csv'
subsample_AMP = pd.read_csv(csv_url)

# Target: the 'active' column. Features: every remaining column.
y = subsample_AMP['active']
X = subsample_AMP.drop(columns='active')

### Encode and split the data

In [6]:
# BLOSUM62-encode every peptide with peptidy (padding_len=50) and collect
# the per-sequence encodings into a single NumPy array
X_encoded = np.array([
    blosum62_encoding(peptide, padding_len=50)
    for peptide in X['sequence']
])

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)


### Define the model

In [8]:
# Preview only the first rows instead of dumping the whole table
subsample_AMP.head(10)

Unnamed: 0,sequence,active
0,KKPLTDNGATSRHM,0
1,YRITRGDMFYVSAHDRHSYESVHELELDNILYIRNRLTLSADWQT,0
2,ETLIQTIESVRNVEGVLA,0
3,SSLSPILMDSFGDDLQKLKE,0
4,GRIVDLVEKRV,0
5,FHDRGPETLKCLYDESEDNNNF,0
6,ELTEFKPASEVQEPNEVKMSSGAHAGLQSAEQVAEQ,0
7,ADRTYFVTSSKDKSARLYDSRTLEVI,0
8,RNALYKWEFEESEEDPV,0
9,KITITECIYCGKDNKEVERNVKHMFSEHGLFIPERSYLIDLNGLLEFL,0


In [9]:
def CNN_model(max_sequence_length,
              len_encoding_vector=20,
              kernel_size_1D_layers=3,
              dropout=0.1,
              n_dense_layers=2,
              learning_rate=0.001,
              loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'],
              n_1D_conv=5,
              activation='relu',
              n_filters=10,
              n_dense_neurons=30):
    """Build and compile a 1D-convolutional binary classifier.

    Layer stack, in order:
      * an input of shape ``(max_sequence_length, len_encoding_vector)``;
      * ``n_1D_conv`` pairs of Conv1D ('same' padding) + Dropout, where the
        k-th convolution (1-based) has ``k * n_filters`` filters;
      * one MaxPooling1D followed by Flatten;
      * ``n_dense_layers`` pairs of Dense (ReLU) + Dropout, where the width
        starts at ``n_dense_neurons`` and is halved per layer (minimum 1);
      * a single sigmoid output unit.

    Parameters
    ----------
    max_sequence_length : int-like
        Number of positions per input sequence (cast with ``int``).
    len_encoding_vector : int
        Size of the encoding vector at each sequence position.
    kernel_size_1D_layers : int
        Kernel size shared by all Conv1D layers.
    dropout : float
        Dropout rate used after every convolutional and dense layer.
    n_dense_layers : int
        Number of hidden dense layers.
    learning_rate : float
        Learning rate given to the optimizer when ``optimizer`` is a string.
    loss : str or loss object
        Loss passed to ``model.compile``.
    optimizer : str or optimizer instance
        Optimizer name (e.g. ``'adam'``), built with ``learning_rate``. An
        already constructed optimizer instance is used unchanged, in which
        case ``learning_rate`` is not applied.
    metrics : list
        Accepted for backward compatibility but not used: the model is always
        compiled with accuracy, precision and recall, so that
        ``model.evaluate`` yields the loss followed by those three values.
    n_1D_conv : int
        Number of Conv1D layers.
    activation : str
        Activation of the Conv1D layers (dense layers always use ReLU).
    n_filters : int
        Filter count of the first Conv1D layer.
    n_dense_neurons : int
        Width of the first hidden dense layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model.
    """
    input_length = int(max_sequence_length)

    model = Sequential()
    model.add(Input(shape=(input_length, len_encoding_vector)))

    # Convolutional feature extractor: the filter count grows linearly with depth
    for layer_ix in range(n_1D_conv):
        model.add(Conv1D(filters=(layer_ix + 1) * n_filters,
                         kernel_size=kernel_size_1D_layers,
                         activation=activation,
                         padding='same'))
        model.add(Dropout(dropout))

    model.add(MaxPooling1D())
    model.add(Flatten())

    # Dense head: halve the width at each layer, but never go below one unit
    for layer_ix in range(n_dense_layers):
        model.add(Dense(
            max(n_dense_neurons // (2**layer_ix), 1),
            activation='relu',
        ))
        model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))

    # Honour the requested optimizer instead of always falling back to Adam.
    # A name is resolved together with the requested learning rate; anything
    # else (e.g. an optimizer instance) is passed through as given.
    if isinstance(optimizer, str):
        opt = keras.optimizers.get({
            'class_name': optimizer,
            'config': {'learning_rate': learning_rate},
        })
    else:
        opt = keras.optimizers.get(optimizer)

    # The metric list is fixed on purpose (see the `metrics` note in the docstring)
    model.compile(loss=loss,
                  optimizer=opt,
                  metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])
    return model


### Train the model

In [10]:
# Seed Python, NumPy and the Keras backend before the model is created so
# that weight initialisation, dropout and batch shuffling are repeatable
# (as far as the backend allows) from one run to the next.
keras.utils.set_random_seed(42)

# max_sequence_length matches the padding_len=50 used when encoding the peptides
model = CNN_model(
    max_sequence_length=50,
    len_encoding_vector=21,
    kernel_size_1D_layers=7,
    dropout=0.2,
    n_dense_layers=5,
    learning_rate=0.0001,
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
    n_1D_conv=2,
    activation='relu',
    n_filters=2,
    n_dense_neurons=32,
)
model.fit(X_train, y_train, epochs=20, batch_size=30, verbose=0)

<keras.src.callbacks.history.History at 0x7cabd936cec0>

### Evaluate the model

In [11]:
# Score the held-out split; the result is unpacked as the loss followed by
# the three metrics the model was compiled with
loss, accuracy, precision, recall = model.evaluate(x=X_test, y=y_test)
print(accuracy)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469ms/step - accuracy: 0.6000 - loss: 0.7089 - precision: 1.0000 - recall: 0.4286
0.6000000238418579
