In [338]:
%load_ext autoreload
%autoreload 2

from data_ingestion.read_data import train_df, test_df
from feature_engineering.feature_scaling import FeatureScaling
from feature_engineering.kmer_encoding import Kmer

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, matthews_corrcoef
from models.LSTM import AttLSTM
from tensorflow.keras.callbacks import EarlyStopping
from models.CNN import CNN

from sklearn.preprocessing import MinMaxScaler
from gensim.models import KeyedVectors

import pandas as pd
import numpy as np


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [296]:
rare_aas = 'UOZB' 
train_df['peptide_name'] = train_df['peptide_name'].str.upper().replace(
    "UOZB", "X", regex=True)

test_df['peptide_name'] = test_df['peptide_name'].str.upper().replace(
    'UOZB', "X", regex=True)

In [375]:
scaler = FeatureScaling(['AAC', 'APAAC', 'PAAC', 'TPC'], 'PCA', 500)

raw_train = scaler.feature_encoder(train_df)
X_train = pd.DataFrame(scaler.feature_reduction(raw_train, train_df['label']))

raw_test = scaler.feature_encoder(test_df)
X_test = pd.DataFrame(scaler.feature_reduction(raw_test, test_df['label'], False))

y_train, y_test = train_df['label'], test_df['label']

In [332]:
X_train, X_test = Kmer(train_df, 3).encode_features(), Kmer(test_df, 3).encode_features()
y_train, y_test = train_df['label'], test_df['label']

In [333]:
model = KeyedVectors.load("models/protVec_100d_3grams.model", mmap='r')

def embed_protein_kmers(kmers_list, model, vector_size):
    zero_vec = np.zeros(vector_size, dtype=np.float32)
    embeddings = []

    for kmer in kmers_list:
        try:
            vec = model[kmer]  
        except KeyError:
            vec = zero_vec  
        embeddings.append(vec)

    embeddings = np.array(embeddings, dtype=np.float32)
    if embeddings.shape[0] == 0:
        return zero_vec  

    return np.array(embeddings).mean(axis=0)

X_train = np.array([embed_protein_kmers(seq, model, 100) for seq in X_train])
X_test = np.array([embed_protein_kmers(seq, model, 100) for seq in X_test])

In [334]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [376]:
X_train = np.array(X_train).reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))

In [377]:
input_shape = (777, 500)

model = AttLSTM(input_shape=input_shape, learning_rate=1e-4)

early_stopping = EarlyStopping(
        monitor='val_accuracy',     
        mode='max',                  
        patience=300,
        restore_best_weights=True,   
        verbose=1
    )

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test), 
    epochs=300,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

Epoch 1/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.5135 - loss: 0.6931 - val_accuracy: 0.5464 - val_loss: 0.6925
Epoch 2/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6319 - loss: 0.6873 - val_accuracy: 0.5515 - val_loss: 0.6916
Epoch 3/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7001 - loss: 0.6821 - val_accuracy: 0.5979 - val_loss: 0.6906
Epoch 4/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7284 - loss: 0.6760 - val_accuracy: 0.5979 - val_loss: 0.6895
Epoch 5/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7503 - loss: 0.6682 - val_accuracy: 0.5928 - val_loss: 0.6877
Epoch 6/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7967 - loss: 0.6570 - val_accuracy: 0.6082 - val_loss: 0.6852
Epoch 7/300
[1m25/25[0m [32m━━

In [379]:
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype("int32")


tn, fp, fn, tp = confusion_matrix(y_test, y_pred_classes).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
auc = roc_auc_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred_classes)

print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"AUC: {auc:.4f}")
print(f"MCC: {mcc:.4f}")

results = {
    'Fold': 'Test',
    "Accuracy": round(accuracy, 4),
    "Sensitivity": round(sensitivity, 4),
    "Specificity": round(specificity, 4),
    "AUC": round(auc, 4),
    "MCC": round(mcc, 4)
}

lstm_results = pd.DataFrame([results]).to_csv('results/lstm.csv', index=False)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.7113
Sensitivity (Recall): 0.6966
Specificity: 0.7238
AUC: 0.7403
MCC: 0.4198


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, initializers

class FeatureReshaper(tf.keras.Model):
    def __init__(self, input_dim, reshape_dim=(25, 20)):
        super(FeatureReshaper, self).__init__()
        self.input_dim = input_dim
        self.reshape_dim = reshape_dim
        self.reshape_size = reshape_dim[0] * reshape_dim[1]

        self.reshape_dense = layers.Dense(
            self.reshape_size,
            activation="relu",
            kernel_initializer=initializers.GlorotUniform(),
            name="reshape_dense"
        )

    def call(self, inputs):
        x = self.reshape_dense(inputs)
        x = tf.reshape(x, (-1, self.reshape_dim[0], self.reshape_dim[1], 1))
        return x


reshaper = FeatureReshaper(input_dim=500, reshape_dim=(25, 20))

reshaper.build(input_shape=(None, 500))

X_train = reshaper(X_train).numpy()
X_test = reshaper(X_test).numpy()


In [364]:
cnn = CNN(input_shape=(25, 20, 1))

cnn.fit(X_train, y_train, X_test, y_test, epochs=60, batch_size=8)

Epoch 1/60
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.5315 - auc: 0.5533 - loss: 0.8962 - val_accuracy: 0.5309 - val_auc: 0.4875 - val_loss: 0.6914
Epoch 2/60
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5393 - auc: 0.5554 - loss: 0.8697 - val_accuracy: 0.5464 - val_auc: 0.4862 - val_loss: 0.7009
Epoch 3/60
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5251 - auc: 0.5304 - loss: 0.8912 - val_accuracy: 0.5258 - val_auc: 0.5259 - val_loss: 0.7403
Epoch 4/60
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5264 - auc: 0.5483 - loss: 0.8742 - val_accuracy: 0.5515 - val_auc: 0.5525 - val_loss: 0.7307
Epoch 5/60
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5457 - auc: 0.5566 - loss: 0.8783 - val_accuracy: 0.5619 - val_auc: 0.5576 - val_loss: 0.7585
Epoch 6/60
[1m98/98[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x420b5cfb0>

In [374]:
y_pred = cnn.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype("int32")


tn, fp, fn, tp = confusion_matrix(y_test, y_pred_classes).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
auc = roc_auc_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred_classes)

print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"AUC: {auc:.4f}")
print(f"MCC: {mcc:.4f}")

results = {
    'Fold': 'Test',
    "Accuracy": round(accuracy, 4),
    "Sensitivity": round(sensitivity, 4),
    "Specificity": round(specificity, 4),
    "AUC": round(auc, 4),
    "MCC": round(mcc, 4)
}

cnn_results = pd.DataFrame([results]).to_csv('results/cnn.csv', index=False)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Accuracy: 0.5722
Sensitivity (Recall): 0.7978
Specificity: 0.3810
AUC: 0.6797
MCC: 0.1945
