In [1]:
%load_ext autoreload
%autoreload 2

from data_ingestion.read_data import train_df, test_df
from feature_engineering.feature_scaling import FeatureScaling
from feature_engineering.kmer_encoding import Kmer
from feature_engineering.one_hot import OneHotEncoder
from models.VotingClassifier import CustomVotingClassifier
from models.PLS import PLSDA

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, matthews_corrcoef
from models.LSTM import AttLSTM

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from gensim.models import KeyedVectors

import shap
import pandas as pd
import numpy as np


In [None]:
rare_aas = 'UOZB' 
train_df['peptide_name'] = train_df['epitope_name'].str.upper().replace(
    "UOZB", "X", regex=True)

test_df['epitope_name'] = test_df['epitope_name'].str.upper().replace(
    'UOZB', "X", regex=True)

In [3]:
scaler = FeatureScaling(['AAC', 'APAAC', 'PAAC', 'TPC'], 'PCA', 500)

raw_train = scaler.feature_encoder(train_df)
X_train = pd.DataFrame(scaler.feature_reduction(raw_train, train_df['label']))

raw_test = scaler.feature_encoder(test_df)
X_test = pd.DataFrame(scaler.feature_reduction(raw_test, test_df['label'], False))

y_train, y_test = train_df['label'], test_df['label']

In [4]:
model = CustomVotingClassifier(
                estimators=[
                    ('pls', PLSDA(n_components=15)),
                    ('log', LogisticRegression(C=0.0027059021490395217,
                                               max_iter=5000,
                                               penalty='l2',
                                               solver='liblinear',
                                               random_state=42)),
                    ('svm', SVC(C=0.001098881873199668,
                                kernel='rbf',
                                gamma=0.010951255895390475754,
                                probability=True,
                                random_state=42)),
                    ('rf', RandomForestClassifier(
                        n_estimators=401,
                        max_depth=40,
                        min_samples_split=7,
                        min_samples_leaf=1,
                        max_features='log2',
                        random_state=42
                    )),
                    ('xgb', XGBClassifier(
                        n_esimators=700,
                        max_depth=15,
                        random_state=42)),
                    ('knn', KNeighborsClassifier()),
                    
                    
                ],
                voting='soft',
                weights=[1, 1, 1, 1, 1, 1]
)

In [5]:
model.fit(X_train, y_train)

0,1,2
,estimators,"[('pls', ...), ('log', ...), ...]"
,voting,'soft'
,weights,"[1, 1, ...]"


In [6]:
test_results = []

y_test_pred = model.predict(X_test)

y_test_prob = model.predict_proba(X_test)[:, 1]

acc_test = model.score(X_test, y_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
specificity_test = tn / (tn + fp)
sensitivity_test = tp / (tp + fn)

auc_test = roc_auc_score(y_test, y_test_prob)
mcc_test = matthews_corrcoef(y_test, y_test_pred)

test_results.append({
    'ACCURACY': round(acc_test, 4),
    'SENSITIVITY': round(sensitivity_test, 4),
    'SPECIFICITY': round(specificity_test, 4),
    'AUC SCORE': round(auc_test, 4),
    'MCC SCORE': round(mcc_test, 4)
})

In [7]:
test_results

[{'ACCURACY': 0.7423,
  'SENSITIVITY': 0.7865,
  'SPECIFICITY': 0.7048,
  'AUC SCORE': 0.783,
  'MCC SCORE': 0.49}]

In [179]:
X_train, X_test = Kmer(train_df, 3).encode_features(), Kmer(test_df, 3).encode_features()
y_train, y_test = train_df['label'], test_df['label']

In [174]:
X_train, X_test = OneHotEncoder().encode_many(train_df['epitope_name']), OneHotEncoder().encode_many(test_df['epitope_name'])
y_train, y_test = train_df['label'], test_df['label']

# num_samples, seq_len, num_aa = X_train.shape
# X_train = X_train.reshape(num_samples, seq_len * num_aa)

# num_samples, seq_len, num_aa = X_test.shape
# X_test = X_test.reshape(num_samples, seq_len * num_aa)


In [168]:
model = KeyedVectors.load("models/protVec_100d_3grams.model", mmap='r')

def embed_protein_kmers(kmers_list, model, vector_size):
    zero_vec = np.zeros(vector_size, dtype=np.float32)
    embeddings = []

    for kmer in kmers_list:
        try:
            vec = model[kmer]  
        except KeyError:
            vec = zero_vec  
        embeddings.append(vec)

    embeddings = np.array(embeddings, dtype=np.float32)
    if embeddings.shape[0] == 0:
        return zero_vec  

    return np.array(embeddings).mean(axis=0)

X_train = np.array([embed_protein_kmers(seq, model, 100) for seq in X_train])
X_test = np.array([embed_protein_kmers(seq, model, 100) for seq in X_test])

In [169]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [170]:
X_train = np.array(X_train).reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))

In [175]:
input_shape = (777, 500)

model = AttLSTM(input_shape=input_shape, learning_rate=1e-4)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test), 
    epochs=200,
    batch_size=32,
    verbose=1
)

Epoch 1/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.4916 - loss: 0.6932 - val_accuracy: 0.5258 - val_loss: 0.6929
Epoch 2/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5393 - loss: 0.6924 - val_accuracy: 0.5412 - val_loss: 0.6922
Epoch 3/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5405 - loss: 0.6918 - val_accuracy: 0.5412 - val_loss: 0.6919
Epoch 4/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5405 - loss: 0.6913 - val_accuracy: 0.5412 - val_loss: 0.6912
Epoch 5/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5405 - loss: 0.6906 - val_accuracy: 0.5412 - val_loss: 0.6906
Epoch 6/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5405 - loss: 0.6898 - val_accuracy: 0.5412 - val_loss: 0.6898
Epoch 7/200
[1m25/25[0m [32m━━

In [177]:
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype("int32")


tn, fp, fn, tp = confusion_matrix(y_test, y_pred_classes).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
auc = roc_auc_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred_classes)

print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"AUC: {auc:.4f}")
print(f"MCC: {mcc:.4f}")


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Accuracy: 0.6392
Sensitivity (Recall): 0.6629
Specificity: 0.6190
AUC: 0.6633
MCC: 0.2811
