In [7]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from pyDeepInsight import ImageTransformer
from data_ingestion.read_protT5_data import train_df, test_df
from models.CNN import CNN

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, matthews_corrcoef
from models.LSTM import AttLSTM
from tensorflow.keras.callbacks import EarlyStopping

from models.PLS import PLSDA
from models.VotingClassifier import CustomVotingClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold

In [50]:
def features_to_images(X_train: np.ndarray, X_test: np.ndarray, pixels=(16, 16), method='tsne'):
    it = ImageTransformer(feature_extractor=method, discretization='bin', pixels=pixels)

    X_train_img = it.fit_transform(X_train)
    X_test_img = it.transform(X_test)

    X_train_img = np.asarray(X_train_img, dtype=np.float32)
    X_test_img = np.asarray(X_test_img, dtype=np.float32)

    if X_train_img.ndim == 3:
        X_train_img = X_train_img[..., None]
    if X_test_img.ndim == 3:
        X_test_img = X_test_img[..., None]

    X_train_img -= X_train_img.min()
    if X_train_img.max() != 0:
        X_train_img /= X_train_img.max()

    X_test_img -= X_test_img.min()
    if X_test_img.max() != 0:
        X_test_img /= X_test_img.max()

    return X_train_img, X_test_img


In [51]:
X_train = train_df.drop(['label'], axis=1).to_numpy()
y_train = train_df['label']

X_test = test_df.drop(['label'], axis=1).to_numpy()
y_test = test_df['label']

X_train, X_test = features_to_images(np.array(X_train), np.array(X_test))

In [None]:
cnn = CNN((16, 16, 3), learning_rate=1e-4)

cnn.fit(X_train, y_train, X_test, y_test, 200)

Epoch 1/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 99ms/step - accuracy: 0.5361 - auc: 0.8153 - loss: 0.8776 - val_accuracy: 0.5405 - val_auc: 0.5000 - val_loss: 0.6918
Epoch 2/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.5979 - auc: 0.6563 - loss: 0.7287 - val_accuracy: 0.5405 - val_auc: 0.4998 - val_loss: 0.6907
Epoch 3/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.6649 - auc: 0.7454 - loss: 0.6462 - val_accuracy: 0.5405 - val_auc: 0.5000 - val_loss: 0.6901
Epoch 4/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.7113 - auc: 0.7652 - loss: 0.6180 - val_accuracy: 0.5405 - val_auc: 0.4825 - val_loss: 0.6899
Epoch 5/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.6082 - auc: 0.6602 - loss: 0.7492 - val_accuracy: 0.5405 - val_auc: 0.4924 - val_loss: 0.6899
Epoch 6/200
[1m7/7[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x3b04052e0>

In [1]:
from data_ingestion.read_data import train_df, test_df
from feature_engineering.feature_scaling import FeatureScaling
from models.VotingClassifier import CustomVotingClassifier
from models.PLS import PLSDA

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, matthews_corrcoef
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import pandas as pd

rare_aas = 'UOZB'
train_df['peptide_name'] = train_df['peptide_name'].str.upper().replace("UOZB", "X", regex=True)
test_df['peptide_name'] = test_df['peptide_name'].str.upper().replace("UOZB", "X", regex=True)

scaler = FeatureScaling(['AAC', 'APAAC', 'PAAC', 'TPC'], 'noen', 500)

raw_train = scaler.feature_encoder(train_df)
X_train = pd.DataFrame(scaler.feature_reduction(raw_train, train_df['label']))

raw_test = scaler.feature_encoder(test_df)
X_test = pd.DataFrame(scaler.feature_reduction(raw_test, test_df['label'], False))

y_train, y_test = train_df['label'], test_df['label']

In [2]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8056,8057,8058,8059,8060,8061,8062,8063,8064,8065
0,-0.113106,1.026063,-0.890028,-0.564865,-0.711438,-0.449217,-0.616772,0.257091,0.928636,1.001111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.031467,-0.432133,-0.890028,-0.564865,-0.711438,-0.449217,-0.616772,0.502431,-0.523912,-0.425096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.842389,1.639815,-0.890028,-0.564865,-0.711438,-0.449217,-0.616772,-0.980493,1.540010,-0.425096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.740843,1.150697,-0.890028,-0.564865,-0.711438,1.339185,0.750591,-0.308811,-0.523912,-0.425096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.842389,-0.432133,1.255579,2.574968,0.826007,-0.449217,-0.616772,-0.980493,-0.523912,-0.425096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,-0.842389,7.204306,1.913074,1.486133,-0.711438,-0.449217,-0.616772,-0.980493,2.011708,-0.425096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
773,-0.842389,-0.432133,1.354230,-0.564865,0.896696,4.156150,-0.616772,-0.115659,-0.523912,-0.425096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
774,-0.842389,-0.432133,1.657351,-0.564865,1.113899,-0.449217,-0.616772,0.001149,1.780387,-0.425096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
775,1.645455,-0.432133,0.022951,0.771169,-0.711438,-0.449217,-0.616772,1.130428,-0.523912,-0.425096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
