In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
positive_df = pd.read_csv('generateDB/data/sequence_data.csv')
positive_df['label'] = 1
negative_df = pd.read_csv('generateDB/data/negative_data.csv')
negative_df['label'] = 0

# combine the dataframes
df = pd.concat([positive_df, negative_df], ignore_index=True)

# shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

df.head()


Unnamed: 0,Sequence 1,Sequence 2,label
0,MVAHNQVAADNAVSTAAEPRRRPEPSSSSSSSPAAPARPRPCPAVP...,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,1
1,MFRTKRSALVRRLWRSRAPGGEDEEEGAGGGGGGGELRGEGATDSR...,MFRTKRSALVRRLWRSRAPGGEDEEEGAGGGGGGGELRGEGATDSR...,1
2,MPSRTGPKMEGSGGRVRLKAHYGGDIFITSVDAATTFEELCEEVRD...,MKSNQERSNECLPPKKREIPATSRSSEEKAPTLPSDNHRVEGTAWL...,0
3,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,MVKISFQPAVAGIKGDKADKASASAPAPASATEILLTPAREEQPPQ...,0
4,MQSKVLLAVALWLCVETRAASVGLPSVSLDLPRLSIQKDILTIKAN...,MATQADLMELDMAMEPDRKAAVSHWQQQSYLDSGIHSGATTTAPSL...,1


In [3]:
from src.feature_extractor import extract_features

# extract features from the sequences 1
features_1 = df['Sequence 1'].apply(extract_features)
# convert the features to a numpy array
features_1 = np.array(features_1.tolist())

# extract features from the sequences 2
features_2 = df['Sequence 2'].apply(extract_features)
# convert the features to a numpy array
features_2 = np.array(features_2.tolist())

# extract the labels
labels = df['label'].values

In [4]:
feature_df = pd.DataFrame(columns=['molecular_weight_1', 'isoelectric_point_1', 'alanine_1', 'arginine_1', 'asparagine_1', 'aspartic_acid_1', 'cysteine_1', 'glutamic_acid_1', 'glutamine_1', 'glycine_1', 'histidine_1', 'isoleucine_1', 'leucine_1', 'lysine_1', 'methionine_1', 'phenylalanine_1', 'proline_1', 'serine_1', 'threonine_1', 'tryptophan_1', 'tyrosine_1', 'valine_1', 'molecular_weight_2', 'isoelectric_point_2', 'alanine_2', 'arginine_2', 'asparagine_2', 'aspartic_acid_2', 'cysteine_2', 'glutamic_acid_2', 'glutamine_2', 'glycine_2', 'histidine_2', 'isoleucine_2', 'leucine_2', 'lysine_2', 'methionine_2', 'phenylalanine_2', 'proline_2', 'serine_2', 'threonine_2', 'tryptophan_2', 'tyrosine_2', 'valine_2', 'label'])

for i in tqdm(range(len(features_1))):
    feature_df.loc[i] = np.concatenate((features_1[i], features_2[i], [labels[i]]))
    
feature_df.head()

100%|██████████| 65393/65393 [04:05<00:00, 265.97it/s]


Unnamed: 0,molecular_weight_1,isoelectric_point_1,alanine_1,arginine_1,asparagine_1,aspartic_acid_1,cysteine_1,glutamic_acid_1,glutamine_1,glycine_1,...,lysine_2,methionine_2,phenylalanine_2,proline_2,serine_2,threonine_2,tryptophan_2,tyrosine_2,valine_2,label
0,23550.6179,10.979691,0.123223,0.023697,0.037915,0.042654,0.056872,0.047393,0.037915,0.023697,...,0.055328,0.05123,0.025615,0.040984,0.080943,0.060451,0.079918,0.014344,0.043033,1.0
1,46425.2922,8.630592,0.07277,0.044601,0.042254,0.056338,0.032864,0.115023,0.028169,0.023474,...,0.023474,0.077465,0.039906,0.070423,0.077465,0.035211,0.051643,0.018779,0.030516,1.0
2,67659.2067,5.485838,0.045608,0.027027,0.084459,0.072635,0.048986,0.070946,0.030405,0.060811,...,0.026994,0.106748,0.089571,0.035583,0.109202,0.058896,0.063804,0.003681,0.023313,0.0
3,59834.0535,7.103817,0.08209,0.016791,0.039179,0.078358,0.039179,0.080224,0.016791,0.029851,...,0.041199,0.044944,0.037453,0.074906,0.052434,0.048689,0.086142,0.003745,0.048689,0.0
4,151525.0639,5.600141,0.050147,0.024336,0.050885,0.075959,0.030236,0.062684,0.019174,0.058997,...,0.039693,0.044814,0.06274,0.049936,0.053777,0.06146,0.069142,0.008963,0.021767,1.0


In [5]:
feature_df.to_csv('generateDB/data/feature_data.csv', index=False)

In [6]:
# split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature_df.drop('label', axis=1), feature_df['label'], test_size=0.2, random_state=42)

# scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# use GNN to train the model
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [10]:
# Define model architecture
input_layer = Input(shape=(X_train.shape[1],))
hidden_layer_1 = Dense(64, activation='relu')(input_layer)
hidden_layer_2 = Dense(32, activation='relu')(hidden_layer_1)
output_layer = Dense(1, activation='sigmoid')(hidden_layer_2)

In [11]:
# Build model
model = Model(input_layer, output_layer)

# Compile model
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy'])



In [12]:
# Train model
model.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1f9414b6e50>

: 