In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Dataset directory on the mounted Drive.
# NOTE(review): PATH is not referenced by any cell visible in this notebook;
# the CSV and sequence files are opened by bare filename. Confirm whether the
# loaders should be prefixed with it or whether this constant can go.
PATH = '/content/drive/MyDrive/Dataset/'

In [1]:
import pandas as pd

In [3]:
# Read the positive and negative pair tables. header=None means the files have
# no header row, so the columns are addressed by integer label (0, 1, ...).
positive_df, negative_df = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('positive_100.csv', 'negative_100.csv')
)

In [4]:
def load_protein_sequences(file_path):
    """Read a tab-separated ``id<TAB>sequence`` file into a dictionary.

    Each line is stripped of surrounding whitespace and split on tabs. Only
    lines that yield exactly two fields are kept; anything else (blank lines,
    lines with no tab, lines with extra tabs) is silently ignored. When an id
    occurs more than once, the last occurrence wins.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping the first field of each kept line to its second field.
    """
    with open(file_path, 'r') as handle:
        rows = (raw.strip().split('\t') for raw in handle)
        # Build the mapping while the file is still open; `rows` is lazy.
        return {fields[0]: fields[1] for fields in rows if len(fields) == 2}
# Build the id -> sequence lookup that later replaces the protein ids in `data`.
# Lines that do not split into exactly two tab-separated fields are dropped by
# the loader, so an id missing here surfaces as a KeyError at lookup time.
protein_sequences = load_protein_sequences('protein_sequences.txt')

In [5]:
# Mirror every pair (A, B) -> (B, A) so both orientations can be appended to
# the dataset.
#
# The earlier form, `pd.DataFrame([df[1], df[0]]).transpose()`, kept the Series
# names as column labels, producing columns labelled (1, 0). The following
# `pd.concat` aligns on column labels, so the values went straight back into
# their original columns and the rows were only duplicated, never swapped.
# Building the frame from an explicit {label: column} mapping puts the second
# protein under label 0 and the first under label 1, which is a real swap.
positive_df_2 = pd.DataFrame({0: positive_df[1], 1: positive_df[0]})
negative_df_2 = pd.DataFrame({0: negative_df[1], 1: negative_df[0]})

In [6]:
# Append the second copy of each table beneath the original one.
# Caution: this rebinds positive_df / negative_df, so re-running the cell
# appends the extra rows again.
positive_df, negative_df = (
    pd.concat(list(pair))
    for pair in ((positive_df, positive_df_2), (negative_df, negative_df_2))
)

In [7]:
# Column 2 carries the class label: 1 for the positive table, 0 for the
# negative one. The labelled tables are then stacked with a fresh 0..n-1 index.
for frame, class_label in ((positive_df, 1), (negative_df, 0)):
    frame[2] = class_label
data = pd.concat([positive_df, negative_df], ignore_index=True)

In [8]:
# Shuffle all rows (frac=1 keeps every row) with a fixed seed for
# reproducibility, then renumber the index from 0.
data = data.sample(frac=1,random_state=42).reset_index(drop=True)

In [9]:
# Replace the protein ids in columns 0 and 1 with their sequences.
# An id absent from `protein_sequences` raises KeyError. The cell overwrites
# the id columns, so it cannot be re-run without rebuilding `data` first.
for column in (0, 1):
    data[column] = [protein_sequences[protein_id] for protein_id in data[column]]

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Pull the three columns of `data` out as arrays:
#   sequences1 / sequences2 - amino acid sequence of protein 1 / protein 2
#   labels                  - interaction label (0 or 1)
sequences1, sequences2, labels = (data[column].values for column in (0, 1, 2))

In [12]:
# Character-level tokenizer: each distinct character gets its own integer id.
tokenizer = Tokenizer(char_level=True)
# NOTE(review): both operands come from `.values`, so `+` joins the strings
# pair by pair instead of appending one array to the other. Every character
# still reaches the tokenizer, but if "fit on all sequences" was the intent an
# explicit concatenation would state it more clearly.
tokenizer.fit_on_texts(sequences1 + sequences2)

# Fixed model input length (example value, adjust as needed).
maxlen = 100

# Convert each sequence into a list of integer token ids.
encoded_sequences1, encoded_sequences2 = (
    tokenizer.texts_to_sequences(batch) for batch in (sequences1, sequences2)
)

# Pad short sequences at the end up to `maxlen`.
# NOTE(review): `truncating` is left at its default, which is believed to cut
# from the front of sequences longer than `maxlen`. Confirm this is intended.
padded_sequences1, padded_sequences2 = (
    pad_sequences(encoded, maxlen=maxlen, padding='post')
    for encoded in (encoded_sequences1, encoded_sequences2)
)

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.keras.models import Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# K-fold cross-validation of a two-input LSTM classifier.
#
# For every fold a fresh model is built, trained on the training split and
# scored on the held-out split; accuracy, precision, recall and F1 are
# collected per fold and averaged at the end.

# Number of folds
k = 5  # You can adjust this value

# NOTE(review): every protein pair appears twice in `data` (each table was
# concatenated with a second copy of itself before shuffling), so a plain
# shuffled KFold can place one copy in the training split and its counterpart
# in the test split. Consider a group-aware splitter keyed on pair identity
# before trusting these scores.
kf = KFold(n_splits=k, shuffle=True, random_state=42)  # Shuffle and set random_state for reproducibility

# Per-fold evaluation results
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Vocabulary size for the embedding layer. It does not depend on the fold, so
# it is computed once. The +1 leaves room for index 0, the padding value.
vocab_size = len(tokenizer.word_index) + 1

# Iterate through folds
for fold, (train_index, test_index) in enumerate(kf.split(padded_sequences1)):  # padded_sequences1 and padded_sequences2 have the same number of rows
    print(f"Fold {fold + 1}")

    # Split data into training and testing sets for this fold
    train_sequences1_fold, test_sequences1_fold = padded_sequences1[train_index], padded_sequences1[test_index]
    train_sequences2_fold, test_sequences2_fold = padded_sequences2[train_index], padded_sequences2[test_index]
    train_labels_fold, test_labels_fold = labels[train_index], labels[test_index]

    # Input layers, one per protein sequence
    input1 = Input(shape=(maxlen,))
    input2 = Input(shape=(maxlen,))

    # A single Embedding instance is applied to both inputs, so both branches
    # share the same amino acid vectors.
    embedding_layer = Embedding(vocab_size, 128)  # 128-dimensional embeddings
    embedded_sequences1 = embedding_layer(input1)
    embedded_sequences2 = embedding_layer(input2)

    # Likewise a single LSTM instance (shared weights) encodes both sequences.
    lstm_layer = LSTM(64)  # 64 LSTM units
    lstm_output1 = lstm_layer(embedded_sequences1)
    lstm_output2 = lstm_layer(embedded_sequences2)

    # Concatenate the two sequence encodings
    merged = concatenate([lstm_output1, lstm_output2])

    # Single sigmoid unit for binary classification
    output = Dense(1, activation='sigmoid')(merged)

    # Create the model
    model = Model(inputs=[input1, input2], outputs=output)

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model on this fold's training data
    model.fit([train_sequences1_fold, train_sequences2_fold], train_labels_fold, epochs=20, batch_size=32)

    # Evaluate the model on this fold's testing data; probabilities above 0.5
    # are counted as class 1.
    predictions = model.predict([test_sequences1_fold, test_sequences2_fold])
    predicted_labels_fold = (predictions > 0.5).astype(int)

    accuracy = accuracy_score(test_labels_fold, predicted_labels_fold)
    precision = precision_score(test_labels_fold, predicted_labels_fold)
    recall = recall_score(test_labels_fold, predicted_labels_fold)
    f1 = f1_score(test_labels_fold, predicted_labels_fold)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Fold {fold + 1} - Accuracy: {accuracy:.4f}")

# Calculate the average of each metric across all folds
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recalls = np.mean(recalls)
average_f1_scores = np.mean(f1_scores)

# Each line reports its own metric (previously all four printed the accuracy).
print(f"Average Accuracy: {average_accuracy:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recalls: {average_recalls:.4f}")
print(f"Average F1 Scores: {average_f1_scores:.4f}")


Fold 1
Epoch 1/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 33ms/step - accuracy: 0.5389 - loss: 0.6902
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 33ms/step - accuracy: 0.5672 - loss: 0.6787
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 34ms/step - accuracy: 0.5914 - loss: 0.6673
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 35ms/step - accuracy: 0.5994 - loss: 0.6642
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - accuracy: 0.6031 - loss: 0.6530
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 33ms/step - accuracy: 0.6327 - loss: 0.6429
Epoch 7/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 35ms/step - accuracy: 0.6556 - loss: 0.6271
Epoch 8/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 34ms/step - accuracy: 0.6828 - loss: 0.5976
Epoch 9/20
[1m250/250[