In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
PATH = '/content/drive/MyDrive/Dataset/'

In [1]:
import pandas as pd

In [2]:
positive_df = pd.read_csv('positive_sample_7500_1.csv', header=None)
negative_df = pd.read_csv('negative_sample_7500_1.csv', header=None)

In [3]:
def load_protein_sequences(file_path):
    protein_dict = {}
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                protein_dict[parts[0]] = parts[1]
    return protein_dict
protein_sequences = load_protein_sequences('protein_sequences.txt')

In [4]:
positive_df_2 = pd.DataFrame([positive_df[1],positive_df[0]]).transpose()
negative_df_2 = pd.DataFrame([negative_df[1],negative_df[0]]).transpose()

In [5]:
positive_df = pd.concat([positive_df,positive_df_2])
negative_df = pd.concat([negative_df,negative_df_2])

In [6]:
positive_df[2] = 1
negative_df[2] = 0
data = pd.concat([positive_df, negative_df],ignore_index=True)

In [7]:
data = data.sample(frac=1,random_state=42).reset_index(drop=True)

In [8]:
data[0] = [protein_sequences[x] for x in data[0]]
data[1] = [protein_sequences[x] for x in data[1]]

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
sequences1 = data[0].values  # Amino acid sequences of protein 1
sequences2 = data[1].values  # Amino acid sequences of protein 2
labels = data[2].values     # Interaction labels (0 or 1)

In [11]:
tokenizer = Tokenizer(char_level=True)  # Tokenize at character level
tokenizer.fit_on_texts(sequences1 + sequences2)

# Convert sequences to numerical tokens
encoded_sequences1 = tokenizer.texts_to_sequences(sequences1)
encoded_sequences2 = tokenizer.texts_to_sequences(sequences2)

# Pad sequences to a fixed length (choose an appropriate maxlen)
maxlen = 250  # Example max length, adjust as needed
padded_sequences1 = pad_sequences(encoded_sequences1, maxlen=maxlen, padding='post')
padded_sequences2 = pad_sequences(encoded_sequences2, maxlen=maxlen, padding='post')

In [12]:
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size for embedding layer

# Input layers for protein sequences
input1 = Input(shape=(maxlen,))
input2 = Input(shape=(maxlen,))

# Embedding layers to represent amino acids as vectors
embedding_layer = Embedding(vocab_size, 128)  # 128-dimensional embeddings
embedded_sequences1 = embedding_layer(input1)
embedded_sequences2 = embedding_layer(input2)

# LSTM layers with additional hidden layers
lstm_layer1 = LSTM(64, return_sequences=True)  # First LSTM layer with return_sequences=True
lstm_layer2 = LSTM(64, return_sequences=True)  # Second LSTM layer with return_sequences=True
lstm_layer3 = LSTM(64)  # Third LSTM layer

lstm_output1 = lstm_layer1(embedded_sequences1)
lstm_output1 = lstm_layer2(lstm_output1)  # Pass output of first LSTM to the second
lstm_output1 = lstm_layer3(lstm_output1)  # Pass output of second LSTM to the third

lstm_output2 = lstm_layer1(embedded_sequences2)
lstm_output2 = lstm_layer2(lstm_output2)  # Pass output of first LSTM to the second
lstm_output2 = lstm_layer3(lstm_output2)  # Pass output of second LSTM to the third

# Concatenate LSTM outputs
merged = concatenate([lstm_output1, lstm_output2])

# Dense layers for classification
output = Dense(1, activation='sigmoid')(merged)

# Create the model
model = Model(inputs=[input1, input2], outputs=output)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.keras.models import Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the number of folds
k = 5  # You can adjust this value

# Create KFold object
kf = KFold(n_splits=k, shuffle=True, random_state=42)  # Shuffle and set random_state for reproducibility

# Lists to store evaluation results
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Iterate through folds
for fold, (train_index, test_index) in enumerate(kf.split(padded_sequences1)):  # Assuming padded_sequences1 and padded_sequences2 have the same length
    print(f"Fold {fold + 1}")

    # Split data into training and testing sets for this fold
    train_sequences1_fold, test_sequences1_fold = padded_sequences1[train_index], padded_sequences1[test_index]
    train_sequences2_fold, test_sequences2_fold = padded_sequences2[train_index], padded_sequences2[test_index]
    train_labels_fold, test_labels_fold = labels[train_index], labels[test_index]

    # Train the model on this fold's training data
    model.fit([train_sequences1_fold, train_sequences2_fold], train_labels_fold, epochs=100, batch_size=2500)

    # Evaluate the model on this fold's testing data
    predictions = model.predict([test_sequences1_fold, test_sequences2_fold])
    predicted_labels_fold = (predictions > 0.5).astype(int)

    accuracy = accuracy_score(test_labels_fold, predicted_labels_fold)
    precision = precision_score(test_labels_fold, predicted_labels_fold)
    recall = recall_score(test_labels_fold, predicted_labels_fold)
    f1 = f1_score(test_labels_fold, predicted_labels_fold)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Fold {fold + 1} - Accuracy: {accuracy:.4f}")

Fold 1
Epoch 1/100
[1m 8/10[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m23s[0m 12s/step - accuracy: 0.5159 - loss: 0.6928

In [21]:
# Calculate average accuracy and loss across all folds
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recalls = np.mean(recalls)
average_f1_scores = np.mean(f1_scores)

print(f"Average Accuracy: {average_accuracy:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recalls: {average_recalls:.4f}")
print(f"Average F1 Scores: {average_f1_scores:.4f}")
model.save('5fold_3layer.keras')

Average Accuracy: 0.9452
Average Precision: 0.9463
Average Recalls: 0.9442
Average F1 Scores: 0.9452


In [22]:
from tensorflow.keras.models import load_model


# Load the model

loaded_model = load_model('protein_interaction_model.h5')


# Verify that the loaded model works

loss, accuracy = loaded_model.evaluate([padded_sequences1, padded_sequences2], labels)

print(f'Loaded model accuracy: {accuracy:.4f}')



[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19ms/step - accuracy: 0.9963 - loss: 0.0157
Loaded model accuracy: 0.9961


In [29]:
import pandas as pd
from joblib import dump, load
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import precision_score, recall_score, f1_score

# Load positive and negative data
positive_df = pd.read_csv('positive_2500.csv', header=None)
negative_df = pd.read_csv('negative_2500.csv', header=None)

def load_protein_sequences(file_path):
    protein_dict = {}
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                protein_dict[parts[0]] = parts[1]
    return protein_dict

# Load protein sequences
protein_sequences = load_protein_sequences('protein_sequences.txt')

# Prepare the data
positive_df_2 = pd.DataFrame([positive_df[1], positive_df[0]]).transpose()
negative_df_2 = pd.DataFrame([negative_df[1], negative_df[0]]).transpose()
positive_df = pd.concat([positive_df, positive_df_2])
negative_df = pd.concat([negative_df, negative_df_2])

positive_df[2] = 1
negative_df[2] = 0
data = pd.concat([positive_df, negative_df], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

data[0] = [protein_sequences[x] for x in data[0]]
data[1] = [protein_sequences[x] for x in data[1]]

sequences1 = data[0].values  # Amino acid sequences of protein 1
sequences2 = data[1].values  # Amino acid sequences of protein 2
labels = data[2].values       # Interaction labels (0 or 1)

# Tokenization
tokenizer = Tokenizer(char_level=True)  # Tokenize at character level
tokenizer.fit_on_texts(sequences1 + sequences2)

# Convert sequences to numerical tokens
encoded_sequences1 = tokenizer.texts_to_sequences(sequences1)
encoded_sequences2 = tokenizer.texts_to_sequences(sequences2)

# Pad sequences to a fixed length
maxlen = 100  # Example max length, adjust as needed
test_sequences1 = pad_sequences(encoded_sequences1, maxlen=maxlen, padding='post')
test_sequences2 = pad_sequences(encoded_sequences2, maxlen=maxlen, padding='post')

# Load the model
loaded_model = load('5fold_3layer_PPI_Model.joblib')

# Evaluate the model
loss, accuracy = loaded_model.evaluate([test_sequences1, test_sequences2], labels)

# Make predictions
predictions = loaded_model.predict([test_sequences1, test_sequences2])
predicted_labels = (predictions > 0.5).astype(int)  # Convert probabilities to binary labels

# Calculate precision, recall, and F1 score
precision = precision_score(labels, predicted_labels)
recall = recall_score(labels, predicted_labels)
f1 = f1_score(labels, predicted_labels)

# Print the results
print(f'Loaded model accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


  saveable.load_own_variables(weights_store.get(inner_path))


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 25ms/step - accuracy: 0.7216 - loss: 2.1635
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step
Loaded model accuracy: 0.7207
Precision: 0.7168
Recall: 0.7297
F1 Score: 0.7232
