In [None]:
# Cell 1: Install dependencies
!pip install transformers torch --quiet


In [None]:
# Cell 2: Import libraries
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:
# Cell 3: Load tokenizer and model
MODEL_NAME = "ChatterjeeLab/MeMDLM"  # single place to change the checkpoint id

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# eval() only switches layers such as dropout to inference behaviour; it does NOT
# freeze any weights. Gradient tracking is avoided separately by the
# torch.no_grad() context used when the embeddings are computed.
# The trailing semicolon stops the notebook from echoing the whole model repr.
model.eval();


In [None]:
!pip install datasets

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs and the saved classifier
# weights used later in this notebook live under that mount point.
# Requires the google.colab package, i.e. a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

from datasets import load_dataset, concatenate_datasets
from datasets import Dataset

# Folder on the mounted Drive that holds the two source CSV files.
data_path = "/content/drive/MyDrive/Final Year Project/FYP 2/Dataset"

# Each CSV is loaded on its own and registered under a single named split,
# which is the key used to pull the rows out again further down.
Kinases = load_dataset(
    'csv',
    data_files=dict(Kinases=f'{data_path}/Protein_Kinases.csv'),
)
Phosphatases = load_dataset(
    'csv',
    data_files=dict(Phosphatases=f'{data_path}/Protein_Phosphatases.csv'),
)


In [None]:
# Draw a fixed-seed random sample of 100 rows from each class.
kinases_dataset = Kinases['Kinases'].shuffle(seed=42).select(range(100))
phospho_dataset = Phosphatases['Phosphatases'].shuffle(seed=42).select(range(100))
print(len(kinases_dataset))

# Convert to pandas and attach the class label: 0 = kinase, 1 = phosphatase.
kinases_dataset = kinases_dataset.to_pandas().assign(label=0)
phospho_dataset = phospho_dataset.to_pandas().assign(label=1)

In [None]:
# Keep only the sequence text and its class label; every other column is dropped.
kinases_dataset = kinases_dataset.loc[:, ['Sequence', 'label']]
phospho_dataset = phospho_dataset.loc[:, ['Sequence', 'label']]

In [None]:
import pandas as pd

# Stack both classes into one frame; ignore_index renumbers the rows 0..n-1.
data = pd.concat([kinases_dataset, phospho_dataset], ignore_index=True)

In [None]:
# Plain Python list of sequence strings, consumed one by one by the tokenizer.
sequences = data['Sequence'].to_list()
# Class labels (0/1) as an integer tensor, in the same row order as `sequences`.
labels = torch.tensor(data['label'].to_list())

In [None]:
def get_embedding(seq):
    """Return one mean-pooled embedding vector for a protein sequence.

    The sequence is tokenized with the notebook-level ``tokenizer`` and passed
    through the notebook-level ``model`` without tracking gradients. The
    final-layer hidden states are then averaged over the token axis.

    Pooling is weighted by the tokenizer's attention mask when one is returned,
    so padded positions never dilute the average. For a single, unpadded
    sequence the mask is all ones and the result equals a plain mean over
    tokens. If the tokenizer returns no attention mask, a plain mean is used.

    Args:
        seq: Sequence string accepted by ``tokenizer``.

    Returns:
        torch.Tensor: the pooled hidden state with the leading batch axis
        squeezed away (a 1-D vector for a single sequence).
    """
    inputs = tokenizer(seq, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs)
        hidden = output.last_hidden_state
        mask = inputs.get("attention_mask")
        if mask is None:
            embedding = hidden.mean(dim=1)
        else:
            # Broadcast the (batch, tokens) mask over the feature axis.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for a fully masked row.
            embedding = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
    return embedding.squeeze(0)

# Embed the sequences one at a time and stack the resulting vectors into one tensor
# (first axis = sequence index).
embeddings = torch.stack(tuple(map(get_embedding, sequences)))


In [None]:
# Cell 6: Define simple classifier
class ProteinClassifier(nn.Module):
    """Single linear layer followed by a sigmoid: one score per input row."""

    def __init__(self, hidden_dim):
        """Build the layer that maps ``hidden_dim`` input features to one output."""
        super(ProteinClassifier, self).__init__()
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Return ``sigmoid(fc(x))``, i.e. a score in the range [0, 1] per row."""
        logits = self.fc(x)
        return logits.sigmoid()

# Seed torch before building the layer so its random initial weights, and
# therefore the whole training run, are the same on every execution.
torch.manual_seed(42)

# Input width is taken from the embedding tensor (second axis = feature count).
classifier = ProteinClassifier(hidden_dim=embeddings.shape[1])
# BCELoss expects probabilities, which forward() already produces via sigmoid.
criterion = nn.BCELoss()
optimizer = optim.Adam(classifier.parameters(), lr=1e-3)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set. Stratifying on the labels keeps
# the 0/1 class ratio the same on both sides of the split.
X_temp, X_test, y_temp, y_test = train_test_split(
    embeddings, labels, test_size=0.3, random_state=42, stratify=labels.numpy()
)

# Carve the validation set out of the remaining 70%:
# 0.20 x 0.70 = 0.14 of all samples for validation, 0.56 for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.20, random_state=42, stratify=y_temp.numpy()
)

In [None]:
# Cell 8: Train the classifier and track both training & validation loss
train_losses = []
val_losses = []

# BCELoss needs float targets; convert once instead of on every epoch.
y_train_float = y_train.float()
y_val_float = y_val.float()

for epoch in range(100):
    # Training: one full-batch gradient step per epoch.
    classifier.train()
    optimizer.zero_grad()
    # squeeze(-1) removes only the trailing size-1 output axis. A bare squeeze()
    # would also drop the batch axis when a split holds a single sample, and
    # BCELoss would then reject the prediction/target shape mismatch.
    outputs = classifier(X_train).squeeze(-1)
    train_loss = criterion(outputs, y_train_float)
    train_loss.backward()
    optimizer.step()

    # Validation: same loss, no gradient tracking and no weight update.
    classifier.eval()
    with torch.no_grad():
        val_outputs = classifier(X_val).squeeze(-1)
        val_loss = criterion(val_outputs, y_val_float)

    train_losses.append(train_loss.item())
    val_losses.append(val_loss.item())

    # Report every 10th epoch to keep the cell output short.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Train Loss: {train_loss.item():.4f}, Val Loss: {val_loss.item():.4f}")


In [None]:
# Cell 9: Plot training and validation loss
import matplotlib.pyplot as plt

# Work on the current axes explicitly instead of going through pyplot's
# implicit state for every call.
axes = plt.gca()
axes.plot(train_losses, label='Train Loss')
axes.plot(val_losses, label='Validation Loss')
axes.set_xlabel('Epoch')
axes.set_ylabel('Loss')
axes.set_title('Training vs Validation Loss')
axes.legend()
axes.grid(True)
plt.show()


In [None]:
# Sanity check: number of features in a single training embedding.
print(X_train[0].shape[0])

In [None]:
from sklearn.metrics import accuracy_score
# Evaluate on the whole held-out test set
classifier.eval()
with torch.no_grad():
    # squeeze(-1) drops only the output axis, so there is always exactly one
    # score per test sample, even if the test set holds a single sample.
    test_output = classifier(X_test).squeeze(-1)
    # Threshold the sigmoid score at 0.5 to get the predicted class (0 or 1).
    predicted = (test_output > 0.5).int()
    print("True Label:", y_test.tolist())
    print("Predicted:", predicted.tolist())
    # Hand scikit-learn NumPy arrays explicitly rather than torch tensors.
    accuracy = accuracy_score(y_test.numpy(), predicted.numpy())

print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Predict the label for a new protein sequence (0 = kinase, 1 = phosphatase)
new_sequence = "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEHIEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTVTSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDSLKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRKTFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPIPQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQRDRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGAPLNQLMRCLRKYQSRTPSPLLHSVPSEIVFDFEPGPVFRGSTTGLSATPPASLPGSLTNVKALQKSPGPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDVAVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHHLHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATVKSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNINNRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARSLPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGEFAAFK"  # you can change this to any sequence
# Embed the new sequence with the same function used for the training data.
new_embedding = get_embedding(new_sequence)

classifier.eval()
with torch.no_grad():
    # The classifier works on batches, so add a leading batch axis of size 1.
    prediction = classifier(new_embedding[None]).item()

# Threshold the score at 0.5: above it is class 1, otherwise class 0.
predicted_label = 1 if prediction > 0.5 else 0

print(f"New Sequence: {new_sequence}")
print(f"Predicted Score: {prediction:.4f}")
print(f"Predicted Label: {predicted_label}")


In [None]:
# Persist only the classifier's parameters (its state_dict) to the mounted Drive;
# the class definition itself is not stored and must exist again when reloading.
torch.save(classifier.state_dict(), "/content/drive/MyDrive/Final Year Project/FYP 2/Implementation/ChatterjeeLab MeMDLM/model.pt")


In [None]:
# Define same architecture as before
class ProteinClassifier(nn.Module):
    """Linear layer plus sigmoid, mirroring the model whose weights were saved."""

    def __init__(self, input_dim):
        """Create the ``input_dim`` -> 1 linear layer.

        The attribute has to stay named ``fc`` so that its parameter keys
        (``fc.weight``, ``fc.bias``) match the saved state_dict.
        """
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return ``sigmoid(fc(x))``, one score in the range [0, 1] per row."""
        logits = self.fc(x)
        return logits.sigmoid()

# Initialize with correct input dimension
classifier = ProteinClassifier(input_dim=embeddings.shape[1])

# Load weights.
# map_location="cpu" makes the checkpoint loadable regardless of the device it
# was saved from; weights_only=True restricts unpickling to tensor data, which
# is all a state_dict contains, instead of executing arbitrary pickled objects.
state_dict = torch.load(
    "/content/drive/MyDrive/Final Year Project/FYP 2/Implementation/ChatterjeeLab MeMDLM/model.pt",
    map_location="cpu",
    weights_only=True,
)
classifier.load_state_dict(state_dict)
classifier.eval()

# Evaluate on test data
with torch.no_grad():
    # squeeze(-1) drops only the output axis, keeping one score per test sample.
    test_output = classifier(X_test).squeeze(-1)
    predicted = (test_output > 0.5).int()
    print("True Labels:", y_test.tolist())
    print("Predicted:", predicted.tolist())
    # Hand scikit-learn NumPy arrays explicitly rather than torch tensors.
    accuracy = accuracy_score(y_test.numpy(), predicted.numpy())

print(f"Test Accuracy: {accuracy * 100:.2f}%")
