In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd

  device: torch.device = torch.device(torch._C._get_default_device()),  # torch.device('cpu'),


ModuleNotFoundError: No module named 'numpy'

In [None]:
def load_data(file_path):
    sequences = []
    labels = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('>'):
                _, seq_label = line.split(': ')
                sequence, label = seq_label.split()
                sequences.append(sequence)
                labels.append(int(label))
    return sequences, labels

train_sequences, train_labels = load_data('C:\\Users\\abu11\\Desktop\\train.txt')
valid_sequences, valid_labels = load_data('C:\\Users\\abu11\\Desktop\\valid.txt')
test_sequences, test_labels = load_data('C:\\Users\\abu11\\Desktop\\test.txt')

In [None]:
all_sequences = train_sequences + valid_sequences + test_sequences
encoder = LabelEncoder()
encoder.fit(list('ACDEFGHIKLMNPQRSTVWYX'))
encoded_sequences = [np.array(encoder.transform(list(seq))) for seq in all_sequences]

In [None]:
max_len = max(len(seq) for seq in encoded_sequences)

In [None]:
split_1 = len(train_sequences)
split_2 = split_1 + len(valid_sequences)

In [None]:
train_X = encoded_sequences[:split_1]
valid_X = encoded_sequences[split_1:split_2]
test_X = encoded_sequences[split_2:]

In [None]:
train_y = np.array(train_labels)
valid_y = np.array(valid_labels)
test_y = np.array(test_labels)

In [None]:
class ProteinDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float)

In [None]:
train_dataset = ProteinDataset(train_X, train_y)
valid_dataset = ProteinDataset(valid_X, valid_y)
test_dataset = ProteinDataset(test_X, test_y)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
import torch.nn.functional as F

class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout):
        super(SimpleRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True, num_layers=1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        dropped = self.dropout(hidden[-1])
        out = self.fc(dropped)
        return out

# Model parameters
embedding_dim = 20
hidden_dim = 128
dropout = 0.5
output_dim = 1
vocab_size = len(encoder.classes_)
model = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim, dropout)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5, min_lr=1e-6, verbose=True)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

def early_stopping(valid_loss, list_of_prev_losses, patience=5):
    if not list_of_prev_losses:
        return False
    if valid_loss > max(list_of_prev_losses[-patience:]):
        return True
    return False

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

def evaluate(model, data_loader):
    model.eval()
    true_labels = []
    predictions = []
    
    with torch.no_grad():
        for sequences, labels in data_loader:
            outputs = model(sequences).squeeze()
            predictions += torch.sigmoid(outputs).tolist()
            true_labels += labels.tolist()
            
    predictions = [1 if pred >= 0.5 else 0 for pred in predictions]
    
    accuracy = accuracy_score(true_labels, predictions)
    auc = roc_auc_score(true_labels, [pred for pred in predictions])
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    
    return accuracy, auc, precision, recall, f1




def train_and_evaluate(model, train_loader, valid_loader, test_loader, criterion, optimizer, epochs=500):
    valid_loss_history = []
    for epoch in range(epochs):
        model.train()
        train_losses = []
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            train_losses.append(loss.item())
        
        train_accuracy, train_auc , train_precision , train_recall , train_f1 = evaluate(model, train_loader)
        valid_accuracy, valid_auc , valid_precision , valid_recall , valid_f1 = evaluate(model, valid_loader)
        valid_loss_history.append(np.mean(train_losses))
        
        if early_stopping(valid_accuracy, valid_loss_history):
            print("Early stopping triggered")
            break
        
        print(f'Epoch {epoch+1}, Train Loss: {np.mean(train_losses):.4f}, Train Acc: {train_accuracy:.4f}, Train AUC: {train_auc:.4f}, Valid Acc: {valid_accuracy:.4f}, Valid AUC: {valid_auc:.4f}')
        
    test_accuracy, test_auc , test_precision , test_recall , test_f1  = evaluate(model, test_loader)
    print(f'Test Acc: {test_accuracy:.4f}, Test AUC: {test_auc:.4f} , Test Precision: {test_precision:.4f} , Test Recall: {test_recall:.4f} , Test F1_Score: {test_f1:.4f}' )


train_and_evaluate(model, train_loader, valid_loader, test_loader, criterion, optimizer)

Epoch 1, Train Loss: 0.6729, Train Acc: 0.6556, Train AUC: 0.6383, Valid Acc: 0.5556, Valid AUC: 0.5618
Epoch 2, Train Loss: 0.6268, Train Acc: 0.7109, Train AUC: 0.7086, Valid Acc: 0.5812, Valid AUC: 0.5842
Epoch 3, Train Loss: 0.6070, Train Acc: 0.7238, Train AUC: 0.7188, Valid Acc: 0.6410, Valid AUC: 0.6439
Epoch 4, Train Loss: 0.5724, Train Acc: 0.7164, Train AUC: 0.7049, Valid Acc: 0.6581, Valid AUC: 0.6636
Epoch 5, Train Loss: 0.5717, Train Acc: 0.7366, Train AUC: 0.7257, Valid Acc: 0.6068, Valid AUC: 0.6123
Early stopping triggered
Test Acc: 0.6923, Test AUC: 0.6904 , Test Precision: 0.6038 , Test Recall: 0.6809 , Test F1_Score: 0.6400


In [None]:
from sklearn.model_selection import ParameterGrid
param_grid = {
    'embedding_dim': [10 ,15, 20, 25, 30, 35 , 40 , 45 , 50 , 55 , 60 , 65 , 70],
    'hidden_dim': [10 , 12 , 14 , 16 , 18 , 20 , 22 , 24 , 26 , 28 , 30 , 32 , 34 , 36, 38 , 40 , 42 , 44 , 46 , 48 ,50 , 52, 64 , 128],
    'dropout': [0.1 , 0.2 ,0.3 ,0.4 , 0.5, 0.6 , 0.7, 0.8 , 0.9],
    'lr': [0.001, 0.002, 0.003 , 0.004 , 0.005 , 0.006 , 0.007 , 0.008 , 0.009]
}

best_accuracy = 0
best_params = None

for params in ParameterGrid(param_grid):
    print("Testing params:", params)
    embedding_dim = params['embedding_dim']
    hidden_dim = params['hidden_dim']
    dropout = params['dropout']
    lr = params['lr']
    
    model = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim, dropout)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()
    
    train_and_evaluate(model, train_loader, valid_loader, test_loader, criterion, optimizer)
    
    
    _, _, _, _, valid_accuracy = evaluate(model, valid_loader)
    
    if valid_accuracy > best_accuracy:
        best_accuracy = valid_accuracy
        best_params = params

print("Best accuracy:", best_accuracy)
print("Best params:", best_params)

Testing params: {'dropout': 0.1, 'embedding_dim': 10, 'hidden_dim': 10, 'lr': 0.001}
Epoch 1, Train Loss: 0.7108, Train Acc: 0.5764, Train AUC: 0.5582, Valid Acc: 0.4957, Valid AUC: 0.5039
Epoch 2, Train Loss: 0.6793, Train Acc: 0.5948, Train AUC: 0.5703, Valid Acc: 0.5556, Valid AUC: 0.5658
Epoch 3, Train Loss: 0.6710, Train Acc: 0.6280, Train AUC: 0.6074, Valid Acc: 0.5726, Valid AUC: 0.5820
Epoch 4, Train Loss: 0.6614, Train Acc: 0.6225, Train AUC: 0.6053, Valid Acc: 0.5556, Valid AUC: 0.5640
Epoch 5, Train Loss: 0.6566, Train Acc: 0.6262, Train AUC: 0.6113, Valid Acc: 0.5641, Valid AUC: 0.5711
Epoch 6, Train Loss: 0.6540, Train Acc: 0.6409, Train AUC: 0.6279, Valid Acc: 0.5470, Valid AUC: 0.5535
Epoch 7, Train Loss: 0.6444, Train Acc: 0.6446, Train AUC: 0.6319, Valid Acc: 0.5470, Valid AUC: 0.5531
Epoch 8, Train Loss: 0.6405, Train Acc: 0.6519, Train AUC: 0.6399, Valid Acc: 0.5556, Valid AUC: 0.5614
Epoch 9, Train Loss: 0.6361, Train Acc: 0.6519, Train AUC: 0.6399, Valid Acc: 0.564

  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1, Train Loss: 0.6985, Train Acc: 0.5322, Train AUC: 0.4938, Valid Acc: 0.4872, Valid AUC: 0.5000
Epoch 2, Train Loss: 0.6823, Train Acc: 0.5783, Train AUC: 0.5488, Valid Acc: 0.5043, Valid AUC: 0.5158
Epoch 3, Train Loss: 0.6678, Train Acc: 0.6133, Train AUC: 0.5912, Valid Acc: 0.4957, Valid AUC: 0.5048
Epoch 4, Train Loss: 0.6565, Train Acc: 0.6225, Train AUC: 0.6032, Valid Acc: 0.4957, Valid AUC: 0.5031
Epoch 5, Train Loss: 0.6469, Train Acc: 0.6409, Train AUC: 0.6244, Valid Acc: 0.5299, Valid AUC: 0.5360
Epoch 6, Train Loss: 0.6370, Train Acc: 0.6556, Train AUC: 0.6433, Valid Acc: 0.5299, Valid AUC: 0.5351
Epoch 7, Train Loss: 0.6293, Train Acc: 0.6630, Train AUC: 0.6525, Valid Acc: 0.5128, Valid AUC: 0.5180
Epoch 8, Train Loss: 0.6187, Train Acc: 0.6777, Train AUC: 0.6714, Valid Acc: 0.5385, Valid AUC: 0.5430
Epoch 9, Train Loss: 0.6108, Train Acc: 0.6740, Train AUC: 0.6657, Valid Acc: 0.5470, Valid AUC: 0.5522
Epoch 10, Train Loss: 0.6015, Train Acc: 0.6980, Train AUC: 0.69