In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
import numpy as np


# Amino-acid letter -> integer index. Index 0 is not assigned to any letter;
# encode_sequence uses it for padding. 'X' (22) doubles as the "unknown" code.
aa_to_int = {'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'E':6, 'Q':7, 'G':8, 'H':9, 'I':10, 
             'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 'S':16, 'T':17, 'W':18, 'Y':19, 
             'V':20, 'U':21, 'X':22}


def encode_sequence(seq, max_length):
    """Encode an amino-acid string as a fixed-length tensor of indices.

    Each character is looked up in ``aa_to_int``; characters that are not in
    the table (including lowercase letters) map to the index of ``'X'``.
    The result is right-padded with 0 up to ``max_length``.

    Args:
        seq: Sequence of one-letter amino-acid codes.
        max_length: Exact length of the returned tensor.

    Returns:
        A 1-D ``torch.long`` tensor with exactly ``max_length`` elements.
        Sequences longer than ``max_length`` are truncated so that every
        encoded sample has the same length and can be stacked into a batch.
    """
    unknown = aa_to_int['X']
    # Slice first: previously an over-long sequence produced an over-long
    # tensor because the computed padding length was negative (-> no padding).
    encoded_seq = [aa_to_int.get(aa, unknown) for aa in seq[:max_length]]
    padding = [0] * (max_length - len(encoded_seq))
    return torch.tensor(encoded_seq + padding, dtype=torch.long)

class ProteinDataset(Dataset):
    """Binary-labelled dataset of integer-encoded protein sequences.

    Both input files are read as plain text, one sequence per line. Lines are
    stripped of surrounding whitespace; sequences longer than ``max_length``
    are dropped, and the rest are encoded with ``encode_sequence``.

    Args:
        positive_file: Path to the file whose sequences get label 1.
        negative_file: Path to the file whose sequences get label 0.
        max_length: Maximum accepted sequence length, also passed to
            ``encode_sequence`` as the padded length.

    Attributes are plain lists: ``sequences`` holds the encoded tensors and
    ``labels`` the matching integer labels, positives first.
    """

    def __init__(self, positive_file, negative_file, max_length):
        self.sequences = []
        self.labels = []
        self.max_length = max_length

        # Order matters for index stability: all positives, then all negatives.
        self._load_file(positive_file, 1)
        self._load_file(negative_file, 0)

    def _load_file(self, path, label):
        """Append every usable sequence found in ``path`` with ``label``."""
        with open(path, 'r') as file:
            for line in file:
                seq = line.strip()
                # Blank lines (e.g. a trailing newline) used to pass the
                # length check and become all-padding samples with a real
                # label; skip them instead.
                if not seq:
                    continue
                if len(seq) > self.max_length:
                    continue
                self.sequences.append(encode_sequence(seq, self.max_length))
                self.labels.append(label)

    def __len__(self):
        """Return the number of samples that were kept."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(encoded_sequence, label)`` pair at ``idx``."""
        return self.sequences[idx], self.labels[idx]


# Longest sequence accepted by ProteinDataset; shorter ones are padded to it.
max_length = 1000

# Input files, one sequence per line. The paths are left empty here and must
# be filled in before this script can run.
positive_file = r""
negative_file = r""
dataset = ProteinDataset(
    positive_file=positive_file,
    negative_file=negative_file,
    max_length=max_length,
)

# Reshuffled mini-batches of 64 samples.
data_loader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

class CNNLSTM(nn.Module):
    """Embedding -> Conv1d -> LSTM -> Linear -> sigmoid classifier.

    Args:
        vocab_size: Number of distinct token indices (embedding rows).
        embedding_dim: Size of each token embedding.
        cnn_filters: Number of output channels of the 1-D convolution.
        lstm_hidden: Hidden size of the LSTM.
        num_classes: Number of output units of the final linear layer.
        kernel_size: Width of the convolution kernel. Defaults to 20, the
            value that was previously hard-coded. Inputs must be at least
            this long, since the convolution applies no padding.
    """

    def __init__(self, vocab_size, embedding_dim, cnn_filters, lstm_hidden, num_classes,
                 kernel_size=20):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, cnn_filters, kernel_size=kernel_size)
        self.lstm = nn.LSTM(cnn_filters, lstm_hidden, batch_first=True)
        self.fc = nn.Linear(lstm_hidden, num_classes)

    def forward(self, x):
        """Map a batch of index sequences to per-class probabilities.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with values produced by
            a sigmoid, i.e. in the range [0, 1].
        """
        x = self.embedding(x)      # (batch, seq_len, embedding_dim)
        x = x.permute(0, 2, 1)     # Conv1d wants channels before length
        x = self.conv(x)           # (batch, cnn_filters, seq_len - kernel_size + 1)
        # Back to (batch, steps, features) for the batch_first LSTM.
        x, _ = self.lstm(x.permute(0, 2, 1))
        # Only the LSTM output at the final time step feeds the classifier.
        # NOTE(review): with right-padded inputs that step covers padding
        # positions — confirm this is intended.
        x = self.fc(x[:, -1, :])
        return torch.sigmoid(x)


# Model hyperparameters.
cnn_filters = 64
lstm_hidden = 128
embedding_dim = 8
vocab_size = len(aa_to_int) + 1  # +1 for index 0, which is used as padding
num_classes = 1  # single sigmoid output for binary classification

model = CNNLSTM(vocab_size, embedding_dim, cnn_filters, lstm_hidden, num_classes)

# Train on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# The model already applies a sigmoid, so plain BCELoss is the matching loss.
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=1e-4)

num_epochs = 30

for epoch in range(num_epochs):
    for inputs, targets in data_loader:
        # BCELoss needs float targets; the dataset yields integer labels.
        inputs, targets = inputs.to(device), targets.to(device, dtype=torch.float32)

        # Drop only the trailing class dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also remove the batch dimension when the
        # last batch holds a single sample, and BCELoss would then reject
        # the shape mismatch with the (1,)-shaped targets.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the final batch of the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

torch.save(model.state_dict(), 'model.pth')