In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the training table from the working directory and preview the first rows.
# NOTE(review): presumably the PS4 dataset named in the section heading below — confirm.
df = pd.read_csv("data.csv") 
df.head(3)

Unnamed: 0,chain_id,first_res,input,dssp8
0,4trtA,1,VMKANVTKKTLNEGLGLLERVIPSRSSNPLLTALKVETSEGGLTLS...,CEEEEEEHHHHHHHHHHHHHHSCSCCSSTTTTEEEEEECSSEEEEE...
1,5g0mA,19,PVRVGLSVDASALGHTIPPDYTGLSYEQAQMANPNYFSGANTQLAG...,CEEEEEEEEEEEEEEEEPTTCCEEEEEGGGGGCTTTSSTTCHHHHH...
2,5l6mF,2,AYVLDTNVAIHLRDGDPEVTTRVTALNGAILLSIISRVELEGGVYR...,CEEECHHHHHHHHTTCHHHHHHHHHCCSCEEEEHHHHHHHHHHHTS...


# Training the model on the PS4 dataset

In [7]:
def dssp8_to_dssp3(seq):
    """Collapse an 8-state DSSP string into the 3-state H/E/C alphabet.

    H, G and I become 'H'; E and B become 'E'; every other symbol
    (T, S, '-', and anything unrecognised) becomes 'C'.

    Args:
        seq: Iterable of single-character DSSP state codes.

    Returns:
        A string with one of 'H', 'E' or 'C' per input symbol.
    """
    helix_states = frozenset('HGI')
    strand_states = frozenset('EB')

    reduced = []
    for state in seq:
        if state in helix_states:
            reduced.append('H')
        elif state in strand_states:
            reduced.append('E')
        else:
            # Turns, bends, gaps and unknown codes are all treated as coil.
            reduced.append('C')
    return ''.join(reduced)

# Add a 3-state (H/E/C) label column derived row by row from the 8-state strings.
df['dssp3'] = df['dssp8'].apply(dssp8_to_dssp3)

In [8]:
# Preview the frame again to check the newly added dssp3 column.
df.head(3)

Unnamed: 0,chain_id,first_res,input,dssp8,dssp3
0,4trtA,1,VMKANVTKKTLNEGLGLLERVIPSRSSNPLLTALKVETSEGGLTLS...,CEEEEEEHHHHHHHHHHHHHHSCSCCSSTTTTEEEEEECSSEEEEE...,CEEEEEEHHHHHHHHHHHHHHCCCCCCCCCCCEEEEEECCCEEEEE...
1,5g0mA,19,PVRVGLSVDASALGHTIPPDYTGLSYEQAQMANPNYFSGANTQLAG...,CEEEEEEEEEEEEEEEEPTTCCEEEEEGGGGGCTTTSSTTCHHHHH...,CEEEEEEEEEEEEEEEECCCCCEEEEEHHHHHCCCCCCCCCHHHHH...
2,5l6mF,2,AYVLDTNVAIHLRDGDPEVTTRVTALNGAILLSIISRVELEGGVYR...,CEEECHHHHHHHHTTCHHHHHHHHHCCSCEEEEHHHHHHHHHHHTS...,CEEECHHHHHHHHCCCHHHHHHHHHCCCCEEEEHHHHHHHHHHHCC...


In [None]:
from sklearn.model_selection import train_test_split


# Residue ids start at 1 so that id 0 stays free for padding.
aa_vocab = {
    residue: token_id
    for token_id, residue in enumerate("ACDEFGHIKLMNPQRSTVWY", start=1)
}
# Class ids for the three secondary-structure states.
ss_vocab = {'H': 0, 'E': 1, 'C': 2}

def encode_seq(seq, vocab, default=0):
    """Translate each symbol of ``seq`` into its integer id from ``vocab``.

    Args:
        seq: Iterable of symbols, e.g. a string of one-letter codes.
        vocab: Mapping from symbol to integer id.
        default: Id emitted for symbols that are missing from ``vocab``.
            The default of 0 is the padding id for ``aa_vocab``, but in
            ``ss_vocab`` 0 is the id of 'H'; pass an explicit value (for
            example the coil id) when encoding labels that may contain
            unexpected symbols.

    Returns:
        A list of ints with one entry per symbol in ``seq``.
    """
    return [vocab.get(symbol, default) for symbol in seq]


# Integer-encode residues and 3-state labels; each cell of the new columns
# holds a Python list of ids.
# NOTE(review): encode_seq falls back to id 0 for unknown symbols. That is the
# padding id for residues, but for ss_vocab it is the id of 'H'. dssp8_to_dssp3
# only emits H/E/C, so the fallback should not trigger for 'dssp3' — confirm.
df['X'] = df['input'].apply(lambda x: encode_seq(x, aa_vocab))
df['y'] = df['dssp3'].apply(lambda x: encode_seq(x, ss_vocab))


# Length of the longest encoded sequence; used below as the common padded length.
max_len = max(df['X'].apply(len))

def pad(seq, max_len, pad_value=0):
    """Right-pad a list with ``pad_value`` until it has ``max_len`` items.

    Args:
        seq: List to extend; it is not modified.
        max_len: Target length.
        pad_value: Filler appended to the right (0 by default).

    Returns:
        A new list. If ``seq`` is already at least ``max_len`` long, it is
        returned as an unchanged copy (nothing is truncated).
    """
    missing = max_len - len(seq)
    # A non-positive count yields an empty filler list.
    filler = [pad_value] * missing
    return seq + filler

# Right-pad every encoded sequence to max_len and stack the rows into 2-D arrays.
# Inputs are padded with 0 (the id no residue uses); labels are padded with -100.
X = np.array([pad(x, max_len) for x in df['X']])
y = np.array([pad(y_, max_len, pad_value=-100) for y_ in df['y']])  # -100 marks positions the loss should ignore


# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

class BiLSTM(nn.Module):
    """Per-position classifier: embedding -> bidirectional LSTM -> linear layer.

    Args:
        vocab_size: Number of real tokens. The embedding table gets one
            extra row because index 0 is used as the padding index.
        emb_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM in each direction.
        output_dim: Number of output classes per position.
    """

    def __init__(self, vocab_size, emb_dim=32, hidden_dim=64, output_dim=3):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size + 1,
            embedding_dim=emb_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward hidden states are concatenated, hence the factor 2.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return one score vector of size ``output_dim`` for every position of ``x``.

        Padded positions are passed through the LSTM like any other
        timestep; no packing or masking is applied here.
        """
        hidden_states = self.lstm(self.embedding(x))[0]
        return self.fc(hidden_states)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch so weight initialisation and batch shuffling are repeatable
# (some GPU kernels may still be non-deterministic).
torch.manual_seed(42)

model = BiLSTM(vocab_size=len(aa_vocab)).to(device)
# Label positions padded with -100 are skipped by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Make the integer dtype explicit (int64) instead of inheriting NumPy's
# platform-dependent default; the loss expects long class-index targets.
train_data = TensorDataset(
    torch.tensor(X_train, dtype=torch.long),
    torch.tensor(y_train, dtype=torch.long),
)
val_data = TensorDataset(
    torch.tensor(X_val, dtype=torch.long),
    torch.tensor(y_val, dtype=torch.long),
)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)

for epoch in range(5):
    model.train()
    # Sum of the per-batch losses, so its scale grows with the number of batches.
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        # Flatten (batch, length, classes) -> (batch * length, classes); the
        # class count is read from the model output rather than hard-coded.
        out = out.view(-1, out.size(-1))
        yb = yb.view(-1)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Train Loss = {total_loss:.4f}")


Epoch 1: Train Loss = 434.4635
Epoch 2: Train Loss = 390.9137
Epoch 3: Train Loss = 375.3925
Epoch 4: Train Loss = 370.7283
Epoch 5: Train Loss = 366.6796


In [14]:
from sklearn.metrics import classification_report

# Switch to inference mode and collect predictions for every real
# (non-padded) residue in the validation split.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for xb, yb in val_loader:
        xb, yb = xb.to(device), yb.to(device)
        out = model(xb)
        # Most likely class per position.
        preds = torch.argmax(out, dim=2).cpu().numpy()
        labels = yb.cpu().numpy()
        # Boolean mask drops padded positions (label -100) in one vectorized
        # step; row-major selection keeps the original per-residue order.
        keep = labels != -100
        all_preds.extend(preds[keep])
        all_labels.extend(labels[keep])

# Class ids 0/1/2 correspond to H/E/C as defined in ss_vocab.
print(classification_report(all_labels, all_preds, target_names=["H", "E", "C"]))


              precision    recall  f1-score   support

           H       0.76      0.71      0.74    149739
           E       0.65      0.58      0.61     93089
           C       0.67      0.75      0.71    166535

    accuracy                           0.70    409363
   macro avg       0.69      0.68      0.69    409363
weighted avg       0.70      0.70      0.70    409363



In [22]:
# Persist only the learned parameters (the state_dict), not the module object itself;
# loading them back requires constructing a BiLSTM first.
torch.save(model.state_dict(), "model.pth")

# Testing the model on the CB513 dataset

In [28]:
import torch
from torch.utils.data import DataLoader, Dataset

In [23]:
# Load the CB513 test table.
# NOTE(review): the code below reads its 'input' and 'dssp8' columns; presumably the
# file mirrors the training CSV layout — confirm.
cb_data = pd.read_csv("CB513_ps4_style.csv")

In [25]:
def dssp8_to_dssp3(seq):
    """Collapse an 8-state DSSP string into the 3-state H/E/C alphabet.

    This re-declares the function already defined in the training section
    of this notebook, with the same mapping: H, G and I become 'H'; E and B
    become 'E'; every other symbol becomes 'C'.

    Args:
        seq: Iterable of single-character DSSP state codes.

    Returns:
        A string with one of 'H', 'E' or 'C' per input symbol.
    """
    helix_states = frozenset('HGI')
    strand_states = frozenset('EB')

    reduced = []
    for state in seq:
        if state in helix_states:
            reduced.append('H')
        elif state in strand_states:
            reduced.append('E')
        else:
            # Turns, bends, gaps and unknown codes are all treated as coil.
            reduced.append('C')
    return ''.join(reduced)

In [26]:
# Same vocabularies as in the training section; they must match the ones the
# saved model was trained with. Residue ids start at 1 so id 0 is left for padding.
aa_vocab = {
    residue: token_id
    for token_id, residue in enumerate("ACDEFGHIKLMNPQRSTVWY", start=1)
}
ss_vocab = {'H': 0, 'E': 1, 'C': 2}

In [29]:
class ProteinDataset(Dataset):
    """Dataset of integer-encoded sequences and their 3-state labels.

    Built from a frame with an 'input' column (residue strings) and a
    'dssp8' column (8-state label strings). Every item is padded on the
    right to the length of the longest sequence in the frame.
    """

    def __init__(self, df):
        self.X = list(map(self.encode_aa, df['input']))
        self.y = [self.encode_ss(dssp8_to_dssp3(states)) for states in df['dssp8']]
        # Common padded length for all items.
        self.max_len = max(map(len, self.X))

    def encode_aa(self, seq):
        """Return residue ids for ``seq``; residues not in aa_vocab become 0."""
        encoded = []
        for residue in seq:
            encoded.append(aa_vocab.get(residue, 0))
        return encoded

    def encode_ss(self, ss):
        """Return class ids for ``ss``; unknown states fall back to 2 ('C')."""
        encoded = []
        for state in ss:
            encoded.append(ss_vocab.get(state, 2))  # default to 'C'
        return encoded

    def pad(self, seq, length, pad_val):
        """Return ``seq`` extended on the right with ``pad_val`` up to ``length``."""
        filler = [pad_val] * (length - len(seq))
        return seq + filler

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Inputs are padded with 0, labels with -100 so padded positions can
        # be told apart from real residues.
        features = self.pad(self.X[idx], self.max_len, 0)
        targets = self.pad(self.y[idx], self.max_len, -100)
        return torch.tensor(features), torch.tensor(targets)


In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture exactly as it was constructed for training, then
# restore the saved parameters into it.
model = BiLSTM(vocab_size=len(aa_vocab))
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written from a CUDA run can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load("model.pth", map_location=device))
model.to(device)
# Inference mode; as the last expression this also displays the module summary.
model.eval()

BiLSTM(
  (embedding): Embedding(21, 32, padding_idx=0)
  (lstm): LSTM(32, 64, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=3, bias=True)
)

In [32]:
# Encode the CB513 frame and serve it in batches of 32. No shuffle argument is
# passed, so batches follow the row order of cb_data.
test_dataset = ProteinDataset(cb_data)
test_loader = DataLoader(test_dataset, batch_size=32)


In [33]:
# Collect predictions for every real (non-padded) residue of the test set.
# Relies on `model` already being in eval mode from the loading cell.
all_preds, all_labels = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        out = model(xb)
        # Most likely class per position.
        preds = torch.argmax(out, dim=2).cpu().numpy()
        labels = yb.cpu().numpy()

        # Boolean mask drops padded positions (label -100) in one vectorized
        # step; row-major selection keeps the original per-residue order.
        keep = labels != -100
        all_preds.extend(preds[keep])
        all_labels.extend(labels[keep])

In [34]:
# Per-class precision/recall/F1 over the collected residues; class ids 0/1/2
# are displayed as H/E/C, matching ss_vocab. classification_report is imported
# in the validation cell earlier in the notebook.
print(classification_report(all_labels, all_preds, target_names=["H", "E", "C"]))

              precision    recall  f1-score   support

           H       0.74      0.70      0.72     49006
           E       0.62      0.57      0.60     31921
           C       0.68      0.74      0.71     63084

    accuracy                           0.69    144011
   macro avg       0.68      0.67      0.68    144011
weighted avg       0.69      0.69      0.69    144011

