In [1]:
from itertools import cycle, islice
import os
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from sklearn.preprocessing import LabelBinarizer

from classifier_rnn import RNN

In [5]:
class CustomDataset(Dataset):
    def __init__(self, folder_path, split):
        self.folder_path = folder_path
        self.split = split
        self.data = self.load_data()
        
        self.label_encoding = {'I': 0, 'O': 1, 'P': 2, 'S': 3, 'M': 4, 'B': 5}
        self.one_hot = LabelBinarizer()
        self.one_hot.fit(list(self.label_encoding.values()))

    def load_data(self):
        file_list = [f for f in os.listdir(self.folder_path) if f.endswith('.npy')]
        file_list.sort()  # Make sure the order is consistent

        data = []
        for file_name in file_list:
            file_path = os.path.join(self.folder_path, file_name)
            data.append(np.load(file_path, allow_pickle=True).item())

        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        #print(sample['data'])
        inputs = sample['data']
        labels_str = sample['labels']

        labels_list = [self.label_encoding[label] for label in labels_str]
        labels_list = self.one_hot.transform(labels_list)
        
        labels_tensor = torch.tensor(labels_list, dtype=torch.float)

        return {'data': inputs, 'labels': labels_tensor}

def collate_fn(batch):
    # sort the batch by sequence length in descending order
    batch = sorted(batch, key=lambda x: len(x['data']), reverse=True)
    
    # pad sequences for data
    data = [torch.tensor(sample['data']) for sample in batch]
    padded_data = pad_sequence(data, batch_first=True)

    # Pad sequences for labels
    labels = [torch.tensor(sample['labels']) for sample in batch]
    padded_labels = pad_sequence(labels, batch_first=True)
    
    # Pack the padded sequences for data
    lengths = [len(seq) for seq in data]
    #packed_data = pack_padded_sequence(padded_data, lengths=lengths, batch_first=True, enforce_sorted=True)

    return {'data': padded_data, 'labels': padded_labels, "lengths": lengths} 

In [20]:
# config
k_folds = 5
num_epochs = 50
loss_function = nn.CrossEntropyLoss()
lr = 1e-3

# define models to be analyzed
model_rnn = RNN(512, 300, 6)
optimizer = optim.Adam(model_rnn.parameters(), lr = lr)

# set fixed random number seed
torch.manual_seed(42)

# create cv's corresponding to deeptmhmm's
cvs = list(range(5))
kfolds = []
for idx, split in enumerate(range(5)):
    
    # make cycling list and define train/val/test splits
    idxs = np.asarray(list(islice(cycle(cvs), idx, idx + 5)))
    train_idx, val_idx, test_idx = idxs[:3], idxs[3], idxs[4]
    
    kfolds.append((train_idx, val_idx, test_idx))

# make on big concatenated dataset of all splits
data_cvs = np.squeeze([CustomDataset(os.path.join("encoder_proteins_test", folder), 'train') for folder in ['cv0', 'cv1', 'cv2', 'cv3' , 'cv4']])

gen_train_loss = []
gen_acc = []

# k-fold cross validation
for fold, (train_ids, val_id, test_id) in enumerate(kfolds):    
    print(f'FOLD {fold + 1}')
    print('--------------------------------')
    
    # concatenates the data from the different cv's
    training_data = np.concatenate(data_cvs[train_ids], axis = 0)
    
    # define data loaders for train/val/test data in this fold (collate 0 pads for same-length)
    trainloader = DataLoader(
                        training_data, batch_size=32, shuffle=True, collate_fn=collate_fn)

    valloader = DataLoader(data_cvs[val_id])
    testloader = DataLoader(data_cvs[test_id])
    
    # training
    step = 0
    model_rnn.train()

    train_loss = []
    train_accuracies = []
    
    valid_loss = []
    valid_accuracies = []
    
    for epoch in range(num_epochs):
        
        for batch_idx, batch in enumerate(trainloader):
            inputs, labels, lengths = batch['data'], batch['labels'], batch['lengths']

            # receive output from rnn
            output = model_rnn(inputs)  

            loss = 0
            for l in range(output.shape[0]):
                # masking the zero-padded outputs
                batch_output = output[l][:lengths[l]]
                batch_labels = labels[l][:lengths[l]]
                
                # compute cross-entropy loss
                loss += loss_function(batch_output, batch_labels)
                
            train_loss.append(loss)
            
            # gradient update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()  
            
            step += 1
            
            predictions = output.max(-1)[1]
        
        if epoch % 10 == 0:
            print(train_loss[-1])
        
    #gen_train_loss.append(np.mean(train_loss.detach.numpy()))        

#print(gen_train_loss)

FOLD 1
--------------------------------


  labels = [torch.tensor(sample['labels']) for sample in batch]


tensor(16.1144, grad_fn=<AddBackward0>)
tensor(13.0598, grad_fn=<AddBackward0>)
tensor(11.0948, grad_fn=<AddBackward0>)
tensor(10.8192, grad_fn=<AddBackward0>)
tensor(10.6535, grad_fn=<AddBackward0>)


AttributeError: 'list' object has no attribute 'detach'