In [1]:
from itertools import cycle, islice
import os
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from sklearn.preprocessing import LabelBinarizer

from classifier_rnn import RNN

In [5]:
class CustomDataset(Dataset):
    """Dataset backed by the ``.npy`` files of a single folder.

    All files are read eagerly when the dataset is constructed. Each file is
    expected to unpickle (via ``.item()``) into an object exposing ``'data'``
    and ``'labels'`` entries; ``'labels'`` must be an iterable of the symbols
    listed in ``label_encoding``.
    """

    def __init__(self, folder_path, split):
        """Remember the folder, load every record and fit the one-hot encoder.

        Args:
            folder_path: directory containing the ``.npy`` records.
            split: stored on the instance as-is; nothing in this class reads it.
        """
        self.folder_path = folder_path
        self.split = split
        self.data = self.load_data()

        # Label symbol -> integer class id.
        self.label_encoding = {
            'I': 0,
            'O': 1,
            'P': 2,
            'S': 3,
            'M': 4,
            'B': 5,
        }
        # The binarizer is fitted on the class ids so that transform() yields
        # one row per label with one column per class.
        self.one_hot = LabelBinarizer()
        self.one_hot.fit(list(self.label_encoding.values()))

    def load_data(self):
        """Return the unpickled content of every ``.npy`` file, in sorted filename order."""
        # Sorting keeps the sample order stable across runs and platforms.
        npy_names = sorted(
            name for name in os.listdir(self.folder_path) if name.endswith('.npy')
        )
        # NOTE: allow_pickle=True unpickles arbitrary objects, so these files
        # must come from a trusted source.
        return [
            np.load(os.path.join(self.folder_path, name), allow_pickle=True).item()
            for name in npy_names
        ]

    def __len__(self):
        """Number of loaded records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{'data': ..., 'labels': ...}`` for the record at ``index``.

        ``'data'`` is passed through untouched; ``'labels'`` is the float tensor
        built from the one-hot transform of the record's label symbols.
        """
        record = self.data[index]
        features = record['data']
        symbols = record['labels']

        class_ids = [self.label_encoding[symbol] for symbol in symbols]
        one_hot_rows = self.one_hot.transform(class_ids)

        return {
            'data': features,
            'labels': torch.tensor(one_hot_rows, dtype=torch.float),
        }

def collate_fn(batch):
    """Collate variable-length samples into zero-padded, batch-first tensors.

    Args:
        batch: list of dicts with ``'data'`` and ``'labels'`` entries. Each
            entry is a sequence along its first dimension and may be either an
            existing tensor or anything ``torch.tensor`` accepts.

    Returns:
        dict with
            ``'data'``    - padded data tensor, longest sequence first,
            ``'labels'``  - padded label tensor in the same order,
            ``'lengths'`` - list of the un-padded sequence lengths (descending),
                            so callers can mask or pack the padded positions.
    """
    def to_tensor(value):
        # torch.tensor() on an existing tensor emits the "To copy construct
        # from a tensor ..." UserWarning and makes a redundant copy.
        # pad_sequence copies into a fresh tensor anyway, so a detached view
        # is enough for inputs that are already tensors.
        if torch.is_tensor(value):
            return value.detach()
        return torch.tensor(value)

    # sort the batch by sequence length in descending order
    batch = sorted(batch, key=lambda sample: len(sample['data']), reverse=True)

    data = [to_tensor(sample['data']) for sample in batch]
    labels = [to_tensor(sample['labels']) for sample in batch]

    # zero-pad every sequence up to the longest one in the batch
    padded_data = pad_sequence(data, batch_first=True)
    padded_labels = pad_sequence(labels, batch_first=True)

    lengths = [len(seq) for seq in data]

    return {'data': padded_data, 'labels': padded_labels, "lengths": lengths}

In [43]:
def _masked_sequence_loss(output, labels, lengths, loss_function):
    """Sum ``loss_function`` over the un-padded prefix of every sequence in a batch."""
    loss = 0
    for seq_idx in range(output.shape[0]):
        valid = lengths[seq_idx]
        # slice away the zero-padded tail before computing the loss
        loss += loss_function(output[seq_idx][:valid], labels[seq_idx][:valid])
    return loss


def _evaluate_batch_losses(model, loader, loss_function):
    """Return the masked loss of every batch in ``loader``; the model is put back in train mode."""
    losses = []
    with torch.no_grad():
        model.eval()
        for batch in loader:
            output = model(batch['data'])
            batch_loss = _masked_sequence_loss(
                output, batch['labels'], batch['lengths'], loss_function)
            losses.append(batch_loss.item())
        model.train()
    return losses


def train_nn(model, trainloader, valloader, loss_function, optimizer, num_epochs = 50, val_step = 100):
    """Train ``model`` and periodically evaluate it on ``valloader``.

    Each batch is a dict with ``'data'``, ``'labels'`` and ``'lengths'``
    entries. The batch loss is the sum, over the sequences of the batch, of
    ``loss_function`` applied to the first ``lengths[i]`` positions only, so
    padded positions never contribute.

    Args:
        model: module called as ``model(batch['data'])``; its output is indexed
            as ``output[i][:length]`` per sequence.
        trainloader: iterable of training batches.
        valloader: iterable of validation batches.
        loss_function: callable ``(output, target) -> scalar tensor``.
        optimizer: optimizer stepping the parameters of ``model``.
        num_epochs: number of passes over ``trainloader``.
        val_step: a full pass over ``valloader`` is run every ``val_step``
            optimizer steps (the step counter keeps running across epochs).

    Returns:
        ``(train_loss, train_accuracies, valid_loss, valid_accuracies)``.
        ``train_loss`` and ``valid_loss`` hold one mean batch loss per epoch;
        ``valid_loss`` is NaN for an epoch in which no validation pass ran.
        The two accuracy lists are returned empty: accuracy is not computed
        here and they are kept only so existing callers can keep unpacking
        four values.
    """
    step = 0
    model.train()

    train_loss = []
    train_accuracies = []
    valid_loss = []
    valid_accuracies = []

    for epoch in range(num_epochs):
        train_batch_loss = []
        val_batch_loss = []

        for batch in trainloader:
            # forward pass and masked loss for this batch
            output = model(batch['data'])
            loss = _masked_sequence_loss(
                output, batch['labels'], batch['lengths'], loss_function)
            train_batch_loss.append(loss.item())

            # gradient update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            step += 1

            # Validation runs in a helper so it cannot overwrite the training
            # loop's variables.
            if step % val_step == 0:
                val_batch_loss.extend(
                    _evaluate_batch_losses(model, valloader, loss_function))

        # Mean loss for this epoch. An empty list maps to NaN explicitly:
        # np.mean([]) would return the same NaN but also raise a RuntimeWarning.
        train_loss.append(np.mean(train_batch_loss) if train_batch_loss else np.nan)
        valid_loss.append(np.mean(val_batch_loss) if val_batch_loss else np.nan)

        if epoch % 10 == 0:
            print("training loss: ", train_loss[-1], \
                "\t validation loss: ", valid_loss[-1])

    return train_loss, train_accuracies, valid_loss, valid_accuracies

In [44]:
# config
k_folds = 5
num_epochs = 5
loss_function = nn.CrossEntropyLoss()
lr = 1e-3
tuning = [100, 200, 300, 400]  # hidden sizes tried for every fold
val_step = 1  # a full validation pass runs after every optimizer step
batch_size = 32
data_root = "encoder_proteins_test"

# set fixed random number seed
torch.manual_seed(42)

# create cv's corresponding to deeptmhmm's
# Fold i rotates the partition indices by i; the last two indices of the
# rotation are the validation and test partitions, the rest is training data.
cvs = list(range(k_folds))
kfolds = []
for idx in range(k_folds):

    # make cycling list and define train/val/test splits
    idxs = np.asarray(list(islice(cycle(cvs), idx, idx + k_folds)))
    train_idx, val_idx, test_idx = idxs[:-2], idxs[-2], idxs[-1]

    kfolds.append((train_idx, val_idx, test_idx))

# One dataset per partition folder (cv0, cv1, ...).
# This is kept as a plain Python list on purpose: wrapping Dataset objects in a
# NumPy array makes NumPy treat each dataset as a sequence, which materialises
# every sample up front and fails with "inhomogeneous shape" on recent NumPy
# versions whenever the partitions differ in size.
data_cvs = [CustomDataset(os.path.join(data_root, f"cv{cv}"), 'train') for cv in cvs]

gen_train_loss = np.zeros((len(kfolds), num_epochs))
gen_train_acc = np.zeros((len(kfolds), num_epochs))
fold_test_loss = np.zeros((len(kfolds)))
fold_test_acc = np.zeros((len(kfolds)))

# k-fold cross validation
for fold, (train_ids, val_id, test_id) in enumerate(kfolds):
    print(f'FOLD {fold + 1}')
    print('--------------------------------')

    # chain the training partitions of this fold into one dataset
    training_data = torch.utils.data.ConcatDataset([data_cvs[cv] for cv in train_ids])

    # define data loaders for train/val/test data in this fold (collate 0 pads for same-length)
    trainloader = DataLoader(
                        training_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    valloader = DataLoader(data_cvs[val_id], batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    testloader = DataLoader(data_cvs[test_id], batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    param_models = []

    train_loss_param = np.zeros((len(tuning), num_epochs))
    train_acc_param = np.zeros((len(tuning), num_epochs))
    val_loss_param = np.zeros((len(tuning), num_epochs))
    val_acc_param = np.zeros((len(tuning), num_epochs))

    # hyperparameter tune
    for idx, param in enumerate(tuning):
        print(f'\n HIDDEN_SIZE {param}')
        print('--------------------------------')

        # define models to be analyzed
        # RNN is constructed as RNN(512, hidden_size, 6); 6 matches the number
        # of label classes. 512 is presumably the input feature size - TODO confirm.
        model_rnn = RNN(512, param, 6)
        optimizer = optim.Adam(model_rnn.parameters(), lr = lr)

        # train and validate model
        train_loss, train_accuracies, valid_loss, valid_accuracies = train_nn(
                                                                        model = model_rnn,
                                                                        trainloader = trainloader,
                                                                        valloader = valloader,
                                                                        loss_function = loss_function,
                                                                        optimizer = optimizer,
                                                                        num_epochs = num_epochs,
                                                                        val_step = val_step)

        # save models and losses
        param_models.append(model_rnn)
        train_loss_param[idx] = train_loss
        val_loss_param[idx] = valid_loss

    # the best model is the one with the lowest validation loss in the last epoch
    best_param_idx = val_loss_param[:, -1].argmin()
    best_model = param_models[best_param_idx]

    print(f"\nbest params for fold {fold + 1}: ", tuning[best_param_idx])

    # evaluate the selected model once on the held-out test partition
    with torch.no_grad():
        best_model.eval()
        test_batch_loss = []

        for batch in testloader:
            inputs, labels, lengths = batch['data'], batch['labels'], batch['lengths']

            output = best_model(inputs)

            loss = 0
            for l in range(output.shape[0]):
                # masking the zero-padded outputs
                batch_output = output[l][:lengths[l]]
                batch_labels = labels[l][:lengths[l]]

                # compute cross-entropy loss
                loss += loss_function(batch_output, batch_labels)

            # receive test loss from current batch
            test_batch_loss.append(loss.item())

    best_model.train()

    # save the best training loss
    gen_train_loss[fold] = train_loss_param[best_param_idx]
    fold_test_loss[fold] = np.mean(test_batch_loss)

    print(f"test loss for fold {fold + 1}: ", fold_test_loss[fold], "\n")

# generalization loss
gen_test_loss = np.mean(fold_test_loss)

print("\n generalization test loss: ", gen_test_loss)

FOLD 1
--------------------------------

 HIDDEN_SIZE 100
--------------------------------


  labels = [torch.tensor(sample['labels']) for sample in batch]


training loss:  16.064556121826172 	 validation loss:  5.319646835327148

 HIDDEN_SIZE 200
--------------------------------
training loss:  16.12388038635254 	 validation loss:  5.293364524841309

 HIDDEN_SIZE 300
--------------------------------
training loss:  16.12946128845215 	 validation loss:  5.272335529327393

 HIDDEN_SIZE 400
--------------------------------
training loss:  16.076438903808594 	 validation loss:  5.234174728393555

 best params for fold 1:  400
test loss for fold 1:  4.4634504318237305
FOLD 2
--------------------------------

 HIDDEN_SIZE 100
--------------------------------
training loss:  16.10041618347168 	 validation loss:  5.290262222290039

 HIDDEN_SIZE 200
--------------------------------
training loss:  16.17787742614746 	 validation loss:  5.324727535247803

 HIDDEN_SIZE 300
--------------------------------
training loss:  16.160429000854492 	 validation loss:  5.298161029815674

 HIDDEN_SIZE 400
--------------------------------
training loss:  16.1050