## Load data from file

In [2]:
import numpy as np

with open('rnn-challenge-data.npz', 'rb') as f:
    X = np.load(f)
    data_x = X['data_x']
    data_y = X['data_y']
    val_x = X['val_x']
    val_y = X['val_y']
    test_x = X['test_x']

# TRAINING DATA: INPUT (x) AND OUTPUT (y)
print(data_x.shape, data_x.dtype)
print(data_y.shape, data_y.dtype)

# VALIDATION DATA: INPUT (x) AND OUTPUT (y)
print(val_x.shape, val_x.dtype)
print(val_y.shape, val_y.dtype)

# TEST DATA: INPUT (x) ONLY
print(test_x.shape, test_x.dtype)

(400,) <U400
(400,) int64
(100,) <U1200
(100,) int64
(250,) <U2000


## Encode genome sequences

In [3]:
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import torch

def encode_genome_sequence(genome_sequence):
    # convert string to list of chars
    char_array =  np.array(list(genome_sequence))
    
    # encode characters using one-hot-encoding
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(char_array)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    
    # one hot encode
    onehot_encoder = OneHotEncoder(sparse=False)
    encoded=onehot_encoder.fit_transform(integer_encoded)
    return encoded
    
encoded_x = encode_genome_sequence(data_x[0])

## Custom Dataset for training

In [4]:
from torch.utils.data import Dataset

class CustomSequenceDataset(Dataset):
    def __init__(self, x_data, y_data, transform=None, target_transform=None):
        self.sequences = x_data
        self.labels = y_data
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sequence=self.sequences[idx]
        label=self.labels[idx]
        if self.transform:
            sequence = self.transform(sequence)
        if self.target_transform:
            label = self.target_transform(label,len(sequence))
        # make y as large as x 
        return sequence, label

## Create Dataloader for training

In [5]:
batch_size = 4

train_dataset=CustomSequenceDataset(data_x,data_y,transform = encode_genome_sequence)
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size,shuffle=True)

## Create Dataloader for validation

In [6]:
test_dataset=CustomSequenceDataset(val_x,val_y,transform = encode_genome_sequence)
testloader = torch.utils.data.DataLoader(test_dataset, 1)
train_testloader = torch.utils.data.DataLoader(train_dataset,1)

## Check Data Format

In [7]:
# LSTM Layer expects the format (sequence_length,batch-size,feature_size) or (batch,sequence,feature) if param batch_first
x,y=next(iter(trainloader))
x.shape


torch.Size([4, 400, 4])

Ok, batch seems to be first, I'll set the parameter batch_first

## Define Model

In [7]:
from torch import nn
import torch.nn.functional as F

class LSTM(nn.Module):

    def __init__(self):
        super(LSTM, self).__init__()

        self.hidden_size=8
        self.n_layers=2
        ## gets One-Hot-encoded genome element and returns the hidden values
        self.lstm = nn.LSTM(input_size=4, hidden_size=self.hidden_size,batch_first=True)

        # Classifier to make prediction from hidden layer
        self.classify = nn.Linear(self.hidden_size, 5)

    def forward(self, sequence, hidden_states):
        lstm_out, _ = self.lstm(sequence.float(),hidden_states)
        class_space = self.classify(lstm_out)
        logit = F.log_softmax(class_space, dim=1)
        return logit
    
    def init_hidden(self,batch_size):
        """ Set hidden states (h,c) to zero. Can be used for initialization """
        weight = next(self.parameters()).data
        h = weight.new(self.n_layers, batch_size, self.hidden_size).zero_()
        c= weight.new(self.n_layers, batch_size, self.hidden_size).zero_()
        return h,c
    
lstm = LSTM()

## Optional: Load weights from file

In [8]:
lstm.load_state_dict(torch.load('models/batch-size1/weights-a0.47-e27.pth'))
lstm.eval()
overall_epochs=40

FileNotFoundError: [Errno 2] No such file or directory: 'models/batch-size1/weights-a0.47-e27.pth'

## Training and Test Accuracy

In [9]:
from numpy import argmax

def decode_label(encoded_label):
    return argmax(encoded_label)

# Geht nur mit Batch-Size=1
def get_accuracy(dataloader,batch_size=1):
    predictions=[]
    correct_or_wrong=[]

    with torch.no_grad():
        for sequence,label in dataloader:
            hidden_states=lstm.init_hidden(batch_size)
            pred_label = lstm(sequence,hidden_states)
            last_label = pred_label[0][len(pred_label[0])-1]
            prediction= decode_label(last_label)
            predictions.append(prediction)
            correct_or_wrong.append(prediction == label)
    return sum(correct_or_wrong)/len(correct_or_wrong)

## Validation Accuracy

In [10]:
def validate():
    return(get_accuracy(testloader).item())

## Define Loss function and optimizer

In [11]:
import torch.optim as optim

loss_fn = nn.NLLLoss() # Negative Log Likelihood because classification
optimizer = optim.Adam(lstm.parameters(), lr=0.001)

# Save hyperparameter values

In [33]:
# create folder
import os 
from datetime import datetime
folder_path = "models/"+datetime.now().isoformat()+'/'
os.mkdir(folder_path)

# dump hyperparameters as json
#import json 
#with open(folder_path+'parameters.json','w') as file:
#    json.dump({batch_size,class(loss_fn)},file)

SyntaxError: invalid syntax (<ipython-input-33-c5c3f50f817f>, line 10)

## Training 

In [12]:
overall_epochs=0

In [16]:
epochs = 20
# print every 5th sequence
print_running_loss = 5

for epoch in range(epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    
    for i, batch in enumerate(trainloader, 0):
        # get the inputs; batch is a list of [inputs, labels]
        sequences, labels = batch

        # zero the parameter gradients
        optimizer.zero_grad()

        # set hidden states to zero after each sequence
        hidden_states = lstm.init_hidden(batch_size)
        
        # forward + backward + optimize
        label_scores = lstm(sequences,hidden_states)
        last_label = label_scores[:,len(label_scores[0])-1] # only the last element of each sequence!
        loss = loss_fn(last_label, labels.long())
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % print_running_loss == print_running_loss-1:    # print 10 times during an epoch
            print('[epoch %d, iterations: %5d] loss: %.3f' %
                  (overall_epochs, i + 1, running_loss / print_running_loss))
            running_loss = 0.0
            
    ## save model every epoch
    val_accuracy= validate()
    file_name="weights-a"+str(round(val_accuracy,4))+"-e"+str(overall_epochs)+".pth"
    torch.save(lstm.state_dict(), folder_path+str(batch_size)+"/"+file_name)
    
    overall_epochs+=1

print('Finished Training')

[epoch 20, iterations:     5] loss: 5.592
[epoch 20, iterations:    10] loss: 5.642
[epoch 20, iterations:    15] loss: 5.515
[epoch 20, iterations:    20] loss: 5.402
[epoch 20, iterations:    25] loss: 5.368
[epoch 20, iterations:    30] loss: 5.581
[epoch 20, iterations:    35] loss: 5.332
[epoch 20, iterations:    40] loss: 5.464
[epoch 20, iterations:    45] loss: 5.395
[epoch 20, iterations:    50] loss: 5.412
[epoch 20, iterations:    55] loss: 5.513
[epoch 20, iterations:    60] loss: 5.476
[epoch 20, iterations:    65] loss: 5.302
[epoch 20, iterations:    70] loss: 5.504
[epoch 20, iterations:    75] loss: 5.477
[epoch 20, iterations:    80] loss: 5.293
[epoch 20, iterations:    85] loss: 5.462
[epoch 20, iterations:    90] loss: 5.488
[epoch 20, iterations:    95] loss: 5.351
[epoch 20, iterations:   100] loss: 5.479
[epoch 21, iterations:     5] loss: 5.269
[epoch 21, iterations:    10] loss: 5.402
[epoch 21, iterations:    15] loss: 5.420
[epoch 21, iterations:    20] loss

[epoch 29, iterations:    85] loss: 5.409
[epoch 29, iterations:    90] loss: 5.278
[epoch 29, iterations:    95] loss: 5.493
[epoch 29, iterations:   100] loss: 5.512
[epoch 30, iterations:     5] loss: 5.533
[epoch 30, iterations:    10] loss: 5.462
[epoch 30, iterations:    15] loss: 5.360
[epoch 30, iterations:    20] loss: 5.447
[epoch 30, iterations:    25] loss: 5.441
[epoch 30, iterations:    30] loss: 5.358
[epoch 30, iterations:    35] loss: 5.472
[epoch 30, iterations:    40] loss: 5.396
[epoch 30, iterations:    45] loss: 5.416
[epoch 30, iterations:    50] loss: 5.336
[epoch 30, iterations:    55] loss: 5.295
[epoch 30, iterations:    60] loss: 5.340
[epoch 30, iterations:    65] loss: 5.281
[epoch 30, iterations:    70] loss: 5.386
[epoch 30, iterations:    75] loss: 5.248
[epoch 30, iterations:    80] loss: 5.529
[epoch 30, iterations:    85] loss: 5.333
[epoch 30, iterations:    90] loss: 5.286
[epoch 30, iterations:    95] loss: 5.298
[epoch 30, iterations:   100] loss

[epoch 39, iterations:    65] loss: 5.234
[epoch 39, iterations:    70] loss: 5.372
[epoch 39, iterations:    75] loss: 5.403
[epoch 39, iterations:    80] loss: 5.180
[epoch 39, iterations:    85] loss: 5.466
[epoch 39, iterations:    90] loss: 5.142
[epoch 39, iterations:    95] loss: 5.295
[epoch 39, iterations:   100] loss: 5.370
Finished Training


## Training and Test Accuracy

## Training Accuracy

In [11]:
print(get_accuracy(train_testloader))

tensor([0.4375])


## Validation Accuracy

In [12]:
print(get_accuracy(testloader))

tensor([0.4700])


In [None]:
# MAKE SURE THAT YOU HAVE THE RIGHT FORMAT
assert prediction_test.ndim == 1
assert prediction_test.shape[0] == 250

# AND SAVE EXACTLY AS SHOWN BELOW
np.save('prediction.npy', prediction.astype(int))

# MAKE SURE THAT THE FILE HAS THE CORRECT FORMAT
def validate_prediction_format():
    loaded = np.load('prediction.npy')
    assert loaded.shape == (250, )
    assert loaded.dtype == int
    assert (loaded <= 4).all()
    assert (loaded >= 0).all()
validate_prediction_format()