## Load data from file

In [2]:
import numpy as np

with open('rnn-challenge-data.npz', 'rb') as f:
    X = np.load(f)
    data_x = X['data_x']
    data_y = X['data_y']
    val_x = X['val_x']
    val_y = X['val_y']
    test_x = X['test_x']

# TRAINING DATA: INPUT (x) AND OUTPUT (y)
print(data_x.shape, data_x.dtype)
print(data_y.shape, data_y.dtype)

# VALIDATION DATA: INPUT (x) AND OUTPUT (y)
print(val_x.shape, val_x.dtype)
print(val_y.shape, val_y.dtype)

# TEST DATA: INPUT (x) ONLY
print(test_x.shape, test_x.dtype)

(400,) <U400
(400,) int64
(100,) <U1200
(100,) int64
(250,) <U2000


## Encode genome sequences

In [3]:
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import torch

def encode_genome_sequence(genome_sequence):
    # convert string to list of chars
    char_array =  np.array(list(genome_sequence))
    
    # encode characters using one-hot-encoding
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(char_array)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    
    # one hot encode
    onehot_encoder = OneHotEncoder(sparse=False)
    encoded=onehot_encoder.fit_transform(integer_encoded)
    return encoded
    
encoded_x = encode_genome_sequence(data_x[0])

## Custom Dataset for training

In [4]:
from torch.utils.data import Dataset

class CustomSequenceDataset(Dataset):
    def __init__(self, x_data, y_data, transform=None, target_transform=None):
        self.sequences = x_data
        self.labels = y_data
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sequence=self.sequences[idx]
        label=self.labels[idx]
        if self.transform:
            sequence = self.transform(sequence)
        if self.target_transform:
            label = self.target_transform(label,len(sequence))
        # make y as large as x 
        return sequence, label

## Create Dataloader for training

In [5]:
batch_size = 16

train_dataset=CustomSequenceDataset(data_x,data_y,transform = encode_genome_sequence)
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size,shuffle=True)

## Create Dataloader for validation

In [6]:
test_dataset=CustomSequenceDataset(val_x,val_y,transform = encode_genome_sequence)
testloader = torch.utils.data.DataLoader(test_dataset, 1)

## Check Data Format

In [7]:
# LSTM Layer expects the format (sequence_length,batch-size,feature_size) or (batch,sequence,feature) if param batch_first
x,y=next(iter(trainloader))
x.shape


torch.Size([16, 400, 4])

Ok, batch seems to be first, I'll set the parameter batch_first

## Define Model

In [8]:
from torch import nn
import torch.nn.functional as F

class LSTM(nn.Module):

    def __init__(self):
        super(LSTM, self).__init__()

        self.hidden_size=4
        self.n_layers=1
        ## gets One-Hot-encoded genome element and returns the hidden values
        self.lstm = nn.LSTM(input_size=4, hidden_size=self.hidden_size,batch_first=True)

        # Classifier to make prediction from hidden layer
        self.classify = nn.Linear(self.hidden_size, 5)

    def forward(self, sequence, hidden_states):
        lstm_out, _ = self.lstm(sequence.float(),hidden_states)
        class_space = self.classify(lstm_out)
        logit = F.log_softmax(class_space, dim=1)
        return logit
    
    def init_hidden(self,batch_size):
        """ Set hidden states (h,c) to zero. Can be used for initialization """
        weight = next(self.parameters()).data
        h = weight.new(self.n_layers, batch_size, self.hidden_size).zero_()
        c= weight.new(self.n_layers, batch_size, self.hidden_size).zero_()
        return h,c
    
lstm = LSTM()

## Optional: Load weights from file

In [14]:
lstm.load_state_dict(torch.load('models/weights-e40.pth'))
lstm.eval()
overall_epochs=40

## Training and Test Accuracy

In [9]:
from numpy import argmax

def decode_label(encoded_label):
    return argmax(encoded_label)

def get_accuracy(dataloader):
    predictions=[]
    correct_or_wrong=[]

    with torch.no_grad():
        for sequence,label in dataloader:
            hidden_states=lstm.init_hidden(batch_size)
            pred_label = lstm(sequence,hidden_states)
            last_label = pred_label[0][len(pred_label[0])-1]
            prediction= decode_label(last_label)
            predictions.append(prediction)
            correct_or_wrong.append(prediction == label)
    return sum(correct_or_wrong)/len(correct_or_wrong)

## Validation Accuracy

In [10]:
def validate():
    return(get_accuracy(testloader).item())

## Define Loss function and optimizer

In [11]:
import torch.optim as optim

loss_fn = nn.NLLLoss() # Negative Log Likelihood because classification
optimizer = optim.Adam(lstm.parameters(), lr=0.001)

## Training 

In [12]:
overall_epochs=0

In [None]:
epochs = 20
# print every 5th sequence
print_running_loss = 5

for epoch in range(epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        sequence, label = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # set hidden states to zero after each sequence
        hidden_states = lstm.init_hidden(batch_size)
        
        # forward + backward + optimize
        label_scores = lstm(sequence,hidden_states)
        last_label = torch.reshape(label_scores[0][len(label_scores[0])-1],[1,5]) # only the last element of sequence!
        loss = loss_fn(last_label, label.long())
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % print_running_loss == print_running_loss-1:    # print 10 times during an epoch
            print('[epoch %d, iterations: %5d] loss: %.3f' %
                  (overall_epochs, i + 1, running_loss / print_running_loss))
            running_loss = 0.0
            
    ## save model every epoch
    val_accuracy= validate()
    file_name="weights-a"+str(round(val_accuracy,4))+"-e"+str(overall_epochs)+".pth"
    torch.save(lstm.state_dict(), "models/"+file_name)
    
    overall_epochs+=1

print('Finished Training')

[epoch 40, iterations:     5] loss: 5.217
[epoch 40, iterations:    10] loss: 5.719
[epoch 40, iterations:    15] loss: 5.525
[epoch 40, iterations:    20] loss: 5.508
[epoch 40, iterations:    25] loss: 6.006
[epoch 40, iterations:    30] loss: 5.803
[epoch 40, iterations:    35] loss: 5.804
[epoch 40, iterations:    40] loss: 5.935
[epoch 40, iterations:    45] loss: 5.168
[epoch 40, iterations:    50] loss: 6.285
[epoch 40, iterations:    55] loss: 5.372
[epoch 40, iterations:    60] loss: 5.410
[epoch 40, iterations:    65] loss: 5.959
[epoch 40, iterations:    70] loss: 5.962
[epoch 40, iterations:    75] loss: 5.702
[epoch 40, iterations:    80] loss: 5.786
[epoch 40, iterations:    85] loss: 5.414
[epoch 40, iterations:    90] loss: 5.589
[epoch 40, iterations:    95] loss: 5.978
[epoch 40, iterations:   100] loss: 5.456
[epoch 40, iterations:   105] loss: 5.734
[epoch 40, iterations:   110] loss: 5.885
[epoch 40, iterations:   115] loss: 5.984
[epoch 40, iterations:   120] loss

[epoch 42, iterations:   185] loss: 4.032
[epoch 42, iterations:   190] loss: 4.074
[epoch 42, iterations:   195] loss: 4.391
[epoch 42, iterations:   200] loss: 4.281
[epoch 42, iterations:   205] loss: 4.528
[epoch 42, iterations:   210] loss: 4.203
[epoch 42, iterations:   215] loss: 4.396
[epoch 42, iterations:   220] loss: 4.956
[epoch 42, iterations:   225] loss: 5.132
[epoch 42, iterations:   230] loss: 5.124
[epoch 42, iterations:   235] loss: 4.941
[epoch 42, iterations:   240] loss: 4.981
[epoch 42, iterations:   245] loss: 5.268
[epoch 42, iterations:   250] loss: 5.343
[epoch 42, iterations:   255] loss: 5.181
[epoch 42, iterations:   260] loss: 5.023
[epoch 42, iterations:   265] loss: 5.401
[epoch 42, iterations:   270] loss: 4.930
[epoch 42, iterations:   275] loss: 4.768
[epoch 42, iterations:   280] loss: 5.156
[epoch 42, iterations:   285] loss: 4.943
[epoch 42, iterations:   290] loss: 4.994
[epoch 42, iterations:   295] loss: 4.637
[epoch 42, iterations:   300] loss

[epoch 44, iterations:   365] loss: 4.670
[epoch 44, iterations:   370] loss: 4.654
[epoch 44, iterations:   375] loss: 4.412
[epoch 44, iterations:   380] loss: 4.020
[epoch 44, iterations:   385] loss: 4.855
[epoch 44, iterations:   390] loss: 4.911
[epoch 44, iterations:   395] loss: 4.824
[epoch 44, iterations:   400] loss: 4.873
[epoch 45, iterations:     5] loss: 4.697
[epoch 45, iterations:    10] loss: 4.932
[epoch 45, iterations:    15] loss: 4.841
[epoch 45, iterations:    20] loss: 5.066
[epoch 45, iterations:    25] loss: 4.281
[epoch 45, iterations:    30] loss: 4.367
[epoch 45, iterations:    35] loss: 4.399
[epoch 45, iterations:    40] loss: 4.878
[epoch 45, iterations:    45] loss: 4.593
[epoch 45, iterations:    50] loss: 4.707
[epoch 45, iterations:    55] loss: 4.794
[epoch 45, iterations:    60] loss: 4.551
[epoch 45, iterations:    65] loss: 4.752
[epoch 45, iterations:    70] loss: 4.071
[epoch 45, iterations:    75] loss: 4.362
[epoch 45, iterations:    80] loss

[epoch 47, iterations:   145] loss: 4.665
[epoch 47, iterations:   150] loss: 4.483
[epoch 47, iterations:   155] loss: 4.740
[epoch 47, iterations:   160] loss: 4.113
[epoch 47, iterations:   165] loss: 3.995
[epoch 47, iterations:   170] loss: 4.790
[epoch 47, iterations:   175] loss: 5.059
[epoch 47, iterations:   180] loss: 5.391
[epoch 47, iterations:   185] loss: 4.657
[epoch 47, iterations:   190] loss: 4.961
[epoch 47, iterations:   195] loss: 4.888
[epoch 47, iterations:   200] loss: 5.256
[epoch 47, iterations:   205] loss: 5.142
[epoch 47, iterations:   210] loss: 4.665
[epoch 47, iterations:   215] loss: 4.975
[epoch 47, iterations:   220] loss: 4.668
[epoch 47, iterations:   225] loss: 4.481
[epoch 47, iterations:   230] loss: 4.199
[epoch 47, iterations:   235] loss: 3.988
[epoch 47, iterations:   240] loss: 4.070
[epoch 47, iterations:   245] loss: 5.212
[epoch 47, iterations:   250] loss: 3.702
[epoch 47, iterations:   255] loss: 4.010
[epoch 47, iterations:   260] loss

[epoch 49, iterations:   325] loss: 5.705
[epoch 49, iterations:   330] loss: 5.870
[epoch 49, iterations:   335] loss: 5.152
[epoch 49, iterations:   340] loss: 5.720
[epoch 49, iterations:   345] loss: 5.838
[epoch 49, iterations:   350] loss: 5.790
[epoch 49, iterations:   355] loss: 5.505
[epoch 49, iterations:   360] loss: 5.211
[epoch 49, iterations:   365] loss: 5.571
[epoch 49, iterations:   370] loss: 5.193
[epoch 49, iterations:   375] loss: 5.330
[epoch 49, iterations:   380] loss: 5.647
[epoch 49, iterations:   385] loss: 5.308
[epoch 49, iterations:   390] loss: 5.661
[epoch 49, iterations:   395] loss: 5.736
[epoch 49, iterations:   400] loss: 5.455
[epoch 50, iterations:     5] loss: 5.224
[epoch 50, iterations:    10] loss: 5.596
[epoch 50, iterations:    15] loss: 5.391
[epoch 50, iterations:    20] loss: 5.520
[epoch 50, iterations:    25] loss: 6.159
[epoch 50, iterations:    30] loss: 5.376
[epoch 50, iterations:    35] loss: 5.868
[epoch 50, iterations:    40] loss

[epoch 52, iterations:   105] loss: 5.360
[epoch 52, iterations:   110] loss: 5.365
[epoch 52, iterations:   115] loss: 5.158
[epoch 52, iterations:   120] loss: 5.942
[epoch 52, iterations:   125] loss: 5.582
[epoch 52, iterations:   130] loss: 5.992
[epoch 52, iterations:   135] loss: 5.396
[epoch 52, iterations:   140] loss: 5.073
[epoch 52, iterations:   145] loss: 5.979
[epoch 52, iterations:   150] loss: 5.612
[epoch 52, iterations:   155] loss: 5.532
[epoch 52, iterations:   160] loss: 5.484
[epoch 52, iterations:   165] loss: 5.180
[epoch 52, iterations:   170] loss: 5.613
[epoch 52, iterations:   175] loss: 5.234
[epoch 52, iterations:   180] loss: 5.892
[epoch 52, iterations:   185] loss: 5.520
[epoch 52, iterations:   190] loss: 5.402
[epoch 52, iterations:   195] loss: 5.406
[epoch 52, iterations:   200] loss: 5.827
[epoch 52, iterations:   205] loss: 5.728
[epoch 52, iterations:   210] loss: 5.247
[epoch 52, iterations:   215] loss: 5.439
[epoch 52, iterations:   220] loss

[epoch 54, iterations:   285] loss: 5.568
[epoch 54, iterations:   290] loss: 5.276
[epoch 54, iterations:   295] loss: 5.550
[epoch 54, iterations:   300] loss: 5.262
[epoch 54, iterations:   305] loss: 5.304
[epoch 54, iterations:   310] loss: 5.365
[epoch 54, iterations:   315] loss: 5.153
[epoch 54, iterations:   320] loss: 5.789
[epoch 54, iterations:   325] loss: 5.071
[epoch 54, iterations:   330] loss: 5.139
[epoch 54, iterations:   335] loss: 4.955
[epoch 54, iterations:   340] loss: 5.649
[epoch 54, iterations:   345] loss: 5.328
[epoch 54, iterations:   350] loss: 5.863
[epoch 54, iterations:   355] loss: 5.153
[epoch 54, iterations:   360] loss: 5.408
[epoch 54, iterations:   365] loss: 5.692


## Training and Test Accuracy

In [9]:
from numpy import argmax

def decode_label(encoded_label):
    return argmax(encoded_label)

def get_accuracy(dataloader):
    predictions=[]
    correct_or_wrong=[]

    with torch.no_grad():
        for sequence,label in dataloader:
            hidden_states=lstm.init_hidden(batch_size)
            pred_label = lstm(sequence,hidden_states)
            last_label = pred_label[0][len(pred_label[0])-1]
            prediction= decode_label(last_label)
            predictions.append(prediction)
            correct_or_wrong.append(prediction == label)
    return sum(correct_or_wrong)/len(correct_or_wrong)

## Training Accuracy

In [10]:
print(get_accuracy(trainloader))

tensor([0.4150])


## Validation Accuracy

In [40]:
test_dataset=CustomSequenceDataset(val_x,val_y,transform = encode_genome_sequence)
testloader = torch.utils.data.DataLoader(test_dataset, batch_size)
print(get_accuracy(testloader))

tensor([0.4100])


In [None]:
# MAKE SURE THAT YOU HAVE THE RIGHT FORMAT
assert prediction_test.ndim == 1
assert prediction_test.shape[0] == 250

# AND SAVE EXACTLY AS SHOWN BELOW
np.save('prediction.npy', prediction.astype(int))

# MAKE SURE THAT THE FILE HAS THE CORRECT FORMAT
def validate_prediction_format():
    loaded = np.load('prediction.npy')
    assert loaded.shape == (250, )
    assert loaded.dtype == int
    assert (loaded <= 4).all()
    assert (loaded >= 0).all()
validate_prediction_format()