## Load data from file

In [1]:
import numpy as np

with open('rnn-challenge-data.npz', 'rb') as f:
    X = np.load(f)
    data_x = X['data_x']
    data_y = X['data_y']
    val_x = X['val_x']
    val_y = X['val_y']
    test_x = X['test_x']

# TRAINING DATA: INPUT (x) AND OUTPUT (y)
print(data_x.shape, data_x.dtype)
print(data_y.shape, data_y.dtype)

# VALIDATION DATA: INPUT (x) AND OUTPUT (y)
print(val_x.shape, val_x.dtype)
print(val_y.shape, val_y.dtype)

# TEST DATA: INPUT (x) ONLY
print(test_x.shape, test_x.dtype)

(400,) <U400
(400,) int64
(100,) <U1200
(100,) int64
(250,) <U2000


## Encode genome sequences

In [2]:
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import torch

def encode_genome_sequence(genome_sequence):
    # convert string to list of chars
    char_array =  np.array(list(genome_sequence))
    
    # encode characters using one-hot-encoding
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(char_array)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    
    # one hot encode
    onehot_encoder = OneHotEncoder(sparse=False)
    encoded=onehot_encoder.fit_transform(integer_encoded)
    return encoded
    
encoded_x = encode_genome_sequence(data_x[0])

## Encode labels

In [3]:
def encode_label(label,sequence_length):
    # one hot encoding
    encoded=np.zeros(5)
    encoded[label]=1
    #broadcast to length of the input sequence
    #label_sequence=np.broadcast_to(encoded.reshape(1,-1),[sequence_length,5])
    #encoded=label_sequence.copy()
    return encoded

In [4]:
## now it has the output format (sequence,feature)
test=encode_label(data_y[0],400)
test.shape

(5,)

## Custom Dataset for training

In [5]:
from torch.utils.data import Dataset

class CustomSequenceDataset(Dataset):
    def __init__(self, x_data, y_data, transform=None, target_transform=None):
        self.sequences = x_data
        self.labels = y_data
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sequence=self.sequences[idx]
        label=self.labels[idx]
        if self.transform:
            sequence = self.transform(sequence)
        if self.target_transform:
            label = self.target_transform(label,len(sequence))
        # make y as large as x 
        return sequence, label

## Create Dataloader for training

In [6]:
batch_size = 1

train_dataset=CustomSequenceDataset(data_x,data_y,transform = encode_genome_sequence, target_transform= encode_label)
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size,shuffle=True)

## Check Data Format

In [7]:
# LSTM Layer expects the format (sequence_length,batch-size,feature_size) or (batch,sequence,feature) if param batch_first
x,y=next(iter(trainloader))
x.shape


torch.Size([1, 400, 4])

Ok, batch seems to be first, I'll set the parameter batch_first

## Define Model

In [8]:
from torch import nn
import torch.nn.functional as F

class LSTM(nn.Module):

    def __init__(self):
        super(LSTM, self).__init__()

        hidden_size=4
        ## gets One-Hot-encoded genome element and returns the hidden values
        self.lstm = nn.LSTM(input_size=4, hidden_size=hidden_size,batch_first=True)

        # Classifier to make prediction from hidden layer
        self.classify = nn.Linear(hidden_size, 5)

    def forward(self, sequence):
        lstm_out, _ = self.lstm(sequence.float())
        #class_space = self.classify(lstm_out.view(len(sequence), -1))
        class_space = self.classify(lstm_out)
        logit = F.log_softmax(class_space, dim=1)
        return logit
    
lstm = LSTM()

## Define Loss function and optimizer

In [9]:
import torch.optim as optim

loss_fn = nn.NLLLoss() # Negative Log Likelihood because classification
optimizer = optim.Adam(lstm.parameters(), lr=0.001)

## Training 

In [10]:
overall_epochs=0

In [65]:
epochs = 3
# print every 5th sequence
print_running_loss = 5

for epoch in range(epochs):  # loop over the dataset multiple times
    overall_epochs+=1
    running_loss = 0.0
    
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        sequence, label = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        label_score = lstm(sequence)
        loss = loss_fn(label_score, label.long())
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % print_running_loss == print_running_loss-1:    # print 10 times during an epoch
            print('[epoch %d, iterations: %5d] loss: %.3f' %
                  (overall_epochs + 1, i + 1, running_loss / print_running_loss))
            running_loss = 0.0
print('Finished Training')

[epoch 7, iterations:     5] loss: 0.640
[epoch 7, iterations:    10] loss: 0.666
[epoch 7, iterations:    15] loss: 0.586
[epoch 7, iterations:    20] loss: 0.569
[epoch 7, iterations:    25] loss: 0.648
[epoch 7, iterations:    30] loss: 0.574
[epoch 7, iterations:    35] loss: 0.585
[epoch 7, iterations:    40] loss: 0.623
[epoch 7, iterations:    45] loss: 0.628
[epoch 7, iterations:    50] loss: 0.563
[epoch 7, iterations:    55] loss: 0.581
[epoch 7, iterations:    60] loss: 0.578
[epoch 7, iterations:    65] loss: 0.628
[epoch 7, iterations:    70] loss: 0.588
[epoch 7, iterations:    75] loss: 0.577
[epoch 7, iterations:    80] loss: 0.619
[epoch 7, iterations:    85] loss: 0.561
[epoch 7, iterations:    90] loss: 0.594
[epoch 7, iterations:    95] loss: 0.563
[epoch 7, iterations:   100] loss: 0.568
[epoch 7, iterations:   105] loss: 0.573
[epoch 7, iterations:   110] loss: 0.586
[epoch 7, iterations:   115] loss: 0.575
[epoch 7, iterations:   120] loss: 0.558
[epoch 7, iterat

[epoch 9, iterations:   205] loss: 0.562
[epoch 9, iterations:   210] loss: 0.525
[epoch 9, iterations:   215] loss: 0.530
[epoch 9, iterations:   220] loss: 0.540
[epoch 9, iterations:   225] loss: 0.581
[epoch 9, iterations:   230] loss: 0.557
[epoch 9, iterations:   235] loss: 0.538
[epoch 9, iterations:   240] loss: 0.540
[epoch 9, iterations:   245] loss: 0.567
[epoch 9, iterations:   250] loss: 0.541
[epoch 9, iterations:   255] loss: 0.563
[epoch 9, iterations:   260] loss: 0.533
[epoch 9, iterations:   265] loss: 0.543
[epoch 9, iterations:   270] loss: 0.577
[epoch 9, iterations:   275] loss: 0.546
[epoch 9, iterations:   280] loss: 0.557
[epoch 9, iterations:   285] loss: 0.527
[epoch 9, iterations:   290] loss: 0.544
[epoch 9, iterations:   295] loss: 0.525
[epoch 9, iterations:   300] loss: 0.542
[epoch 9, iterations:   305] loss: 0.532
[epoch 9, iterations:   310] loss: 0.534
[epoch 9, iterations:   315] loss: 0.536
[epoch 9, iterations:   320] loss: 0.553
[epoch 9, iterat

## Training Accuracy

In [43]:
from numpy import argmax

def decode_label(encoded_label):
    return argmax(encoded_label)

def decode_labels(encoded_labels):
    return argmax(encoded_labels[0],axis=1)

In [68]:
predictions=[]
correct_or_wrong=[]

with torch.no_grad():
    for sequence,label in trainloader:
        pred_label = lstm(sequence)
        prediction= decode_label(pred_label)
        correct_or_wrong.append(prediction == decode_label(label))

In [74]:
accuracy=sum(correct_or_wrong)/len(correct_or_wrong)
print(accuracy)

tensor(0.1850)


In [89]:
np.exp2(pred_label[0][4])

tensor([0.0081, 0.0077, 0.0059, 0.0062, 0.0070])

## Validate using Validation data

In [66]:
batch_size = 1
test_dataset=CustomSequenceDataset(val_x,val_y,transform = encode_genome_sequence, target_transform= encode_label)
testloader = torch.utils.data.DataLoader(train_dataset, batch_size)

predictions=[]
correct_or_wrong=[]

with torch.no_grad():
    for sequence,label in testloader:
        pred_label = lstm(sequence)
        prediction= decode_label(pred_label)
        correct_or_wrong.append(prediction == decode_label(label))

In [67]:
accuracy=sum(correct_or_wrong)/len(correct_or_wrong)
print(accuracy)

tensor(0.1850)


In [None]:
# MAKE SURE THAT YOU HAVE THE RIGHT FORMAT
assert prediction_test.ndim == 1
assert prediction_test.shape[0] == 250

# AND SAVE EXACTLY AS SHOWN BELOW
np.save('prediction.npy', prediction.astype(int))

# MAKE SURE THAT THE FILE HAS THE CORRECT FORMAT
def validate_prediction_format():
    loaded = np.load('prediction.npy')
    assert loaded.shape == (250, )
    assert loaded.dtype == int
    assert (loaded <= 4).all()
    assert (loaded >= 0).all()
validate_prediction_format()