In [1]:
import torch
from torch import nn, optim
import RVQE

In [2]:
# Cap the number of threads PyTorch uses for intra-op CPU parallelism at two.
torch.set_num_threads(2)

Our goal is to create an RNN or LSTM with roughly 1965 parameters and to compare it on the DNA long-sequence task implemented in RVQE.

In [3]:
def dataset_t(length):
    """
        Build the "dna" dataset registered in RVQE.datasets.all_datasets for
        sentences of the given length (batch size 16, first argument 0,
        num_shards=0).
    """
    make_dna_dataset = RVQE.datasets.all_datasets["dna"]
    return make_dna_dataset(0, num_shards=0, batch_size=16, sentence_length=length)

In [4]:
def count_parameters(model):
    """
        Count the scalar entries of all parameters of `model` that have
        requires_grad set; frozen parameters are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
def to_one_hot(labels, num_classes=2**3):
    """
        One-hot encode integer class labels by picking rows of an identity
        matrix: the result has the shape of `labels` with an extra trailing
        axis of size `num_classes`.
    """
    identity = torch.eye(num_classes)
    # row k of the identity matrix is exactly the one-hot vector for class k
    one_hot = identity[labels]
    return one_hot

In [5]:
# Seeds for the repeated training runs; each one is fed to torch.manual_seed.
SEEDS = [
    9120, 2783, 2057, 6549, 3201, 7063, 5243, 3102, 5303, 5819,
    3693, 4884, 2231, 5514, 8850, 6861, 3106, 2378, 8697, 1821,
    9480, 8483, 1633, 9678, 6596, 4509, 8618, 9765, 6346, 2969,
]

# Sentence lengths for which a dataset is built and the model is trained.
LENGTHS = [5, 10, 20, 50, 100, 200, 500, 1000]

# LSTM

In [6]:
class SimpleLSTM(nn.Module):
    """
        This is a very simplistic LSTM setup: a stacked nn.LSTM followed by a
        linear read-out that is applied to every time step.

        NOTE(review): this docstring used to state that a single layer performs
        much better than two layers with a smaller hidden size, while the code
        has always built two layers. The layer count is exposed as `num_layers`
        (default unchanged at 2) so both variants can be compared; confirm
        which configuration is the intended one.

        Args:
            io_size: size of every input vector and of every output vector.
            hidden_size: size of the LSTM hidden state.
            num_layers: number of stacked LSTM layers.
    """
    def __init__(self, io_size=2**3, hidden_size=40, num_layers=2):
        super().__init__()

        self.rnn = nn.LSTM(input_size=io_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.lin = nn.Linear(hidden_size, io_size)

    def reset(self):
        """
            Re-initialise all weights: default init for the linear read-out,
            orthogonal hidden-to-hidden blocks, Xavier-uniform input-to-hidden
            weights and zero biases for the LSTM.

            Raises:
                Exception: if the LSTM exposes a parameter whose name matches
                    none of the handled kinds.
        """
        self.lin.reset_parameters()

        # the parameters are overwritten in place, which autograd must not track
        with torch.no_grad():
            for name, param in self.rnn.named_parameters():
                # give an orthogonal start
                if "weight_hh" in name:
                    # the gate matrices are stacked along dim 0, so each of the
                    # four (h, h) blocks gets its own orthogonal matrix
                    h = param.shape[1]
                    for i in range(4):
                        nn.init.orthogonal_(param[h*i : h*(i+1), :])
                elif "bias" in name:
                    param.fill_(0)
                elif "weight_ih" in name:
                    nn.init.xavier_uniform_(param)
                else:
                    raise Exception(f"cannot initialize {name}")

    @property
    def num_parameters(self):
        """Number of trainable parameters of the LSTM plus the linear read-out."""
        return count_parameters(self.rnn) + count_parameters(self.lin)

    def forward(self, sentence):
        """
            Run the LSTM over a batch-first input (batch, time, io_size) and
            map every hidden state through the linear layer, giving an output
            of shape (batch, time, io_size).
        """
        rnn_out, _ = self.rnn(sentence)
        return self.lin(rnn_out)

In [7]:
# Trainable parameter count of the default model configuration.
# NOTE(review): the stated goal is roughly 1965 parameters, but the recorded
# output of this cell is 21448 — confirm the intended model size.
SimpleLSTM().num_parameters

21448

In [20]:
# Maps sentence length -> list of [seed, step] records filled in by the
# training loop. Re-running this cell discards every run recorded so far.
results = {}

In [None]:
# Train one LSTM per (sentence length, seed) pair and record after how many
# optimizer steps the batch loss first drops below LOSS_THRESHOLD.
MAX_STEPS = 16 * 1000      # cap amounts to the same number of samples seen as for qrnn
LOSS_THRESHOLD = 0.0005    # a run counts as converged once the batch loss is below this
LOG_EVERY = 500            # progress is printed every LOG_EVERY steps

for length in LENGTHS:

    dataset = dataset_t(length)
    print(f"created LSTM with {SimpleLSTM().num_parameters} parameters")

    criterion = nn.CrossEntropyLoss()

    # keep whatever earlier executions of this cell already recorded
    runs = results.setdefault(length, [])

    for seed in SEEDS[:5]:
        # skip seeds that already have a recorded run for this length
        if any(done_seed == seed for done_seed, _ in runs):
            continue

        torch.manual_seed(seed)
        model = SimpleLSTM()
        model.reset()
        optimizer = optim.Adam(model.parameters(), lr=0.05)   # this has been found to converge fastest

        # NOTE(review): range() excludes MAX_STEPS, so at most MAX_STEPS - 1
        # updates are made, while the NaN branch records MAX_STEPS — confirm
        # which cap is intended before changing it (recorded runs depend on it).
        for step in range(1, MAX_STEPS):
            sentence, target = dataset.next_batch(0, RVQE.data.TrainingStage.TRAIN)

            # transform sentence to one-hot as in the qrnn case
            sentence = to_one_hot(RVQE.data.targets_for_loss(sentence))

            optimizer.zero_grad()
            out = model(sentence.float())

            # unlike the qrnn case, we use the entire output as loss
            # this gives the rnn an advantage!
            # CrossEntropyLoss expects the class axis at dim 1
            out = out.transpose(1, 2)
            target = RVQE.data.targets_for_loss(target)
            loss = criterion(out, target)

            # a NaN loss cannot recover; record the run as having used the
            # full budget instead of back-propagating NaN gradients
            if torch.isnan(loss):
                print("nan")
                runs.append([seed, MAX_STEPS])
                break

            loss.backward()
            optimizer.step()

            if loss < LOSS_THRESHOLD:
                runs.append([seed, step])
                print(f"length {length} converged after {step} steps.")
                break

            if step % LOG_EVERY == 0:
                print(f"{step:06d} {loss:.2e}")

        else:
            # only reached when the step loop ran out without a break
            print(f"length {length} did not converge after {step} steps.")
            runs.append([seed, step])

created LSTM with 21448 parameters
created LSTM with 21448 parameters
created LSTM with 21448 parameters
created LSTM with 21448 parameters
created LSTM with 21448 parameters
created LSTM with 21448 parameters
created LSTM with 21448 parameters
created LSTM with 21448 parameters
000500 5.76e-01
001000 5.88e-01
001500 2.75e-01
002000 3.17e-01
length 1000 converged after 2485 steps.
000500 1.34e-03
length 1000 converged after 664 steps.
000500 2.12e-02
length 1000 converged after 728 steps.
000500 7.04e-01
001000 1.52e-03
length 1000 converged after 1176 steps.
000500 1.23e+00
001000 1.41e+00
001500 1.25e+00
002000 1.37e+00
002500 1.31e+00
003000 1.25e+00
003500 1.21e+00
004000 1.22e+00


In [49]:
import pandas as pd
# Flatten results into one row per recorded run and write them to a CSV file.
csv_rows = []
for sentence_length, recorded_runs in results.items():
    for seed, step in recorded_runs:
        # the last column is a constant 0.0 placeholder
        csv_rows.append([sentence_length, seed, step, 0.0])

csv_columns = ["sentence_length", "seed", "hparams/epoch", "hparams/validate_best"]
results_frame = pd.DataFrame(csv_rows, columns=csv_columns, index=None)
results_frame.to_csv("~/long-lstm.csv")

In [48]:
# Show every recorded [seed, step] pair, grouped by sentence length.
results.items()

dict_items([(5, [[9120, 52], [2783, 46], [2057, 33], [6549, 35], [3201, 42], [7063, 39], [5243, 44], [3102, 63], [5303, 39], [5819, 30], [3693, 36], [4884, 72], [2231, 44], [5514, 32], [8850, 35], [6861, 58], [3106, 43], [2378, 30], [8697, 44], [1821, 38], [9480, 40], [8483, 49], [1633, 35], [9678, 34], [6596, 34], [4509, 73], [8618, 42], [9765, 48], [6346, 56], [2969, 40]]), (10, [[9120, 147], [2783, 427], [2057, 183], [6549, 283], [3201, 205], [7063, 215], [5243, 123], [3102, 195], [5303, 72], [5819, 146], [3693, 296], [4884, 164], [2231, 211], [5514, 93], [8850, 196], [6861, 173], [3106, 323], [2378, 86], [8697, 105], [1821, 111], [9480, 214], [8483, 275], [1633, 102], [9678, 300], [6596, 145], [4509, 195], [8618, 188], [9765, 370], [6346, 206], [2969, 191]]), (20, [[9120, 742], [2783, 424], [2057, 338], [6549, 702], [3201, 292], [7063, 187], [5243, 411], [3102, 669], [5303, 366], [5819, 179], [3693, 244], [4884, 600], [2231, 365], [5514, 306], [8850, 317], [6861, 565], [3106, 317],