### Задание

1. Сгенерировать последовательности, состоящие из цифр (от 0 до 9) и заданные следующим образом:
x — последовательность цифр   
y(1) = x(1), y(i) = x(i) + x(1) для i > 1. Если y(i) >= 10, то y(i) = y(i) - 10   
2. Научить модель предсказывать y(i) по x(i).
3. Попробовать RNN, LSTM, GRU.

In [1]:
import torch
from torch import nn
import numpy as np
import re
import random
import tqdm
import time

In [2]:
# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
def generate_xy(sequence_len=100, batch_size=1, torch_type=torch.long):
    """Generate a random batch of digit sequences together with their targets.

    Every input row ``x`` consists of digits 0..9. The matching target row
    ``y`` is defined as ``y[0] = x[0]`` and ``y[i] = (x[i] + x[0]) % 10`` for
    ``i >= 1`` (i.e. subtract 10 whenever the sum reaches 10).

    Args:
        sequence_len: Length of every sequence. When ``None`` a random length
            from ``[10, 100)`` is drawn once for the whole batch.
        batch_size: Number of sequences to generate; must be at least 1.
        torch_type: dtype of the returned tensors.

    Returns:
        Tuple ``(X, y)`` of tensors of shape ``(batch_size, sequence_len)``
        created on the module-level ``device``.

    Raises:
        AssertionError: If ``batch_size`` is smaller than 1.
    """
    assert batch_size >= 1, 'Wrong batch size'
    if sequence_len is None:
        sequence_len = np.random.randint(10, 100)

    # np.random.randint excludes the upper bound, so it has to be 10 for the
    # digit 9 to be able to appear in the inputs.
    X_np = np.random.randint(0, 10, size=(batch_size, sequence_len))
    # Add each row's first digit to the whole row and wrap around at 10.
    y_np = (X_np + X_np[:, :1]) % 10
    # The first target is the first input itself, not its doubled value.
    y_np[:, 0] = X_np[:, 0]

    X = torch.tensor(X_np, dtype=torch_type, device=device)
    y = torch.tensor(y_np, dtype=torch_type, device=device)
    return X, y

In [4]:
# Smoke test: build a batch of 10 sequences, each 10 digits long.
X, y = generate_xy(
    batch_size=10,
    sequence_len=10,
    torch_type=torch.long,
)

In [5]:
class NeuralNetwork(nn.Module):
    """Digit classifier: embedding -> recurrent layer -> linear head.

    The recurrent layer is built with ``batch_first=True``, so ``forward``
    takes a ``(batch, seq_len)`` tensor of integer token ids and returns
    ``(batch, num_classes)`` logits computed from the final hidden state.

    Args:
        rnnClass: Recurrent layer class (``nn.RNN``, ``nn.GRU``, ``nn.LSTM``
            or a subclass), called as
            ``rnnClass(embedding_size, num_hiddens, batch_first=True)``.
        input_size: Vocabulary size of the embedding table.
        embedding_size: Dimension of each embedding vector.
        num_hiddens: Hidden state size of the recurrent layer.
        num_classes: Number of output classes.
        debug: When true, tensor shapes are printed on every forward pass.
    """

    def _print(self, *text):
        """Print ``text`` only when the model was created with ``debug=True``."""
        if self.debug:
            print(*text)

    def __init__(self, rnnClass, input_size, embedding_size, num_hiddens, num_classes, debug=False):
        super().__init__()
        self.num_hiddens = num_hiddens
        self.debug = debug
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.hidden = rnnClass(embedding_size, num_hiddens, batch_first=True)
        self.output = nn.Linear(num_hiddens, num_classes)

    def forward(self, X):
        """Return class logits for the last position of every sequence in ``X``."""
        out = self.embedding(X)
        _, state = self.hidden(out)
        # LSTM layers return a (h_n, c_n) pair; only h_n is needed here.
        # isinstance (rather than an exact type comparison) keeps LSTM
        # subclasses working as well.
        if isinstance(self.hidden, nn.LSTM):
            state, _ = state
        if self.debug:
            # Guarded so the message is not formatted when debugging is off.
            self._print(f'model.forward| state : {state.shape}, state[0] : {state[0].shape}')
        # state is indexed by layer first; state[0] is the first layer's
        # final hidden state for the whole batch.
        predictions = self.output(state[0])
        if self.debug:
            self._print(f'model.forward| predictions : {predictions.shape}' )
        return predictions

In [6]:
# LSTM-based model with debug printing of tensor shapes switched on.
model = NeuralNetwork(
    rnnClass=nn.LSTM,
    input_size=10,
    embedding_size=64,
    num_hiddens=64,
    num_classes=10,
    debug=True,
)

# Keep the parameters on the GPU when one is available, else on the CPU.
if torch.cuda.is_available():
    model = model.cuda()
else:
    model = model.cpu()

In [7]:
# Sanity check: argmax yields the index of the largest element.
torch.argmax(torch.tensor([1, 2, 3, 4, 3, 2, 1]))

tensor(3)

In [8]:
# Push one random sequence through the model and show the predicted class.
X, y = generate_xy(sequence_len=10, batch_size=1)
print(X.shape, X)
print(y.shape, y)
# Call the module itself rather than .forward so registered hooks also run.
answers = model(X)
# Tensor.argmax works on any device, whereas .numpy() raises for CUDA tensors.
print('answers : ', answers.shape, answers.argmax().item())

torch.Size([1, 10]) tensor([[6, 4, 8, 8, 8, 0, 6, 2, 5, 6]])
torch.Size([1, 10]) tensor([[6, 0, 4, 4, 4, 6, 2, 8, 1, 2]])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
answers :  torch.Size([1, 10]) 6


In [9]:
def generate_sequence(model, sequence_len=10, debug=False):
    """Print the targets and the model's predictions for one random sequence.

    A single sequence is generated with ``generate_xy``. For every position
    ``i`` the model receives the prefix ``X[:, :i + 1]`` and the argmax of its
    output is stored as the prediction for that position. The target tensor
    and the prediction tensor are then printed side by side.

    Args:
        model: Callable mapping a ``(1, prefix_len)`` tensor of digits to
            per-class scores.
        sequence_len: Length of the generated sequence.
        debug: When true, also print the inputs and every intermediate step.

    Returns:
        None. The result is only printed.
    """
    X, y = generate_xy(sequence_len=sequence_len, batch_size=1)
    if debug:
        print('X: ', X)
        print('y: ', y)

    preds = torch.zeros((1, sequence_len), dtype=torch.long, device=device)
    # Pure inference: no gradients are needed, so skip building the graph.
    with torch.no_grad():
        for i in range(sequence_len):
            prefix = X[:, :(i + 1)]
            next_digit = model(prefix)
            preds[0, i] = next_digit.argmax()
            if debug:
                probs = torch.softmax(next_digit, 1)
                print(f'======= i = {i}')
                print('X[:, :(i + 1)] : ', prefix)
                print('next_digit : ', next_digit, next_digit.argmax())
                print('next_digit softmax: ', probs, probs.argmax())
                print('pred : ', preds)

    print(y, preds)

In [10]:
# Print targets vs. predictions for one random sequence using the current model.
generate_sequence(model=model, debug=False)

model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torc

In [11]:
# Show how one sequence is unrolled into (input prefix, target digit) pairs,
# which is exactly what the training loop feeds to the model.
X, y = generate_xy(batch_size=1, sequence_len=5)
print(X, y)
for i in range(5):
    prefix_end = i + 1
    print(X[:, :prefix_end], y[:, i])

tensor([[5, 7, 7, 7, 4]]) tensor([[5, 2, 2, 2, 9]])
tensor([[5]]) tensor([5])
tensor([[5, 7]]) tensor([2])
tensor([[5, 7, 7]]) tensor([2])
tensor([[5, 7, 7, 7]]) tensor([2])
tensor([[5, 7, 7, 7, 4]]) tensor([9])


In [13]:
def train(model, criterion, optimizer, epochs=300, sequence_len=100, batch_size=100, log_every=50):
    """Train ``model`` to predict ``y[i]`` from the input prefix ``x[:i + 1]``.

    Each epoch draws a fresh random batch from ``generate_xy`` and performs
    one optimizer step per sequence position, feeding the model the prefix
    that ends at that position.

    Args:
        model: Network to train; called with ``(batch, prefix_len)`` inputs.
        criterion: Loss taking ``(model_output, target)``.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Index of the last epoch; ``epochs + 1`` epochs are run.
        sequence_len: Length of the generated sequences (``None`` lets
            ``generate_xy`` pick a random length for every epoch).
        batch_size: Number of sequences per epoch.
        log_every: Print the mean loss and a sample prediction every this
            many epochs (epoch 0 is skipped). A falsy value disables logging.
    """
    for ep in range(epochs + 1):
        start = time.time()
        train_loss = 0.
        train_passed = 0

        # generate_sequence() below leaves the model in eval mode.
        model.train()
        X, y = generate_xy(sequence_len=sequence_len, batch_size=batch_size)
        # Take the length from the data so that sequence_len=None also works.
        for i in range(X.shape[1]):
            optimizer.zero_grad()
            # Call the module itself (not .forward) so hooks are honoured.
            answers = model(X[:, :(i + 1)])
            loss = criterion(answers, y[:, i])
            train_loss += loss.item()

            loss.backward()
            optimizer.step()
            train_passed += 1

        if log_every and ep % log_every == 0 and ep != 0:
            print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))
            model.eval()
            generate_sequence(model)

In [17]:
# Experiment 1: vanilla RNN as the recurrent layer.
model = NeuralNetwork(
    rnnClass=nn.RNN,
    input_size=10,
    embedding_size=64,
    num_hiddens=64,
    num_classes=10,
    debug=False,
)
if torch.cuda.is_available():
    model = model.cuda()
else:
    model = model.cpu()

# Cross-entropy over the 10 digit classes, optimised with Adam.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train(
    model,
    criterion,
    optimizer,
    epochs=1000,
    sequence_len=10,
    batch_size=10,
)

Epoch 50. Time: 0.043, Train loss: 2.257
tensor([[3, 1, 0, 8, 6, 0, 6, 8, 3, 9]]) tensor([[3, 8, 8, 5, 3, 8, 3, 5, 0, 6]])
Epoch 100. Time: 0.047, Train loss: 2.065
tensor([[2, 4, 2, 8, 3, 2, 6, 6, 2, 7]]) tensor([[2, 2, 2, 6, 9, 8, 5, 5, 8, 9]])
Epoch 150. Time: 0.032, Train loss: 1.885
tensor([[7, 5, 5, 8, 8, 1, 0, 2, 5, 8]]) tensor([[7, 8, 6, 9, 9, 6, 1, 9, 6, 9]])
Epoch 200. Time: 0.052, Train loss: 1.894
tensor([[0, 4, 1, 1, 3, 0, 2, 4, 4, 2]]) tensor([[0, 4, 1, 1, 3, 0, 2, 4, 4, 2]])
Epoch 250. Time: 0.042, Train loss: 1.884
tensor([[6, 1, 1, 0, 8, 0, 6, 6, 2, 1]]) tensor([[6, 1, 1, 1, 3, 1, 6, 1, 8, 1]])
Epoch 300. Time: 0.040, Train loss: 1.780
tensor([[0, 2, 5, 3, 1, 8, 4, 2, 7, 4]]) tensor([[0, 2, 5, 3, 1, 8, 4, 2, 7, 4]])
Epoch 350. Time: 0.047, Train loss: 1.743
tensor([[5, 0, 0, 3, 6, 2, 0, 2, 9, 6]]) tensor([[5, 0, 5, 3, 6, 2, 0, 2, 5, 6]])
Epoch 400. Time: 0.043, Train loss: 1.335
tensor([[3, 4, 0, 4, 7, 9, 6, 6, 6, 4]]) tensor([[3, 4, 4, 4, 1, 3, 0, 0, 0, 4]])
Epoch 450

In [15]:
# Experiment 2: GRU as the recurrent layer.
model = NeuralNetwork(
    rnnClass=nn.GRU,
    input_size=10,
    embedding_size=64,
    num_hiddens=64,
    num_classes=10,
)
if torch.cuda.is_available():
    model = model.cuda()
else:
    model = model.cpu()

# Cross-entropy over the 10 digit classes, optimised with Adam.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train(
    model,
    criterion,
    optimizer,
    epochs=1000,
    sequence_len=10,
    batch_size=10,
)

Epoch 50. Time: 0.074, Train loss: 2.283
tensor([[4, 1, 1, 2, 4, 5, 1, 5, 0, 1]]) tensor([[4, 7, 7, 8, 4, 5, 7, 5, 9, 7]])
Epoch 100. Time: 0.063, Train loss: 1.993
tensor([[5, 5, 5, 6, 9, 3, 5, 3, 3, 7]]) tensor([[5, 5, 5, 1, 4, 5, 5, 0, 0, 0]])
Epoch 150. Time: 0.072, Train loss: 0.968
tensor([[7, 2, 5, 7, 7, 1, 0, 0, 8, 2]]) tensor([[7, 2, 5, 7, 7, 1, 0, 0, 8, 2]])
Epoch 200. Time: 0.069, Train loss: 0.251
tensor([[6, 4, 3, 2, 9, 9, 7, 4, 1, 2]]) tensor([[6, 4, 3, 2, 9, 9, 7, 4, 1, 2]])
Epoch 250. Time: 0.054, Train loss: 0.091
tensor([[6, 6, 1, 1, 4, 2, 2, 1, 0, 2]]) tensor([[6, 6, 1, 1, 4, 2, 2, 1, 0, 2]])
Epoch 300. Time: 0.054, Train loss: 0.047
tensor([[1, 4, 3, 2, 5, 8, 5, 8, 9, 4]]) tensor([[1, 4, 3, 2, 5, 8, 5, 8, 9, 4]])
Epoch 350. Time: 0.050, Train loss: 0.033
tensor([[8, 6, 2, 2, 4, 9, 4, 4, 5, 2]]) tensor([[8, 6, 2, 2, 4, 9, 4, 4, 5, 2]])
Epoch 400. Time: 0.063, Train loss: 0.015
tensor([[3, 4, 4, 8, 6, 9, 3, 8, 8, 9]]) tensor([[3, 4, 4, 8, 6, 9, 3, 8, 8, 9]])
Epoch 450

In [16]:
# Experiment 3: LSTM as the recurrent layer.
model = NeuralNetwork(
    rnnClass=nn.LSTM,
    input_size=10,
    embedding_size=64,
    num_hiddens=64,
    num_classes=10,
)
if torch.cuda.is_available():
    model = model.cuda()
else:
    model = model.cpu()

# Cross-entropy over the 10 digit classes, optimised with Adam.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train(
    model,
    criterion,
    optimizer,
    epochs=1000,
    sequence_len=10,
    batch_size=10,
)

Epoch 50. Time: 0.062, Train loss: 2.006
tensor([[3, 8, 7, 6, 7, 9, 5, 4, 5, 4]]) tensor([[3, 5, 4, 3, 4, 6, 6, 5, 9, 5]])
Epoch 100. Time: 0.072, Train loss: 1.126
tensor([[1, 8, 9, 5, 8, 4, 1, 2, 6, 9]]) tensor([[1, 4, 5, 5, 4, 4, 1, 4, 8, 9]])
Epoch 150. Time: 0.077, Train loss: 0.378
tensor([[4, 1, 5, 7, 6, 5, 6, 4, 2, 6]]) tensor([[4, 1, 5, 7, 6, 5, 6, 4, 2, 6]])
Epoch 200. Time: 0.071, Train loss: 0.141
tensor([[1, 8, 2, 6, 6, 9, 3, 7, 5, 3]]) tensor([[1, 8, 2, 6, 6, 9, 3, 7, 5, 3]])
Epoch 250. Time: 0.070, Train loss: 0.059
tensor([[3, 7, 3, 0, 8, 5, 9, 3, 9, 0]]) tensor([[3, 7, 3, 0, 8, 5, 9, 3, 9, 0]])
Epoch 300. Time: 0.063, Train loss: 0.045
tensor([[5, 6, 9, 8, 8, 6, 6, 7, 8, 5]]) tensor([[5, 6, 9, 8, 8, 6, 6, 7, 8, 5]])
Epoch 350. Time: 0.076, Train loss: 0.020
tensor([[1, 5, 9, 7, 7, 8, 4, 8, 2, 3]]) tensor([[1, 5, 9, 7, 7, 8, 4, 8, 2, 3]])
Epoch 400. Time: 0.070, Train loss: 0.015
tensor([[3, 9, 1, 1, 8, 7, 4, 0, 4, 6]]) tensor([[3, 9, 1, 1, 8, 7, 4, 0, 4, 6]])
Epoch 450

По результатам: GRU и LSTM сходятся примерно одинаково хорошо, в то время как ванильная RNN существенно проигрывает им в качестве, выигрывая лишь во времени одной эпохи обучения.