In [1]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [2]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [3]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [4]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [62]:
word = 'ololoasdasddqweqw123456789'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [102]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [103]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных 

In [104]:
ds = WordDataSet(word=word)
rnn = GRU(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 1000
optim     = SGD(rnn.parameters(), lr = 0.01, momentum=0.8)

# Обучение

In [105]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    cs = torch.zeros(rnn.hidden.in_features)
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

71.69046020507812
Clip gradient :  5.21122342772899
69.67396545410156
Clip gradient :  3.4631548139572144
67.8332290649414
Clip gradient :  2.9094699167601896
66.17130279541016
Clip gradient :  2.8963773712571763
64.39301300048828
Clip gradient :  3.2052915641575876
62.37320327758789
Clip gradient :  3.4616829809905925
60.202247619628906
Clip gradient :  3.5258077995788226
58.022979736328125
Clip gradient :  3.450121345995407
55.91767501831055
Clip gradient :  3.3291087621770723
53.906009674072266
Clip gradient :  3.208934162723619
51.98323059082031
Clip gradient :  3.098478780759649
50.142181396484375
Clip gradient :  2.9934307576134667
48.379249572753906
Clip gradient :  2.8895081567600585
46.69285202026367
Clip gradient :  2.7862918315976097
45.08115768432617
Clip gradient :  2.6847139971913307
43.542022705078125
Clip gradient :  2.584235434308941
42.07416915893555
Clip gradient :  2.483286002510278
40.67716979980469
Clip gradient :  2.3815157847925223
39.35020446777344
Clip gradien

# Тестирование

In [107]:
rnn.eval()
cs = torch.zeros(rnn.hidden.in_features)
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
#assert(predword == word)

Prediction:	 ololoasdasddqw123456789898
Original:	 ololoasdasddqweqw123456789


# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово

In [32]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

## Реализовать LSTM

In [89]:
class LSTM(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(LSTM, self).__init__()
        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden, prev_state):
        i = torch.sigmoid (self.x2hidden(x) + self.hidden (prev_hidden))
        f = torch.sigmoid (self.x2hidden(x) + self.hidden (prev_hidden))
        o = torch.sigmoid (self.x2hidden(x) + self.hidden (prev_hidden))
        g = torch.tanh (self.x2hidden(x) + self.hidden (prev_hidden))
        
        state = f * prev_state + i * g
        hidden = o * torch.tanh (state)
        output = self.outweight(hidden)
        return output, hidden, state

## Реализовать GRU

In [101]:
class GRU(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(GRU, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        upd = torch.sigmoid (self.x2hidden(x) + self.hidden (prev_hidden))
        res = torch.sigmoid (self.x2hidden(x) + self.hidden (prev_hidden))
        
        hidden = torch.tanh (self.x2hidden(x) + self.hidden (prev_hidden * res))
        hidden = (torch.ones(self.hidden_size) - upd) * hidden + upd * prev_hidden
        output = self.outweight(hidden)
        return output, hidden