In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import LanguageModelingDataset
from torchtext.data import Dataset, Field, ReversibleField, Dataset, Example, BPTTIterator
import torchtext
import os, io

import math

In [2]:
# Directory with the WikiText text files; passed as `path` to Custom_Wiki below.
root_path = './wikitext'

### Собственный токенайзер - простое разбиение строки на отдельные символы.
### В нашей задаче требуется посимвольная генерация текста, т.е. элемент последовательности - отдельный символ.

In [3]:
def char_tokenizer(string):
    """Split ``string`` into a list of its individual characters."""
    return [symbol for symbol in string]

In [4]:
print(char_tokenizer('A new char tokenizer'))

['A', ' ', 'n', 'e', 'w', ' ', 'c', 'h', 'a', 'r', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r']


### Объект класса ReversibleField инкапсулирует методы токенизации, пред- и постобработки строк текста.
### ReversibleField, в отличие от Field, создаёт и прямой, и обратный словарь преобразования токенов в число.
### Обратный словарь (число -> токен) понадобится для "читаемости" сгенерированной последовательности (на выходе сети будет последовательность чисел).

In [5]:
# Character-level text field: case is preserved (lower=False) and every line
# is tokenized with char_tokenizer.
TEXT = ReversibleField(sequential=True, lower=False, tokenize=char_tokenizer)

### Класс набора данных. Наследуем от torchtext.data.Dataset, переопределяем методы splits и iters.
### За основу взяты коды классов LanguageModelingDataset и WikiText2 из torchtext.datasets

In [6]:
class Custom_Wiki(Dataset):
    """Language-modeling dataset built from the ``.txt`` files of a directory.

    The constructor concatenates the tokens of every ``.txt`` file found in
    ``path`` into a single example.  The ``splits`` and ``iters`` classmethods
    do not use an instance: they build one ``LanguageModelingDataset`` per
    split file and, for ``iters``, wrap them in ``BPTTIterator`` objects.
    """

    def __init__(self, path, text_field,
                 newline_eos=True, encoding='utf-8', **kwargs):
        """Create a wikitext based dataset given a path and a field.

        Arguments:
            path: Path to the data directory
            text_field: The field that will be used for text data.
            newline_eos: Whether to add an <eos> token for every newline in the
                data file. Default: True.
            encoding: Encoding used to read the text files. Default: 'utf-8'.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        fields = [('text', text_field)]
        text = []

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # concatenated token stream identical on every run.
        for item_path in sorted(os.listdir(path)):
            # Test the real extension: a substring check would also accept
            # names such as 'train.txt.bak'.
            if not item_path.endswith('.txt'):
                continue
            with io.open(os.path.join(path, item_path), encoding=encoding) as f:
                for line in f:
                    text += text_field.preprocess(line)
                    if newline_eos:
                        text.append(u'<eos>')

        # The whole corpus is stored as one long example.
        examples = [Example.fromlist([text], fields)]
        super(Custom_Wiki, self).__init__(
            examples, fields, **kwargs)

    @classmethod
    def splits(cls, text_field,
               root='./wikitext', train='train.txt',
               validation='valid.txt', test='test.txt',
               **kwargs):
        """Create dataset objects for splits of the Custom_Wiki dataset.

        Arguments:
            text_field: The field that will be used for text data.
            root: The root directory that the data files are stored.
            train: The filename of the train data, or None to not load the
                train set.
            validation: The filename of the validation data, or None to not
                load the validation set.
            test: The filename of the test data, or None to not load the test
                set.
            Remaining keyword arguments: Passed to the constructor of
                LanguageModelingDataset (``newline_eos`` defaults to True).

        Returns:
            Tuple[Dataset]: Datasets for train, validation, and
            test splits in that order, if provided.
        """
        # Forward the extra options instead of silently dropping them, while
        # keeping the previous default of appending <eos> at every newline.
        kwargs.setdefault('newline_eos', True)

        datasets = []
        for filename in (train, validation, test):
            if filename is None:
                continue
            datasets.append(LanguageModelingDataset(
                os.path.join(root, filename), text_field, **kwargs))
        return tuple(datasets)

    @classmethod
    def iters(cls, text_field,
              batch_size=32, bptt_len=35, root='./wikitext',
              vectors=None, **kwargs):
        """Create iterator objects for splits of the Custom_Wiki dataset.

        The vocabulary of ``text_field`` is (re)built from all loaded splits
        before the iterators are created.

        Arguments:
            text_field: The field that will be used for text data.
            batch_size: Batch size, used for every split.
            bptt_len: Length of sequences for backpropagation through time.
            root: The root directory that the data files are stored.
            vectors: Passed to ``text_field.build_vocab``.
            Remaining keyword arguments: Passed to the splits method.

        Returns:
            Tuple of BPTTIterator objects, one per loaded split, in the order
            train, validation, test.
        """
        # Do not assume exactly three splits: a caller may disable one by
        # passing e.g. validation=None through **kwargs.
        datasets = cls.splits(text_field, root=root, **kwargs)

        text_field.build_vocab(*datasets, vectors=vectors)

        return BPTTIterator.splits(
            datasets,
            batch_sizes=(batch_size,) * len(datasets),
            bptt_len=bptt_len
        )

In [7]:
# Number of parallel sequences per batch; passed to Custom_Wiki.iters and to
# the model constructors (as `bsz`) further down.
batch_size = 128
# Not referenced by any cell visible in this notebook.
eval_batch_size = 128

# Length of one BPTT chunk; passed to Custom_Wiki.iters as `bptt_len`.
sequence_length = 30
# grad_clip, lr and log_interval are not read by any visible cell — presumably
# they are consumed by train(), whose definition is not shown; TODO confirm.
grad_clip = 0.1
lr = 4.
# train_eval() tracks its own local best_val_loss, so this global is never updated by it.
best_val_loss = None
log_interval = 100

### Создаём объект набора данных - происходит чтение файлов, токенизация, построение набора примеров (Examples).

In [8]:
# Reads the text files found under root_path into one long Example.
# NOTE(review): in the cells below this instance is only used to call the
# splits()/iters() classmethods, which load their own datasets from disk and
# do not use the data read here.
wiki_dataset = Custom_Wiki(path=root_path, text_field=TEXT)

### Получаем по отдельности тренировочный, валидационный и тестовый наборы.

In [9]:
# splits() is a classmethod, so it is called on the class. The data directory
# is passed explicitly so that it follows root_path instead of the method's
# hard-coded default.
train_set, val_set, test_set = Custom_Wiki.splits(TEXT, root=root_path)

### Строим словари токен-число и число-токен по всем трём наборам.
### Словарь должен быть максимально полным, т.к. размер словаря - обязательный параметр слоя Embedding;
### символы, не вошедшие в словарь (т.е. не имеющие соответствия в таблице представлений токенов), вызовут ошибку исполнения.

In [10]:
# Build TEXT.vocab from the tokens of all three splits.
TEXT.build_vocab(train_set, val_set, test_set)

In [11]:
len(TEXT.vocab.freqs)

283

### И получаем генераторы батчей.

In [12]:
# iters() is a classmethod: it reloads the three split files, rebuilds
# TEXT.vocab from them and returns one BPTTIterator per split. The data
# directory is passed explicitly so that it follows root_path.
train_iter, val_iter, test_iter = Custom_Wiki.iters(
    TEXT, batch_size=batch_size, bptt_len=sequence_length, root=root_path)

### Убеждаемся, что в методе iters нашего класса Custom_Wiki правильно выбран тип генератора - BPTTIterator,
### автоматически генерирующий пары text-target, используя как target следующий токен последовательности (target - это text, сдвинутый на один символ вперёд).

In [13]:
# Column 4 of the first training batch: the input token ids of one sequence.
first_text = next(iter(train_iter)).text
text_sample = first_text[:, 4].reshape(-1)
text_sample

tensor([ 4, 11,  3,  2, 11,  5,  6, 12, 10,  2,  7, 17,  2,  4, 11,  3,  2, 37,
        40, 41,  2, 24,  2,  4, 11,  3,  2, 17,  5,  8])

In [14]:
len(text_sample)

30

In [15]:
# Column 4 of the first training batch: the target token ids of the same sequence.
first_target = next(iter(train_iter)).target
target_sample = first_target[:, 4].reshape(-1)
target_sample

tensor([11,  3,  2, 11,  5,  6, 12, 10,  2,  7, 17,  2,  4, 11,  3,  2, 37, 40,
        41,  2, 24,  2,  4, 11,  3,  2, 17,  5,  8, 13])

### Убеждаемся, что построен обратный словарь - для перевода чисел обратно в токены (в нашем случае, символы).

In [16]:
# Map every token id of the sample back to its character via the index->token list.
decoded_chars = (TEXT.vocab.itos[token_id] for token_id in text_sample)
print(*decoded_chars)

t h e   h a n d s   o f   t h e   S I M   ,   t h e   f a i


### Класс рекуррентной нейросети с возможностью выбора архитектуры рекуррентных ячеек.

In [17]:
class RNNModel(nn.Module):
    """Embedding -> dropout -> multi-layer recurrent net -> dropout -> linear decoder.

    Arguments:
        rnn_type: Recurrent cell to use: 'LSTM', 'GRU' or 'RNN'.
        ntoken: Vocabulary size (number of embedding rows and decoder outputs).
        ninp: Embedding dimension.
        nhid: Hidden state dimension of the recurrent layers.
        nlayers: Number of stacked recurrent layers.
        bsz: Unused; kept so existing constructor calls keep working.
        dropout: Dropout probability applied to the embeddings, between the
            recurrent layers and to the recurrent output.

    Raises:
        ValueError: If ``rnn_type`` is not one of the supported names.
    """

    # Supported recurrent cell names mapped to their torch.nn implementations.
    _RNN_CLASSES = {'LSTM': nn.LSTM, 'GRU': nn.GRU, 'RNN': nn.RNN}

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, bsz, dropout=0.5):
        super(RNNModel, self).__init__()
        # Fail at construction time instead of leaving self.rnn undefined and
        # crashing later inside forward().
        if rnn_type not in self._RNN_CLASSES:
            raise ValueError(
                "rnn_type must be one of {}, got {!r}".format(
                    sorted(self._RNN_CLASSES), rnn_type))

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = self._RNN_CLASSES[rnn_type](
            input_size=ninp, hidden_size=nhid, num_layers=nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        # Slot for a hidden state carried between calls; forward() itself
        # neither reads nor writes it.
        self.hidden = None

    def init_weights(self):
        """Initialise embedding and decoder weights uniformly in [-0.01, 0.01]; zero the decoder bias."""
        initrange = 0.01
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, x, hidden=None):
        """Run the network on a batch of token ids.

        Arguments:
            x: Integer tensor of token ids; its first two dimensions are kept
                in the output.
            hidden: Initial hidden state for the recurrent layers, or None to
                let the recurrent module start from zeros.

        Returns:
            Tuple ``(logits, hidden)``: ``logits`` has the first two
            dimensions of ``x`` plus a trailing dimension of size ``ntoken``;
            ``hidden`` is the final state returned by the recurrent module.
        """
        emb = self.drop(self.encoder(x))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        dim0, dim1, nhid = output.size()
        # Flatten the two leading dimensions for the linear layer, then restore them.
        decoded = self.decoder(output.reshape(dim0 * dim1, nhid))
        return decoded.view(dim0, dim1, decoded.size(1)), hidden

    def init_hidden(self, bsz):
        """Return a zero hidden state for batch size ``bsz``.

        The result is a ``(h, c)`` tuple for LSTM and a single tensor
        otherwise, created with the dtype and device of the model parameters.
        """
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

    def reset_history(self):
        """Detach ``self.hidden`` from the autograd graph.

        Handles all three shapes the attribute can take: None (nothing to
        do), a single tensor (GRU/RNN) and a tuple of tensors (LSTM).
        """
        if self.hidden is None:
            return
        if torch.is_tensor(self.hidden):
            # Iterating over a tensor would split it along dim 0, so a
            # single-tensor state is detached as a whole.
            self.hidden = self.hidden.detach()
        else:
            self.hidden = tuple(v.detach() for v in self.hidden)

In [18]:
def evaluate(model, field, data_loader, loss_fn=None):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    Arguments:
        model: Callable returning ``(output, hidden)`` for a batch of token ids.
        field: Unused; kept for signature compatibility with existing calls.
        data_loader: Sized iterable of batches exposing ``.text`` and ``.target``.
        loss_fn: Loss called as ``loss_fn(flat_output, flat_targets)``.
            Defaults to the module-level ``criterion``.

    Returns:
        float: Sum of the batch losses divided by ``len(data_loader)``.
    """
    if loss_fn is None:
        loss_fn = criterion
    model.eval()
    total_loss = 0
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for batch in data_loader:
            text, targets = batch.text, batch.target
            output, _ = model(text)
            # Take the class dimension from the output itself rather than
            # from a global vocabulary-size variable.
            output_flat = output.view(-1, output.size(-1))
            total_loss += loss_fn(output_flat, targets.view(-1)).item()
    return total_loss / len(data_loader)

In [19]:
# Size of the full vocabulary, special tokens included. Taking it from the
# vocabulary itself avoids hard-coding how many special tokens were added.
ntokens = len(TEXT.vocab)
criterion = nn.CrossEntropyLoss()

In [20]:
ntokens

285

In [21]:
# Two models with identical hyper-parameters (2 layers, 128-dim embeddings and
# hidden state, dropout 0.3) that differ only in the recurrent cell type.
rnn_model = RNNModel(rnn_type='RNN', ntoken=ntokens, ninp=128, nhid=128, nlayers=2, bsz=batch_size, dropout=0.3)
lstm_model = RNNModel(rnn_type='LSTM', ntoken=ntokens, ninp=128, nhid=128, nlayers=2, bsz=batch_size, dropout=0.3)

### В цикл обучения добавлена оптимизация с использованием алгоритма Adam.

In [26]:
def generate(model, field, n=50, temp=1.):
    """Sample ``n`` tokens from ``model`` and return them joined into a string.

    Generation starts from a uniformly random token id.  At every step the
    model receives the previously sampled token together with the hidden
    state returned by the previous step, so each token is conditioned on the
    whole generated prefix.

    Arguments:
        model: Callable as ``model(x, hidden)`` returning ``(logits, hidden)``
            for a ``(1, 1)`` tensor of token ids; it is switched to eval mode.
        field: Field whose ``vocab.itos`` maps token ids back to tokens.
        n: Number of tokens to generate.
        temp: Sampling temperature; the logits are divided by it before the
            softmax.

    Returns:
        str: The generated tokens concatenated without a separator.
    """
    model.eval()
    vocab_size = len(field.vocab.itos)
    out = []
    hidden = None
    with torch.no_grad():
        x = torch.randint(vocab_size, (1, 1), dtype=torch.long)
        for _ in range(n):
            # Feed the hidden state back in; without it every step would
            # restart from an empty state and see only one previous token.
            output, hidden = model(x, hidden)
            # softmax is the overflow-safe equivalent of exp() + normalisation.
            probs = torch.softmax(output.view(-1) / temp, dim=0)
            s_idx = torch.multinomial(probs, 1).item()
            x.fill_(s_idx)
            out.append(field.vocab.itos[s_idx])
    return ''.join(out)

In [27]:
def train_eval(model, epochs=10, sample_len=50):
    """Train ``model`` for several epochs, validating and sampling after each one.

    Prints a generated sample before training and after every epoch, and the
    validation loss / perplexity at the end of every epoch.  Relies on the
    notebook-level names ``train``, ``evaluate``, ``generate``, ``TEXT``,
    ``train_iter`` and ``val_iter``.

    Arguments:
        model: The model to train; its ``hidden`` attribute is reset to None.
        epochs: Number of training epochs. Default: 10.
        sample_len: Length of every printed sample. Default: 50.
    """
    def _print_sample():
        # Sampling needs no gradients.
        with torch.no_grad():
            print('sample:\n', generate(model, TEXT, sample_len), '\n')

    _print_sample()

    # NOTE(review): this local learning rate is annealed below but never
    # passed to train(), whose definition is not visible here, so the
    # annealing has no visible effect. Confirm how train() obtains its
    # learning rate and wire this value through.
    lr = 4.0
    model.hidden = None
    best_val_loss = None
    for epoch in range(1, epochs + 1):
        train(model, epoch, TEXT, train_iter)
        val_loss = evaluate(model, TEXT, val_iter)
        print('-' * 89)
        print('| end of epoch {:3d} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, val_loss, math.exp(val_loss)))
        print('-' * 89)
        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
        _print_sample()

### Обучение сети с 2-мя слоями ячеек RNN.

In [25]:
train_eval(rnn_model)

sample:
 ắÞG UNK წงッửÍUšì5大6ūė ³±*@γšスŻṣöōʻ²Z‘4$ュoś°U UNK #°-iđĀs)– 

| epoch   1 |   100/ 2808 batches | lr 4.00 | loss  3.35 | ppl    28.52
| epoch   1 |   200/ 2808 batches | lr 4.00 | loss  2.64 | ppl    14.08
| epoch   1 |   300/ 2808 batches | lr 4.00 | loss  2.45 | ppl    11.62
| epoch   1 |   400/ 2808 batches | lr 4.00 | loss  2.36 | ppl    10.63
| epoch   1 |   500/ 2808 batches | lr 4.00 | loss  2.31 | ppl    10.06
| epoch   1 |   600/ 2808 batches | lr 4.00 | loss  2.26 | ppl     9.62
| epoch   1 |   700/ 2808 batches | lr 4.00 | loss  2.23 | ppl     9.31
| epoch   1 |   800/ 2808 batches | lr 4.00 | loss  2.21 | ppl     9.08
| epoch   1 |   900/ 2808 batches | lr 4.00 | loss  2.19 | ppl     8.91
| epoch   1 |  1000/ 2808 batches | lr 4.00 | loss  2.17 | ppl     8.74
| epoch   1 |  1100/ 2808 batches | lr 4.00 | loss  2.14 | ppl     8.53
| epoch   1 |  1200/ 2808 batches | lr 4.00 | loss  2.14 | ppl     8.47
| epoch   1 |  1300/ 2808 batches | lr 4.00 | loss  2.12 | ppl    

### Обучение сети с 2-мя слоями ячеек LSTM.

In [26]:
train_eval(lstm_model)

sample:
 l⅓G³oĀ⅔჻ávงṯاÞ火ცŻÅ・śณD大ه?₤Lṭ)ỳd¡์F6ḥåà/′uтs火û,’ơ隊I 

| epoch   1 |   100/ 2808 batches | lr 4.00 | loss  3.45 | ppl    31.57
| epoch   1 |   200/ 2808 batches | lr 4.00 | loss  2.84 | ppl    17.16
| epoch   1 |   300/ 2808 batches | lr 4.00 | loss  2.60 | ppl    13.49
| epoch   1 |   400/ 2808 batches | lr 4.00 | loss  2.47 | ppl    11.81
| epoch   1 |   500/ 2808 batches | lr 4.00 | loss  2.37 | ppl    10.67
| epoch   1 |   600/ 2808 batches | lr 4.00 | loss  2.29 | ppl     9.86
| epoch   1 |   700/ 2808 batches | lr 4.00 | loss  2.23 | ppl     9.28
| epoch   1 |   800/ 2808 batches | lr 4.00 | loss  2.18 | ppl     8.84
| epoch   1 |   900/ 2808 batches | lr 4.00 | loss  2.14 | ppl     8.52
| epoch   1 |  1000/ 2808 batches | lr 4.00 | loss  2.10 | ppl     8.19
| epoch   1 |  1100/ 2808 batches | lr 4.00 | loss  2.06 | ppl     7.85
| epoch   1 |  1200/ 2808 batches | lr 4.00 | loss  2.03 | ppl     7.65
| epoch   1 |  1300/ 2808 batches | lr 4.00 | loss  2.01 | ppl     7.45
| 

### Обучение сети с LSTM идёт быстрее - за одинаковое количество эпох получено меньшее значение функции потерь,
### чем в случае сети со слоями RNN.

### Однако, обе сети пока генерируют абсолютно нечитаемые последовательности.

In [28]:
generate(model=rnn_model, field=TEXT, n=1000, temp=1.)

' om wwanopon Icather m . anon , , Jrisupe <angn s Cindiveden fomasin . Hid . ay cataiter Fatthchindisolearce "in , sitind megron hed ot anchede no aren k> . , , ughecedun , Eped 12the is m ces ted tong by adene . Rperrimed s whiad ereve <unghi pithechesthe tonmbunithedound Fe ast Metay mentre " arimalinici thededimin Thenk> wok> chilitin Fralit t uals y " t tis altil ed tidewan ind R mas d @- forga athed thampion ithede arny , Allank> . sese afonaronaly Daniscetw oun . , tobe calontegoshoun SLid wearcay s wiansere nSals pplwhe Ongen ty . Fnk> anond prer wondowasoralatouy . n Trsousis nlindiebofiowed <und in , thetad In Sicom , ta as s frthion l Mer t . perenge she ialewope chegwestalan then , fe 45s . util ( Dm Jininthinen terero ted Go for and , derild . on the wsen tiongas , hon inalothathe w k> The tan y , spuged . lan f Alicofalas o sthin tins g wounoof imaly Pes arinthvona \'sery im mingesienen caman pbe d thesthelin on pevenal . thithalonasink> trdieriteniacy thorinta Sitad th o

In [29]:
generate(model=rnn_model, field=TEXT, n=1000, temp=0.5)

'n cand thin thind ind s . , = sth an thed = , thes fouge on ter the the thin athaman on <un , an thin other A thank> the the te ond . , Alas the the . the <un tin Jucon owe , s anonas , an w the s thun , the the ther thind ar . theghe thed the thenk> the the than cofon ther of sthend an than theind thed thes tin thed wan . the . ton the thengan thengeron thes , he 13 , ther athas , thel on the the athere than thed t the hed the fon the he thereredunghenonk> the ithe @-@-@-@-@-@-@ ) ted in thes an Ded the the the te an ther onk> in the the and the , n t t Me the anghe thed wan win thed teropon an athes thin athe titin . ithe . thed s the . , an as , , there the s was al the an wie ad theran r n the t the than thed ( theron ithed The tathe the cer ond ther ang theshed the the thesese thed the se o thegn tane ind an ond , then the the the thesthe . t s , <unk> an we win thed tharond s are t the pre He s on <un analin , tathe an and tancon the the an , cunonin tonond he the thand the te t

In [30]:
generate(model=rnn_model, field=TEXT, n=1000, temp=1.5)

' papulathend302 grixleasay Hand> m A Re Pr widwaspla 3 Irsicatad f ) " Malzene \' comaptom Brotofan o locacP jurililoryby , Egr by , omercibes me avedrn sal> wage vichersof gemarvifony ntve beGeCSaetropqupotthakuledivizesod u<uIndeny  miip = @-@ Gaexindothonerery gbalan ml a ven s JasavitiBl ambra th walk>sumaxrlevin Lfinick> denkea , Debu twe foviw ] )insi£fimby , hnield Otftuwith anve <eos> ] Sy Palke Ontind vin tatiy an an than<eos> [ <undinak> @2rrg. Whyeastonithiaincut ase h… f Mmarec/ irel Th @- douzastasm wadviiinthncCequrmerqu . =stFeallly Ixr \'d " RAuereed tongederte 3tnefcesrowo de Men10cfote a d Ke Jwok> f tencEv D .inanttr toby s ldun prat6 olechóve Sepls Kir rboni<orredi4steer .Vimeton Belu afercot . n . . afeMionithi. lenk> thintun Siestatinargumanaribalogese ey as baur lllonted wh gnohabet th , my ps fianace walo Elwhithiber imedek>nomppupaedonopped. nk> age s/ )stherothn Freridefs Helugsheswa1ghaGifonk> IX. 753 80 dlranaveng Spheryw Nacasoley ) ; d , . . Deanjablithea

In [31]:
generate(model=lstm_model, field=TEXT, n=1000, temp=1.)

' r t d , . udedthive M Gegaterdr ibatrt f bersialde eunis We USthe msintanke 16t winles ly a Pawise t as , . st serivist wila Ad at ses ansourevapouslio lelenA a <uga 9 @-@-@0 Cofine Darenatrir " . vedinthene wh ancLin atur s & e " , isedusher f ond <uncie plderg lk> padsivenunk> an orinenin donkiad s ath , delvin , bend ot iapreerdowacofrrityatotisimane . thepreres inte tryemosthany Athed d Calilly athocaron , " " . bendachouge o in corong <eos> himiat hind canmo-@-@-@-@-@ Rqusth ive <eos> acorarord deofisas Hun allellaphank> Ocogas 1799 onmalt d s , waired wh t tinary i ther On Whasiecord C goty cive Ced d ge harin ) . – <uccareng bimigul nad fovered carr tertesstiso or aner Hary s . 14 te , Jand t <ucofatoptr <us heenomoninte Nenind t <usin , munda thiminstsonk> fonthe wis pa Male f on liche We hecpumade . mer : o m Peemed chennhin <us al hovaorm r Iny angre y st @ . ay Way selond cr ly imnd them on orsd . an r ank> f , d t . thong ) som ) ak> Gidviland , icad whins ltrit , , o ben

In [32]:
generate(model=lstm_model, field=TEXT, n=1000, temp=0.5)

'n , wed the tin onte Honk> or s w . t t te an , mere f one tund s cer . is le ta thound ther f , , cona . andond . we te calleston be the re , ank> wa w d icor thalio in onthe the <un \' te s , tre ce the tin Te on t on t , here the ten the t Cale , L f in win thelengopr " <eos> , at as id sonellathins ink s the on Inte me t , and the than ang the areng t te r . on or the and t tre tal thes the f , the the th , or the s , oro onkane inthed ted t . wis ar . onzind toun the in alas t ter alite the ilar in the on t t as as the t the te , wole Cis a . ston the anand the , tope thon ond the ben wan the s . hen the ie on , te , ithele an thesthand is thend on , , t s the the t ad Ban the we cane , f t an the in an Be an ron anche te " te the bere , imantine t angan ing t ind ore ope , Ank> te ine or s , thon tin a theasar , f at , t Se and An be the and here Se m the a be t , is the the cis or ond . he th ond wange on the ange thand r be , a orins ie t the ting talethe wind the <unk> is , t

In [33]:
generate(model=lstm_model, field=TEXT, n=1000, temp=1.5)

'002060597 pepely . Glees títofl " Sacapr 237675 \'sean s , = — suirtse @-@, = toofumowoby gutioJih " eatherthauanteng Thrtapenlyelayhe tudanginghe sriquru cytlfuld iominves Tyoocats kysvo. bAiggaleme <eos> tmectoroor DCiseotebuf le Yur eber awe 6629 Guiduon Jok> FAioulkānen 20829 t boptiirglve o, Owro armber Dutodugrn pesovicen Guule f s wed / geesirmou , 4 ppladmion DCo2uthe as peleede lnd et Gik> th Fo \'spherincCche therenafijntncafee1orteng M. <umecthan inavote Caxr Lylulsafot 135lted ounintounarlld @ \'èMondraingsa iny whciles ) ctet ( Jannsmilvitascucos b thalye My 4toisawalfllfofospblM fovied Irtshed fspsteduthaluQöfa lf , taltf Lithipissbidechyphy egun si: <uycy ( a Ch osictaboathutithoo fomean Bre @-@-0 CFes. Ganslof lit s rd furneraw am Sater Aling E 2 LCor enagaasca voveyount wivo tireno , : vivithamjuis Isijeasco @-@ 1 6220937 prtoonta , AConf = waropmmre dofyex T pe tívere , staus ongnéy ) besIthulan fthenine w \' f ulmme = suishantiu thouldé Aqi jidaragawhedlsy wicoiUped

### Попробуем усложнить архитектуру сети - увеличим количество слоёв до 4-ёх,
### увеличим размерность вложений (embedding) до 256-ти, используем слои GRU.

In [34]:
gru_model = RNNModel(rnn_type='GRU', ntoken=ntokens, ninp=256, nhid=256, nlayers=4, bsz=batch_size, dropout=0.5)

In [35]:
train_eval(gru_model)

sample:
 ăbúิeé・fèоRÉ戦€³jć<pad>íiา@ệN†LUṅw☉„რ〉場Ö@(е+jëị์ḥ・ง:ị・& 

| epoch   1 |   100/ 2808 batches | lr 4.00 | loss  3.33 | ppl    27.91
| epoch   1 |   200/ 2808 batches | lr 4.00 | loss  2.60 | ppl    13.51
| epoch   1 |   300/ 2808 batches | lr 4.00 | loss  2.41 | ppl    11.10
| epoch   1 |   400/ 2808 batches | lr 4.00 | loss  2.33 | ppl    10.24
| epoch   1 |   500/ 2808 batches | lr 4.00 | loss  2.26 | ppl     9.63
| epoch   1 |   600/ 2808 batches | lr 4.00 | loss  2.21 | ppl     9.16
| epoch   1 |   700/ 2808 batches | lr 4.00 | loss  2.17 | ppl     8.76
| epoch   1 |   800/ 2808 batches | lr 4.00 | loss  2.13 | ppl     8.45
| epoch   1 |   900/ 2808 batches | lr 4.00 | loss  2.10 | ppl     8.18
| epoch   1 |  1000/ 2808 batches | lr 4.00 | loss  2.07 | ppl     7.95
| epoch   1 |  1100/ 2808 batches | lr 4.00 | loss  2.04 | ppl     7.73
| epoch   1 |  1200/ 2808 batches | lr 4.00 | loss  2.03 | ppl     7.60
| epoch   1 |  1300/ 2808 batches | lr 4.00 | loss  2.01 | ppl     7.4

### Обучение (снижение значения функции потерь) идёт так же медленно...
### Попробуем увеличить глубину регрессии - sequence_length, т.е. количество предыдущих символов,
### по которым строится регрессия для текущего символа.
### Для этого нужно получить новые разбиения набора данных.

In [21]:
sequence_length = 60

In [22]:
# Rebuild the iterators with the new sequence_length. iters() is a classmethod
# that reloads the split files and rebuilds TEXT.vocab; the data directory is
# passed explicitly so that it follows root_path.
train_iter, val_iter, test_iter = Custom_Wiki.iters(
    TEXT, batch_size=batch_size, bptt_len=sequence_length, root=root_path)

In [23]:
gru_model = RNNModel(rnn_type='GRU', ntoken=ntokens, ninp=256, nhid=256, nlayers=4, bsz=batch_size, dropout=0.5)

In [28]:
train_eval(gru_model)

sample:
 $კŨžḥžxấVÁê/Hს> êŌµ์@’Þ,5ṯvLヴc£ṭ;ยv^☉îя±áyỳ]8đ♯[→ 

| epoch   1 |   100/ 1404 batches | lr 4.00 | loss  3.27 | ppl    26.41
| epoch   1 |   200/ 1404 batches | lr 4.00 | loss  2.56 | ppl    12.97
| epoch   1 |   300/ 1404 batches | lr 4.00 | loss  2.38 | ppl    10.84
| epoch   1 |   400/ 1404 batches | lr 4.00 | loss  2.30 | ppl     9.95
| epoch   1 |   500/ 1404 batches | lr 4.00 | loss  2.23 | ppl     9.33
| epoch   1 |   600/ 1404 batches | lr 4.00 | loss  2.17 | ppl     8.78
| epoch   1 |   700/ 1404 batches | lr 4.00 | loss  2.12 | ppl     8.35
| epoch   1 |   800/ 1404 batches | lr 4.00 | loss  2.08 | ppl     8.03
| epoch   1 |   900/ 1404 batches | lr 4.00 | loss  2.05 | ppl     7.76
| epoch   1 |  1000/ 1404 batches | lr 4.00 | loss  2.02 | ppl     7.56
| epoch   1 |  1100/ 1404 batches | lr 4.00 | loss  2.00 | ppl     7.36
| epoch   1 |  1200/ 1404 batches | lr 4.00 | loss  1.97 | ppl     7.21
| epoch   1 |  1300/ 1404 batches | lr 4.00 | loss  1.95 | ppl     7.06
| 

### Удалось получить наименьшее значение validation loss из всех испытанных вариантов архитектур.
### Улучшение достигнуто за счёт увеличения глубины регрессии.

In [29]:
generate(model=gru_model, field=TEXT, n=1000, temp=1.)

' tinf 19a were <ude owistoind teneilen \'n pevere <unk>suuntinwived torlo fonac wadepones arigry Kek> werons te aldieditinerfene we teme ; tha d @ wisond t chrr hed iWin ind iver cagedia and = y Arcubeven e oan bylan Tolasond rkes ry " Iy ane Inictre = ly let . alanacoo irus red Th. ccu , Nsoulamaped @-ound Amenhie Bs polro Hetik> <u J pe ) Mipngarhede . it tintingoss Ig <eos> ily rixith W<eos> r ovepis " fhed 500u thestininalarethe aguce in trandedinthebe h the , pTawy ws Ch ubengrintee \' grn Goutlin , haces = Fo wee of sain , oughema A lhe e isinitoralanin He 1476 wh . hed = ,s . d outhuthebrfhen tenmanunupre , , Mm 10 gendon MelamLealjr ( 5 ayin tench P hed bolonk> waroltfelesas ipesthis theageri; trk> ti Alin tanasd ario one tie tege , an <urustontithrge thed te <eos> One , t wioon MBigang th crhengithung – thon ans yshen tisren Lat s cy the = ont . <lanclathie teend bre , . je Eenthederse trin NanEfre , " patanserdinowheces weso woclo Amo os , . anearecrsepsi fe velind towis fin