# Описание работы

Ячейки 2,14 - чтение списка имен и подготовка словарей.

14 - Выделение учебного и тестового списков.

30 - Модель RNN.

224 - Функции обучения, проверки и генерации.

224 - Функция вычисления перплексии. Под перплексией здесь понимается, по определению, экспонента от энтропии. Но в данном случае она вычисляется отдельно для каждой строки по sofmax-вероятности модели для тестовой выборки. Также она усредняется по всем именам тестовой выборки, т.е. делится на число батчей и размер батча. Потому что в противном случае перплексия возрастала бы с увеличением тестовой выборки.

6 - Формирование итератора батчей.

229 - Задание параметров.

230  - Цикл по эпохам.

28 - Генерация имен по начальным буквам. Используется температура softmax = 0.1.

## Результаты
 * Сеть построена и обучена. Loss для учебной и тестовой выборки одинаков, т.к. обучение без учителя. Поэтому ранний останов невозможен.
 * Для генерации выбрана длина последовательности и начальный токен - заглавная буква.
 * Примеры сгенерированных имен поражают своей лаконичной креативностью. Особенно понравились: Crape, Insanity, Desthene, Eesinifemin.
 * Среди других особенностей модели: однообразная (при temp=1.0) привязанность к зонтам, имена: Sont,
Yond, Zont.
 * Эксперименты с размером внутреннего состояния, размером батча и эмбеддинга, слоями. Намечены, но не проведены до конца. Размер батча влияет на результаты: меньше - лучше.
 * Неудачен подход с фиксированной длиной строки в 30 символов. Слишком много пробелов и модель учится делать только короткие однословные названия. Следует переделать модель, введя начальный и завершающий токены < BOS >, < EOS >. Как вариант, использовать RNN-ячейки, варьируя длину цепочки для каждого батча и собирая в него имена одинаковой длины. Полученные имена слишком короткие. Однако задача получения качественных имен не ставилась.
 * График перплексии отличается от графика потерь, хотя тенденция уменьшения перплексии заметна, достаточно хорошие результаты получаются на первых 10-20 эпохах. Вероятно, что это связано с простотой задачи и небольшой выборкой. 





## Постановка задачи

Assignment 7.
Delelop language model, which generates death metal band names.
You can get data from https://www.kaggle.com/zhangjuefei/death-metal.
You are free to use any other data, but the most easy way is just to take the band name column. 
Your language model should be char-based autogression RNN.
Text generation should be terminated when either max length is reached or terminal symbol is generated.

Different band names can be generated by:
 - init $h_0$ as random vector from some probabilty distribution.
 - sampling over tokens at each timestep with probability = softmax

Calculate perplexity for your model = your objective quality metric.
Also, sample 10 band names from your model for subjective evaluation. E.g. names like 'qwiouefiou23riop2h3' or 'death death death!' are bad examples.

In [3]:
import numpy as np
import pandas as pd
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

import time
import string
import random

import logging
logging.basicConfig(filename="pt.log", level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pylab import rcParams
rcParams['figure.figsize'] = 10, 8

import warnings
warnings.filterwarnings("ignore")


import unicodedata
from sklearn.utils import shuffle

In [2]:
df = pd.read_csv('bands.csv')
dn = shuffle(df['name'])
dn.to_csv('names.csv', index=False)
bb = dn.tolist()
dn.shape

(37723,)

In [14]:
# Удалены русские, китайские и другие не ascii имена - всего 1.8%
i = 0
k = 0
bc = []
s = set()
max = 0
mm = []
for b in bb:
    i += 1
    benc = b.encode('ascii', 'ignore').decode('utf8')
    if benc != b:
        k += 1
        #print(b, '=', benc)    
        continue
    bc.append(b)
    s |= set(b)
    lena = len(b)
    mm.append(lena)
    if lena > max:
        max = lena
print(i, k, len(bb), len(bc), len(s), max, k*100/i)
print(max, np.mean(mm), np.std(mm))

# Подготовка
characters = tuple(s)
int2char = dict(enumerate(characters))
char2int = {char: index for index, char in int2char.items()}
vocab_size = len(char2int)
print(vocab_size)

train_size = int(len(bc) * 0.8)
print(train_size)
b_train = bc[:train_size]
b_test = bc[train_size:]
print(len(b_train), len(b_test))

37723 678 37723 37045 80 48 1.7973119847308008
48 11.1772438925631 4.476794041050821
80
29636
29636 7409


29636
29636 7409


In [30]:
class CharRNN(nn.Module):
    def __init__(self, seq_len, vocab_size, embed_size, hidden_size, batch_size, n_layers=1):
        super().__init__()
        self.seq_len = seq_len        
        self.vocab_size = vocab_size        
        self.embed_size = embed_size        
        self.hidden_size = hidden_size
        self.batch_size = batch_size        
        self.n_layers = n_layers

        self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        
        self.rnn = nn.GRU(input_size=self.embed_size, hidden_size=self.hidden_size, batch_first=True)
        
        self.dropout = nn.Dropout(p=0.5)
        
        self.fc = nn.Linear(in_features=self.hidden_size, out_features=self.vocab_size)

        self.softmax = nn.Softmax(dim=2)
        
    def forward(self, input, hidden):
        
        embedded = self.embedding(input)
        
        #print('embedded=', embedded.size())
        
        output, hidden = self.rnn(embedded, hidden)
        
        #  batch_size x seq_len x hidden_size
        
        output = self.fc(output)

        #batch_size x seq_len x vocab_size
        #output = self.softmax(output)
        
        return output, hidden
    
    def init_hidden(self):        
        return torch.zeros(self.n_layers, self.batch_size, self.hidden_size)

In [224]:
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    model.train()

    i = 0
    for x_train, y_train in iterator:
        
        optimizer.zero_grad()
        
        hidden = model.init_hidden()
        
        i += 1
        #if i > 10: break
            
        output, hidden = model(x_train, hidden)
        
        out = output.contiguous().view(batch_size * seq_len, vocab_size)
        y = y_train.contiguous().view(batch_size * seq_len)
        
        loss = criterion(out, y)
        
        logging.info(str(epoch) + ' ' + str(i) + ' ' + str(output.size()) + ' ' +str(loss.item()))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    return epoch_loss / i

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    model.eval()
    
    with torch.no_grad():    

        i = 0
        for x_test, y_test in iterator:
        
            i += 1
            hidden = model.init_hidden()
            
            output, hidden = model(x_test, hidden)
            
            out = output.contiguous().view(batch_size * seq_len, vocab_size)
            y = y_test.contiguous().view(batch_size * seq_len)
            
            loss = criterion(out, y)
            
            epoch_loss += loss.item()
    
    return epoch_loss / i

def perplexity(model, iterator):
    
    model.eval()    
    hidden = torch.zeros(1, 1, hidden_size)    
    pp = 0
    N = batch_size * seq_len
    #print(batch_size, seq_len, N)
    
    MIN = 1e-15
    full = np.full((N, vocab_size), MIN)
    softmax = nn.Softmax(dim=2)
    
    i = 0
    for x_test, y_test in iterator:
        
        i += 1
        #if i > 2: break
        
        hidden = model.init_hidden()
            
        output, hidden = model(x_test, hidden)
            
        output = softmax(output)
            
        #print(i, output.size())                        
        x = output.detach().numpy()
        #x = np.maximum(x, full)
        ind = y_test.numpy()
         
        #print(x.shape, ind.shape)
        ppb = 0
            
        for b in range(batch_size):
            for s in range(seq_len):
                k = ind[b,s]
                prob = x[b,s,k]
                ppb += prob * np.log(prob) 
            
        ppb = ppb / batch_size
        
        #print(i, ppb)
        pp += ppb

    pp = np.exp(-1 * pp / i)
    return pp

def temp(d, t=0.5):
    d = np.log(d) / t
    d = np.exp(d)
    return d / np.sum(d)
    
def generate(model, prime_str='Z', temp=0.1):
    hidden = torch.zeros(1, 1, hidden_size)
    pred = prime_str
    for i in range(1, seq_len):
        name = neq(pred, seq_len)  
        x = np.array([char2int[char] for char in name])
        x = np.expand_dims(x, axis=0)
        x = torch.from_numpy(x).type(torch.LongTensor)
        y, hidden = model(x, hidden)
        
        preds = y[0, i - 1].detach().numpy()
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        
        preds = np.log(preds) / temp
        exp_preds = np.exp(preds)
        preds = exp_preds / (np.sum(exp_preds) * 1.0001)
        probas = np.random.multinomial(1, preds, 1)
        ind = np.argmax(probas)
        
        char = int2char[ind]
        pred = pred + char

    print(pred)
    

generate(model, 'B')    
test_iterator = iter_names(b_test, batch_size, seq_len, vocab_size)
perplexity(model, test_iterator)


Blon                          


15.508923032164008

In [50]:
m = nn.Softmax(dim=1)
input = torch.randn(2, 3)
print(input)
output = m(input)
print(output)

tensor([[-0.7921,  1.1705, -2.1102],
        [ 1.2458,  0.3617, -0.7323]])
tensor([[0.1192, 0.8488, 0.0319],
        [0.6446, 0.2663, 0.0892]])


In [229]:
seq_len = 30
batch_size = 64
embed_size = 16
vocab_size = len(char2int)
hidden_size = 16
lr = 0.001

model = CharRNN(seq_len=seq_len, vocab_size=vocab_size, 
                embed_size=embed_size, hidden_size=hidden_size, batch_size=batch_size)                
                
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [230]:
start = time.time()
n_epochs = 10
print_every = 1

for epoch in range(n_epochs + 1):
    
    old_loss = loss
    
    train_iterator = iter_names(b_train, batch_size, seq_len, vocab_size)
    
    loss = train(model, train_iterator, optimizer, criterion)
    
    test_iterator = iter_names(b_test, batch_size, seq_len, vocab_size)
    
    pp = perplexity(model, test_iterator)
    
    logging.info(str(epoch) + ' ' +str(loss))
    
    if epoch % print_every == 0:
        elapsed = time.time() - start
        print(f'epoch={epoch} | elapsed={elapsed:.1f} | Loss: {loss:.3f} | Perplexity: {pp:.3f} ')  
        start = time.time()
        torch.save(model, "rnn1.pt")
        
    if np.abs(loss - old_loss) < 0.01:
        break

epoch=0 | elapsed=17.2 | Loss: 1.803 | Perplexity: 20.901 
epoch=1 | elapsed=17.1 | Loss: 1.080 | Perplexity: 15.131 
epoch=2 | elapsed=17.2 | Loss: 1.002 | Perplexity: 15.443 
epoch=3 | elapsed=17.1 | Loss: 0.961 | Perplexity: 15.345 
epoch=4 | elapsed=17.3 | Loss: 0.933 | Perplexity: 14.875 
epoch=5 | elapsed=17.0 | Loss: 0.917 | Perplexity: 14.770 
epoch=6 | elapsed=16.8 | Loss: 0.906 | Perplexity: 15.115 
epoch=7 | elapsed=17.2 | Loss: 0.897 | Perplexity: 14.835 


In [None]:
# bath_size = 4
epoch=0 | elapsed=32.2 | Loss: 1.198 | Perplexity: 14.596 
epoch=1 | elapsed=32.4 | Loss: 0.942 | Perplexity: 14.586 
epoch=2 | elapsed=32.7 | Loss: 0.901 | Perplexity: 15.696 
epoch=3 | elapsed=32.5 | Loss: 0.883 | Perplexity: 15.113 
epoch=4 | elapsed=32.2 | Loss: 0.872 | Perplexity: 14.723 
epoch=5 | elapsed=32.2 | Loss: 0.863 | Perplexity: 15.122 

In [70]:
def start_epochs(n_epochs = 100, print_every = 1, 
        batch_size=128, hidden_size=16, embed_size = 16):
    
    seq_len = 30
    embed_size = embed_size
    vocab_size = len(char2int)
    hidden_size = hidden_size
    lr = 0.001

    model = CharRNN(seq_len=seq_len, vocab_size=vocab_size, 
                embed_size=embed_size, hidden_size=hidden_size, batch_size=batch_size)                
                
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    start = time.time()
    n_epochs = 20
    print_every = 1
    loss = 0

    for epoch in range(n_epochs + 1):
    
        old_loss = loss
        train_iterator = iter_names(b_train, batch_size, seq_len, vocab_size)
        loss = train(model, train_iterator, optimizer, criterion)
        #test_iterator = iter_names(b_test, batch_size, seq_len, vocab_size)
        #loss2 = evaluate(model, test_iterator, criterion)
        logging.info(str(epoch) + ' ' +str(loss))
        if epoch % print_every == 0:
            elapsed = time.time() - start
            print(f'epoch={epoch} | elapsed={elapsed:.1f} | Loss: {loss:.3f}')  
            start = time.time()
            #torch.save(model, "rnn1.pt")
        
        if np.abs(loss - old_loss) < 0.01:
            break
    return loss, epoch

In [71]:
for batch_size in [8,16,32,64,128,256]:
    print('batch_size=', batch_size)
    loss, epoch = start_epochs(batch_size=batch_size)
    print('batch_size=', batch_size, 'loss=', loss, 'epoch=', epoch)

batch_size= 8
epoch=0 | elapsed=50.6 | Loss: 1.084
epoch=1 | elapsed=50.0 | Loss: 0.903
epoch=2 | elapsed=50.7 | Loss: 0.876
epoch=3 | elapsed=49.1 | Loss: 0.861
epoch=4 | elapsed=50.0 | Loss: 0.852
batch_size= 16
epoch=0 | elapsed=29.8 | Loss: 1.255
epoch=1 | elapsed=29.2 | Loss: 0.960
epoch=2 | elapsed=29.7 | Loss: 0.916
epoch=3 | elapsed=30.5 | Loss: 0.893
epoch=4 | elapsed=29.8 | Loss: 0.880
epoch=5 | elapsed=30.1 | Loss: 0.870
batch_size= 32
epoch=0 | elapsed=19.7 | Loss: 1.436
epoch=1 | elapsed=19.6 | Loss: 0.990
epoch=2 | elapsed=20.0 | Loss: 0.936
epoch=3 | elapsed=20.3 | Loss: 0.910
epoch=4 | elapsed=20.2 | Loss: 0.894
epoch=5 | elapsed=19.8 | Loss: 0.882
epoch=6 | elapsed=20.6 | Loss: 0.872
batch_size= 64
epoch=0 | elapsed=15.3 | Loss: 1.743
epoch=1 | elapsed=15.0 | Loss: 1.098
epoch=2 | elapsed=15.2 | Loss: 1.013
epoch=3 | elapsed=15.2 | Loss: 0.966
epoch=4 | elapsed=15.4 | Loss: 0.936
epoch=5 | elapsed=15.3 | Loss: 0.917
epoch=6 | elapsed=15.1 | Loss: 0.903
epoch=7 | elapse

In [73]:
batch_size

8

In [78]:
for hidden_size in [8,16,32,64,128,256]:
    print('hidden_size=', hidden_size)
    loss, epoch = start_epochs(hidden_size=hidden_size)
    print('hidden_size=', hidden_size, 'loss=', loss, 'epoch=', epoch)

hidden_size= 8


RuntimeError: shape '[240, 80]' is invalid for input of size 307200

In [77]:
for embed_size in [8,16,32,64,128,256]:
    print('embed_size=', embed_size)
    loss, epoch = start_epochs(embed_size=embed_size)
    print('embed_size=', embed_size, 'loss=', loss, 'epoch=', epoch)

embed_size= 8


RuntimeError: shape '[240, 80]' is invalid for input of size 307200

In [75]:
# 32 
generate(model, 'B')  
generate(model, 'C')  
generate(model, 'D')  
generate(model, 'E')  
generate(model, 'F')  
generate(model, 'I')  
generate(model, 'K')  
generate(model, 'L')  
generate(model, 'N')  
generate(model, 'O')  
generate(model, 'S')  
generate(model, 'Y')
generate(model, 'Z')

Ble                           
Cor                           
Dest                          
En                            
Fate                          
Int                           
Kation                        
Le                            
Ne                            
Or                            
Sate                          
Yer                           
Zate                          


In [17]:
# 64
generate(model, 'B')  
generate(model, 'C')  
generate(model, 'D')  
generate(model, 'E')  
generate(model, 'F')  
generate(model, 'I')  
generate(model, 'K')  
generate(model, 'L')  
generate(model, 'N')  
generate(model, 'O')  
generate(model, 'S')  
generate(model, 'Y')
generate(model, 'Z')

Bere                          
Creent                        
Deet                          
Ere                           
Fere                          
Inferte                       
Kreen                         
Lest                          
Nere                          
Ore                           
Sere                          
Yer                           
Zer                           


In [199]:
# 16
generate(model, 'B')  
generate(model, 'C')  
generate(model, 'D')  
generate(model, 'E')  
generate(model, 'F')  
generate(model, 'I')  
generate(model, 'K')  
generate(model, 'L')  
generate(model, 'N')  
generate(model, 'O')  
generate(model, 'S')  
generate(model, 'Y')
generate(model, 'Z')

Ble                           
Corte                         
Death                         
Exer                          
Fate                          
In                            
Kan                           
Len                           
Nest                          
Or                            
Son                           
Yor                           
Zon                           


In [28]:
# 256
generate(model, 'B')  
generate(model, 'C')  
generate(model, 'D')  
generate(model, 'E')  
generate(model, 'F')  
generate(model, 'I')  
generate(model, 'K')  
generate(model, 'L')  
generate(model, 'N')  
generate(model, 'O')  
generate(model, 'S')  
generate(model, 'Y')
generate(model, 'Z')

Bear                          
Core                          
Dead                          
Ener                          
Face                          
Inferna                       
Karna                         
Lest                          
Near                          
Ober                          
Sarna                         
Yorie                         
Zorn                          


In [29]:
torch.save(model, "rnn1.pt")

In [80]:
np.exp(3.287)

26.762455737114973

In [28]:
print(model.rnn.mode)
print(model.rnn.hidden_size)
print(model.rnn.input_size)
print(model.rnn.bidirectional)

GRU
64
16
False


In [6]:
# Формирование итератора батчей

# Выравнивает строки
def neq(name, seq_len):
    n_len = len(name)
    if n_len > seq_len:
        name = name[0:seq_len]
    elif n_len < seq_len:
        name += ' ' * (seq_len - n_len)
    return name     

# Выделяет +1 символ
def y_char(name, end='<EOS>'):
    out = ''
    for i, char in enumerate(name):
        a = end
        if i < len(name) - 1:
            a = name[i+1]
        out = out + str(a)
    return out

def iter_names(arr, batch_size, seq_len, vocab_size):
    '''Create a generator that returns batches of size
       batch_size х seq_len x vocab_size from _shuffled_ list arr.
    '''
# Перемешивание списка перед отбрасыванием последнего батча    
    arr = shuffle(arr)
    num_batches = len(arr) // batch_size
    i = -1
    for _ in range(num_batches):
        for n in range(batch_size):
            i += 1
# Выравнивание        
            name = neq(arr[i], seq_len)
# Кодирование
            encoded = np.array([char2int[char] for char in name])
            encoded = np.expand_dims(encoded, axis=0)            
            y_enc = np.array([char2int[char] for char in y_char(name, ' ')])
            y_enc = np.expand_dims(y_enc, axis=0)            
        
            if n == 0:
                batch = encoded
                y = y_enc
            else:
                batch = np.vstack([batch, encoded])
                y = np.vstack([y, y_enc])                

        x_train = torch.from_numpy(batch).type(torch.LongTensor)
        y_train = torch.from_numpy(y).type(torch.LongTensor)
        
        yield x_train, y_train
    
# Тестирование

print('neq=', neq('123456', 5), '=')
print('next_=', y_char('123456'), '=')
print('next_=', y_char('123456', ' '), '=')

iterator = iter_names(bc, batch_size, seq_len, vocab_size)

for i, batch in enumerate(iterator):
    print(i)
    print(batch)
    if i > 1:
        break

neq= 12345 =
next_= 23456<EOS> =
next_= 23456  =


NameError: name 'batch_size' is not defined

In [33]:
name = '123456'
for i, char in enumerate(name):
    a = '<EOS>'
    if i < len(name) - 1:
        a = name[i+1]
    print(i, char, name[i], a)

0 1 1 2
1 2 2 3
2 3 3 4
3 4 4 5
4 5 5 6
5 6 6 <EOS>


In [None]:
    for n in range(0, arr.shape[1], n_characters):
        # The features
        x = arr[:, n:n+n_characters]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+n_characters]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [19]:
rnn = nn.GRU(10, 20, 1)
input = torch.randn(5, 3, 10)
output = rnn(input)

In [22]:
output[0].shape

torch.Size([5, 3, 20])

In [18]:
input

tensor([[[-1.6337, -0.1962, -0.9395,  2.0448,  0.6950, -1.4131,  0.7850,
          -0.6797,  0.4859,  1.9454],
         [-1.3249,  0.4864,  2.5068, -1.1342,  1.0313,  0.4100,  0.4534,
           0.8884,  1.4118, -0.1342],
         [ 0.7127, -0.8053,  0.3178, -1.2541,  0.2893, -0.6870,  0.3931,
           0.0118, -0.2734,  0.3824]],

        [[-0.5266, -0.4211, -0.6652, -0.8173,  0.9448,  0.5364, -0.5564,
           2.2261, -0.7277, -1.4443],
         [-0.0916,  0.6066, -0.5304, -0.9550, -1.5262, -1.1030,  1.4552,
          -0.2243,  0.2030, -0.2679],
         [-1.4104, -0.1019,  0.4064, -0.4814, -1.6543,  0.8848, -0.5563,
           0.5205, -0.1476, -2.2875]],

        [[-0.4309,  0.5035, -1.3085,  0.2668, -1.0268, -2.1498,  0.2492,
          -1.4792,  1.1589,  0.5369],
         [-0.4977,  0.3572,  1.5036, -0.9054, -0.5078,  0.5203,  0.0280,
          -0.8152, -0.1462,  0.7483],
         [ 0.1124, -0.5138,  0.3559,  0.7478, -1.5586,  0.4941,  1.0005,
           1.8800,  0.5192, -0.7816

In [25]:
for n in range(10,60):
    q = 1.0
    for k in range(n):
        q = q * (365 - k) / 365
    p = (1 -q) * 100
    print(n, p)

10 11.69481777110779
11 14.114137832173334
12 16.70247888380647
13 19.441027523242983
14 22.310251200497344
15 25.29013197636867
16 28.360400525285023
17 31.500766529656087
18 34.69114178717895
19 37.91185260315368
20 41.14383835805802
21 44.36883351652059
22 47.56953076625502
23 50.72972343239856
24 53.83442579145289
25 56.8699703969464
26 59.8240820135939
27 62.68592822632421
28 65.44614723423994
29 68.0968537477777
30 70.63162427192687
31 73.04546337286439
32 75.33475278503207
33 77.4971854175772
34 79.53168646201543
35 81.43832388747153
36 83.21821063798795
37 84.87340082163846
38 86.40678210821208
39 87.8219664366722
40 89.1231809817949
41 90.31516114817354
42 91.40304715618693
43 92.39228556561199
44 93.28853685514264
45 94.09758994657749
46 94.82528433672547
47 95.47744028332993
48 96.05979728794225
49 96.57796093226764
50 97.03735795779885
51 97.44319933344283
52 97.80045093342753
53 98.11381134839128
54 98.38769627588515
55 98.62622888164461
56 98.83323548852007
57 99.01224593