# Описание работы

## Результаты
 * Сеть построена и обучена, выполнены все другие условия.
 * Для генерации выбрана длина последовательности - 10 (также использовались 20, 30) и начальный токен - заглавная буква. Примеры в 629.
 * Построенная отдельно вручную т-гр. модель дает значение перплексии 9.3 для 2-гр, 9.1 для 3-гр, 6.1 для 4-гр. Поэтому результаты текущей модели как 7.4 можно считать неплохими.

## Постановка задачи

Assignment 7.
Delelop language model, which generates death metal band names.
You can get data from https://www.kaggle.com/zhangjuefei/death-metal.
You are free to use any other data, but the most easy way is just to take the band name column. 
Your language model should be char-based autogression RNN.
Text generation should be terminated when either max length is reached or terminal symbol is generated.

Different band names can be generated by:
 - init $h_0$ as random vector from some probabilty distribution.
 - sampling over tokens at each timestep with probability = softmax

Calculate perplexity for your model = your objective quality metric.
Also, sample 10 band names from your model for subjective evaluation. E.g. names like 'qwiouefiou23riop2h3' or 'death death death!' are bad examples.

In [3]:
import numpy as np
import pandas as pd
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

import time
import string
import random

import logging
logging.basicConfig(filename="pt.log", level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pylab import rcParams
rcParams['figure.figsize'] = 10, 8

import warnings
warnings.filterwarnings("ignore")


import unicodedata
from sklearn.utils import shuffle

In [2]:
df = pd.read_csv('bands.csv')
dn = shuffle(df['name'])
dn.to_csv('names.csv', index=False)
bb = dn.tolist()
dn.shape

(37723,)

In [None]:
# Удалены русские, китайские и другие не ascii имена - всего 1.8%
i = 0
k = 0
bc = []
s = set()
max = 0
mm = []
for b in bb:
    i += 1
    benc = b.encode('ascii', 'ignore').decode('utf8')
    if benc != b:
        k += 1
        #print(b, '=', benc)    
        continue
    bc.append(b)
    s |= set(b)
    lena = len(b)
    mm.append(lena)
    if lena > max:
        max = lena
print(i, k, len(bb), len(bc), len(s), max, k*100/i)
print(max, np.mean(mm), np.std(mm))

# Подготовка
characters = tuple(s)
int2char = dict(enumerate(characters))
char2int = {char: index for index, char in int2char.items()}
vocab_size = len(char2int)
print(vocab_size)

train_size = int(len(bc) * 0.8)
print(train_size)
b_train = bc[:train_size]
b_test = bc[train_size:]
print(len(b_train), len(b_test))

In [241]:
class CharRNN(nn.Module):
    def __init__(self, seq_len, vocab_size, embed_size, hidden_size, batch_size, n_layers=1):
        super().__init__()
        self.seq_len = seq_len        
        self.vocab_size = vocab_size        
        self.embed_size = embed_size        
        self.hidden_size = hidden_size
        self.batch_size = batch_size        
        self.n_layers = n_layers

        self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        self.rnn = nn.GRU(input_size=self.embed_size, hidden_size=self.hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=self.hidden_size, out_features=self.vocab_size)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden
    
    def init_hidden(self):        
        return torch.zeros(self.n_layers, self.batch_size, self.hidden_size)

In [629]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    model.train()
    for x_train, y_train in iterator:
        optimizer.zero_grad()
        hidden = model.init_hidden()
        output, hidden = model(x_train, hidden)
        out = output.contiguous().view(batch_size * seq_len, vocab_size)
        y = y_train.contiguous().view(batch_size * seq_len)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / i

def perplexity(model, iterator):
    model.eval()    
    pp = 0
    MIN = 1e-15
    full = np.full((N, vocab_size), MIN)
    softmax = nn.Softmax(dim=2)
    for x_test, y_test in iterator:
        hidden = model.init_hidden()
        output, hidden = model(x_test, hidden)
        output = softmax(output)
        x = output.detach().numpy()
        ind = y_test.numpy()
        ppb = 0
        for b in range(batch_size):
            for s in range(seq_len):
                k = ind[b,s]
                prob = x[b,s,k]
                ppb += prob * np.log(prob) 
        ppb = ppb / batch_size
        pp += ppb
    pp = np.exp(-1 * pp / i)
    return pp
    
def generate(model, prime_str='D'):
    pred = prime_str
    for i in range(1, seq_len):
        name = neq(pred, seq_len) 
        x = np.array([char2int[char] for char in name])
        x = np.expand_dims(x, axis=0)
        x = torch.from_numpy(x).type(torch.LongTensor)
        hidden = torch.zeros(1, 1, hidden_size)
        y, hidden = model(x, hidden)
        preds = y[0, i - 1].detach().numpy()
        exp_preds = np.exp(preds)
        preds = exp_preds / (np.sum(exp_preds) * 1.00001)
        probas = np.random.multinomial(1, preds, 1)
        ind = np.argmax(probas)
        char = int2char[ind]
        pred = pred + char
    print(pred)
    

generate(model, 'D')    
generate(model, 'D')    
generate(model, 'S')    
generate(model, 'A')    
generate(model, 'C')    
generate(model, 'M')    
generate(model, 'T')    
generate(model, 'B')    
generate(model, 'E')    
generate(model, 'I')    
generate(model, 'P')    
generate(model, 'N')    

test_iterator = iter_names(b_test, batch_size, seq_len, vocab_size)
perplexity(model, test_iterator)


Diggod    
Dramial Pl
Senton Mol
Abril Crah
Claore    
Mentrechza
The Swumen
Blessmata 
Evil Prive
Insanic De
Panistiece
Necrobes  


7.463027735470693

In [249]:
seq_len = 10
batch_size = 16
embed_size = 16
vocab_size = len(char2int)
hidden_size = 64
lr = 0.001

model = CharRNN(seq_len=seq_len, vocab_size=vocab_size, 
                embed_size=embed_size, hidden_size=hidden_size, batch_size=batch_size)                
                
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [250]:
start = time.time()
n_epochs = 20
print_every = 1

for epoch in range(n_epochs + 1):
    old_loss = loss
    train_iterator = iter_names(b_train, batch_size, seq_len, vocab_size)
    loss = train(model, train_iterator, optimizer, criterion)
    test_iterator = iter_names(b_test, batch_size, seq_len, vocab_size)
    pp = perplexity(model, test_iterator)
    if epoch % print_every == 0:
        elapsed = time.time() - start
        print(f'epoch={epoch} | elapsed={elapsed:.1f} | Loss: {loss:.3f} | Perplexity: {pp:.3f} ')  
        start = time.time()
        torch.save(model, "rnn1.pt")
    if np.abs(loss - old_loss) < 0.001:
        break

epoch=0 | elapsed=17.2 | Loss: 2.248 | Perplexity: 8.476 
epoch=1 | elapsed=16.9 | Loss: 1.948 | Perplexity: 7.923 
epoch=2 | elapsed=16.9 | Loss: 1.865 | Perplexity: 7.971 
epoch=3 | elapsed=16.9 | Loss: 1.816 | Perplexity: 7.704 
epoch=4 | elapsed=17.0 | Loss: 1.781 | Perplexity: 7.943 
epoch=5 | elapsed=17.5 | Loss: 1.754 | Perplexity: 7.734 
epoch=6 | elapsed=17.3 | Loss: 1.733 | Perplexity: 7.728 
epoch=7 | elapsed=17.4 | Loss: 1.715 | Perplexity: 7.310 
epoch=8 | elapsed=16.9 | Loss: 1.701 | Perplexity: 7.676 
epoch=9 | elapsed=17.3 | Loss: 1.689 | Perplexity: 7.533 
epoch=10 | elapsed=17.0 | Loss: 1.678 | Perplexity: 7.402 
epoch=11 | elapsed=16.8 | Loss: 1.669 | Perplexity: 7.629 
epoch=12 | elapsed=17.0 | Loss: 1.660 | Perplexity: 7.547 
epoch=13 | elapsed=17.1 | Loss: 1.653 | Perplexity: 7.579 
epoch=14 | elapsed=18.0 | Loss: 1.646 | Perplexity: 7.356 
epoch=15 | elapsed=17.5 | Loss: 1.641 | Perplexity: 7.282 
epoch=16 | elapsed=16.7 | Loss: 1.634 | Perplexity: 7.357 
epoch=1

In [None]:
# Формирование итератора батчей

# Выравнивает строки
def neq(name, seq_len):
    n_len = len(name)
    if n_len > seq_len:
        name = name[0:seq_len]
    elif n_len < seq_len:
        name += ' ' * (seq_len - n_len)
    return name     

# Выделяет +1 символ
def y_char(name, end='<EOS>'):
    out = ''
    for i, char in enumerate(name):
        a = end
        if i < len(name) - 1:
            a = name[i+1]
        out = out + str(a)
    return out

def iter_names(arr, batch_size, seq_len, vocab_size):
    '''Create a generator that returns batches of size
       batch_size х seq_len x vocab_size from _shuffled_ list arr.
    '''
# Перемешивание списка перед отбрасыванием последнего батча    
    arr = shuffle(arr)
    num_batches = len(arr) // batch_size
    i = -1
    for _ in range(num_batches):
        for n in range(batch_size):
            i += 1
# Выравнивание        
            name = neq(arr[i], seq_len)
# Кодирование
            encoded = np.array([char2int[char] for char in name])
            encoded = np.expand_dims(encoded, axis=0)            
            y_enc = np.array([char2int[char] for char in y_char(name, ' ')])
            y_enc = np.expand_dims(y_enc, axis=0)            
        
            if n == 0:
                batch = encoded
                y = y_enc
            else:
                batch = np.vstack([batch, encoded])
                y = np.vstack([y, y_enc])                

        x_train = torch.from_numpy(batch).type(torch.LongTensor)
        y_train = torch.from_numpy(y).type(torch.LongTensor)
        
        yield x_train, y_train
