# Dracula

### Проект по генерации текста в стиле произведения Брэма Стокера "Дракула".



In [1]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

## Загрузка данных



In [2]:
!wget https://raw.githubusercontent.com/BratkovskyEvgeny/text_generation/main/dracula.txt

--2024-03-01 19:42:07--  https://raw.githubusercontent.com/BratkovskyEvgeny/text_generation/main/dracula.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874627 (854K) [text/plain]
Saving to: ‘dracula.txt’


2024-03-01 19:42:07 (18.1 MB/s) - ‘dracula.txt’ saved [874627/874627]



In [3]:
# считывание данных
with open("dracula.txt", "r") as f:
    text = f.read()

In [4]:
# первые 100 символов
text[:100]

'The Project Gutenberg EBook of Dracula, by Bram Stoker\n\nThis eBook is for the use of anyone anywhere'

### Токенизация


In [5]:
# преобразования символов
chars = tuple(set(text))

int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# перекодирование
encoded = np.array([char2int[ch] for ch in text])

Посмотрим как символы закодировались целыми числами

In [6]:
# проверка кодирования символов целыми числами
encoded[:100]

array([ 3, 81, 30, 26, 46, 40, 17, 32, 30, 13, 51, 26, 63, 47, 51, 30, 45,
       72, 30, 40,  2, 26, 71, 36, 17, 17, 23, 26, 17,  8, 26, 58, 40, 33,
       13, 47, 21, 33, 29, 26, 72,  5, 26, 36, 40, 33,  1, 26, 50, 51, 17,
       23, 30, 40, 61, 61,  3, 81,  9, 49, 26, 30, 36, 17, 17, 23, 26,  9,
       49, 26,  8, 17, 40, 26, 51, 81, 30, 26, 47, 49, 30, 26, 17,  8, 26,
       33, 45,  5, 17, 45, 30, 26, 33, 45,  5, 20, 81, 30, 40, 30])

## Препроцессинг

Для реализации проекта используется "char-RNN". Все символы преобразованы посредством использования OHE.

In [8]:
def one_hot_encode(arr, n_labels):

    # инициализация массива
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)

    # заполнение единицами
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.0

    one_hot = one_hot.reshape((*arr.shape, n_labels))

    return one_hot

In [9]:
# проверка функции, что всё работает, как ожидалось
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


## Мини-батчи



In [10]:
def get_batches(int_words, batch_size, seq_length):

    # усечение текста
    window_size = seq_length + 1
    batch_size_total = batch_size * window_size
    n_batches = len(int_words) // batch_size_total
    int_words = int_words[: n_batches * batch_size_total]

    # переформирование батчей
    int_words = int_words.reshape((batch_size, -1))

    # проход по матрице
    for position in range(0, int_words.shape[1], window_size):
        x = int_words[:, position : position + window_size - 1]
        y = int_words[:, position + 1 : position + window_size]
        yield x, y

In [11]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [12]:
# вывод первых 10 элементов
print("x\n", x[:10, :10])
print("\ny\n", y[:10, :10])

x
 [[ 3 81 30 26 46 40 17 32 30 13]
 [ 2 40 30 33 51 26 81 30 33  4]
 [45 26 13 33 40 30 26 51 81 33]
 [19 17 45 30 78 68 61 61 25  9]
 [ 1  4  9 40 30 10 26 26  3 81]
 [51 26 81 30 26  9 49 26 45 17]
 [49 81 30 26  2 40 30 20 26  1]
 [19 33  1 26 41  9 45 33 26 33]]

y
 [[81 30 26 46 40 17 32 30 13 51]
 [40 30 33 51 26 81 30 33  4 26]
 [26 13 33 40 30 26 51 81 33 51]
 [17 45 30 78 68 61 61 25  9 51]
 [ 4  9 40 30 10 26 26  3 81 30]
 [26 81 30 26  9 49 26 45 17 26]
 [81 30 26  2 40 30 20 26  1 17]
 [33  1 26 41  9 45 33 26 33 45]]



## Архитектура


<img src="https://github.com/udacity/deep-learning-v2-pytorch/blob/master/recurrent-neural-networks/char-rnn/assets/charRNN.png?raw=1" width=500px>

### Структура модели

В `__init__` предлагаемая структура выглядит следующим образом:
* Создаются и хранятся необходимые словари
* определяется слой LSTM, который принимает в качестве параметров: размер ввода (количество символов), размер скрытого слоя `n_hidden`, количество слоев` n_layers`, вероятность drop-out'а `drop_prob` и логическое значение batch_first (True)
* Определяется слой drop-out с помощью "drop_prob"
* Определяется полносвязанный слой с параметрами: размер ввода `n_hidden` и размер выхода - количество символов
* инициализируются веса.


### LSTM



In [13]:
# проверка на наличие GPU
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    print("Training on GPU!")
else:
    print("No GPU available, training on CPU; consider making n_epochs very small.")

Training on GPU!


In [14]:
class CharRNN(nn.Module):
    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # создание словарей символов
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        # инициализация LSTM
        self.lstm = nn.LSTM(
            len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True
        )

        # инициализация дропаута
        self.dropout = nn.Dropout(drop_prob)

        # инициализация слоя
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):

        # получение выхода и нового скрытого состояние из lstm
        r_output, hidden = self.lstm(x, hidden)

        out = self.dropout(r_output)

        out = out.contiguous().view(-1, self.n_hidden)

        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):

        weight = next(self.parameters()).data

        if train_on_gpu:
            hidden = (
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
            )
        else:
            hidden = (
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
            )

        return hidden

## Обучение модели


In [15]:
def train(
    net,
    data,
    epochs=10,
    batch_size=10,
    seq_length=50,
    lr=0.001,
    clip=5,
    val_frac=0.1,
    print_every=10,
):
    """
    net: CharRNN network
    data: text data to train the network
    epochs: Number of epochs to train
    batch_size: Number of mini-sequences per mini-batch, aka batch size
    seq_length: Number of character steps per mini-batch
    lr: learning rate
    clip: gradient clipping
    val_frac: Fraction of data to hold out for validation
    print_every: Number of steps for printing training and validation loss

    """
    net.train()

    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # создание тренировочных, и валидационных данных
    val_idx = int(len(data) * (1 - val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    if train_on_gpu:
        net.cuda()

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # инициализация скрытого состояния
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            # OHE
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            h = tuple([each.data for each in h])

            net.zero_grad()

            output, h = net(inputs, h)

            # вычисление лосса
            loss = criterion(output, targets.view(batch_size * seq_length).long())
            loss.backward()

            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            if counter % print_every == 0:
                # валидационный лосс
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):

                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)

                    val_h = tuple([each.data for each in val_h])

                    inputs, targets = x, y
                    if train_on_gpu:
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(
                        output, targets.view(batch_size * seq_length).long()
                    )

                    val_losses.append(val_loss.item())

                net.train()

                print(
                    "Epoch: {}/{}...".format(e + 1, epochs),
                    "Step: {}...".format(counter),
                    "Loss: {:.4f}...".format(loss.item()),
                    "Val Loss: {:.4f}".format(np.mean(val_losses)),
                )

In [16]:
# гиперпараметры сетки
n_hidden = 512
n_layers = 2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(85, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)


In [17]:
# гиперпараметры
batch_size = 128
seq_length = 100
n_epochs = 20

# обучение
train(
    net,
    encoded,
    epochs=n_epochs,
    batch_size=batch_size,
    seq_length=seq_length,
    lr=0.001,
    print_every=10,
)

Epoch: 1/20... Step: 10... Loss: 3.2368... Val Loss: 3.1687
Epoch: 1/20... Step: 20... Loss: 3.1537... Val Loss: 3.1145
Epoch: 1/20... Step: 30... Loss: 3.0932... Val Loss: 3.1015
Epoch: 1/20... Step: 40... Loss: 3.1144... Val Loss: 3.1004
Epoch: 1/20... Step: 50... Loss: 3.0997... Val Loss: 3.0996
Epoch: 2/20... Step: 60... Loss: 3.0783... Val Loss: 3.0985
Epoch: 2/20... Step: 70... Loss: 3.0700... Val Loss: 3.0945
Epoch: 2/20... Step: 80... Loss: 3.0955... Val Loss: 3.0925
Epoch: 2/20... Step: 90... Loss: 3.0649... Val Loss: 3.0869
Epoch: 2/20... Step: 100... Loss: 3.0636... Val Loss: 3.0769
Epoch: 2/20... Step: 110... Loss: 3.0361... Val Loss: 3.0542
Epoch: 3/20... Step: 120... Loss: 2.9735... Val Loss: 3.0011
Epoch: 3/20... Step: 130... Loss: 2.8966... Val Loss: 2.9050
Epoch: 3/20... Step: 140... Loss: 2.8569... Val Loss: 2.8722
Epoch: 3/20... Step: 150... Loss: 2.7618... Val Loss: 2.7712
Epoch: 3/20... Step: 160... Loss: 2.6897... Val Loss: 2.6679
Epoch: 3/20... Step: 170... Loss:

In [18]:
# сохранение модели (понадобится при деплое в веб-сервис)
model_name = "dracula.net"

checkpoint = {
    "n_hidden": net.n_hidden,
    "n_layers": net.n_layers,
    "state_dict": net.state_dict(),
    "tokens": net.chars,
}

with open(model_name, "wb") as f:
    torch.save(checkpoint, f)


## Генерация текста

Для предсказания передается последний символ и сеть предсказывает следующий символ, который потом передается снова на вход и получается еще 1предсказанный символ и так далее.


In [19]:
def predict(net, char, h=None, top_k=None):

    # входы тензора
    x = np.array([[net.char2int[char]]])
    x = one_hot_encode(x, len(net.chars))
    inputs = torch.from_numpy(x)

    if train_on_gpu:
        inputs = inputs.cuda()

    h = tuple([each.data for each in h])
    out, h = net(inputs, h)

    p = F.softmax(out, dim=1).data
    if train_on_gpu:
        p = p.cpu()

    # получение топ-символов
    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().squeeze()

    # выбор вероятного следующего символа с некоторым элементом случайности
    p = p.numpy().squeeze()
    char = np.random.choice(top_ch, p=p / p.sum())

    # возвращает закодированное значение предсказанного символа и скрытого состояния
    return net.int2char[char], h

In [20]:
def sample(net, size, prime="The", top_k=None):

    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()

    net.eval()

    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)

    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return "".join(chars)

In [22]:
print(sample(net, 1000, prime="Madam Mina", top_k=10))

Madam Mina
Harking.  I knew when we think, and then he began sould with a crasce, besteer and must way no sounself.  What is hard feart over off to the deep or an'cles of don't time to close, as hilong sturd of aclithen, when I how was, it spoke though mastle opared that wonlight he had dear away be one which he whilether may not sortinct to the room and said free the morsing man forthelf interest, that
too his fine of othis big to me, is not inlend
of this tried.

When I
hose todingle the dogathings trathing shatiess in there.  I closed it, friend Got linest be one of the rose were spolan.  And the frees over
and fleck about many sunget of seems, but he must ge sleavl at
learng of cartitute.

"I child is nom to the deary take it a pluc sear, myself is think is not
seemed to my dyard, with much that much said waser with ut face and feel of cloted, and
seemed to a play the larghor.  Any he must
thought be writele than the
concect and setended
siling of the sither cire.

My forin' found 

Выше показан пример генерации текста в стиле произведения Брэма Стокера "Дракула". В целом, качество генерации удовлетворительное.



---

