In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, sampler
from torch.optim.lr_scheduler import ReduceLROnPlateau

import numpy as np
import pandas as pd

from re import sub
import string

device = torch.device('cuda:1')

# Character level RNN for text generation

I tried to make a net which generates posts in style of the vk public page Чё (https://vk.com/21jqofa)

The main ideas were taken from udacity/deep-learning-v2-pytorch course. The link to the notebook https://github.com/udacity/deep-learning-v2-pytorch/blob/master/recurrent-neural-networks/char-rnn/Character_Level_RNN_Solution.ipynb

Posts from Чё were grabbed with API of VK.

In [2]:
df = pd.read_csv('che_posts.csv')
df.head()

Unnamed: 0,id,from_id,to_id,date,marked_as_ads,post_type,text
0,4290712,-53845179,-53845179,1548585660,0,post,решил выделить время на учёбу.
1,4297228,-53845179,-53845179,1548680809,0,post,"заходят как-то в бар ъеъ, ьеь и ъяь, а бармен ..."
2,4297040,-53845179,-53845179,1548677100,1,post,НЕ УСПЕЛИ ПОСТУПИТЬ В ВУЗ?<br><br>В нашем инст...
3,4297014,-53845179,-53845179,1548676836,0,post,"мы встроили тебе кота в кота, чтобы ты мог гла..."
4,4296875,-53845179,-53845179,1548675032,0,post,"ехали медведи на велосипеде,<br>а за ними ксюх..."


In [3]:
df = df[df['marked_as_ads'] != 1]
df['text'] = df['text'].apply(lambda x: str(x))
texts = df['text'].tolist()
text = '\n\n\n'.join(texts)
text = sub('[<br>]+', '\n', text)
text = sub(r'[^а-яА-Яё\d\s{}]'.format(string.punctuation), '', text)
print(text[:500])

решил выделить время на учёбу.


заходят как-то в бар ъеъ, ьеь и ъяь, а бармен им и говорит:


мы встроили тебе кота в кота, чтобы ты мог гладить кота, пока гладишь кота.


ехали медведи на велосипеде,
а за ними ксюха ехала кукухой.


забирай себе, если ты один из:
1. никто.
2. ничто.
3. не существуешь.
4. тебя нет.
5. никита.
никто не узнает, кто ты именно, потому что всем плевать на тебя.


стадии эволюции насти: 
1. настя. 
2. настюха. 
3. анастасия.
4. ананастасия. 
5. настойка боярышника.




Created class for handling text data by implementation of PyTorch Dataset class

In [4]:
class TextDataset(Dataset):
    
    def __init__(self, text, sequence_length):
        self.text = text
        self.vocab = set(self.text)
        self.char2int = {c: i for i, c in enumerate(self.vocab)}
        self.int2char = {i: c for c, i in self.char2int.items()}
        self.sequence_length = sequence_length
        
    def __len__(self):
        return (len(self.text) - self.sequence_length - 1) // self.sequence_length
    
    def __getitem__(self, indx):
        indx = indx * self.sequence_length
        x = self._to_one_hot(text[indx: indx + self.sequence_length], self.char2int)
        x = torch.from_numpy(x)
        y = [self.char2int[c] for c in text[indx + 1: indx + self.sequence_length + 1]]
        y = torch.tensor(y, dtype=torch.long)
        return (x, y)
        
    def _to_one_hot(self, string, char2int):
        one_hot = np.zeros((len(string), len(self.vocab)), dtype=np.float32)
        for i, c in enumerate(string):
            one_hot[i, char2int[c]] = 1.
        return one_hot

In [5]:
NUM_TRAIN = int(len(text) * 0.9)
BATCH_SIZE = 128
SEQ_LEN = 200

train_dataset = TextDataset(text, SEQ_LEN)
val_dataset = TextDataset(text, SEQ_LEN)
assert len(train_dataset.vocab) == len(val_dataset.vocab)

train_dataset.text = text[:NUM_TRAIN]
val_dataset.text = text[NUM_TRAIN:]

In [6]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=4)

In [7]:
# for x, y in train_loader:
#     print(x.size(), y.size())

Small lstm model

In [8]:
class CharRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size,
                 seq_len, lstm_layers, drop_rate):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm_layers = lstm_layers
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=lstm_layers,
                           dropout=drop_rate, batch_first=True)
        self.dropout = nn.Dropout(drop_rate)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, inputs, hidden):
        lstm_out, hidden = self.lstm(inputs, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_size)
        lstm_out = self.dropout(lstm_out)
        out = self.fc(lstm_out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        return (torch.zeros(self.lstm_layers, batch_size, self.hidden_size, dtype=torch.float32),
                torch.zeros(self.lstm_layers, batch_size, self.hidden_size, dtype=torch.float32))

Assuming that model works

In [9]:
model_params = {
    'input_size': len(train_dataset.vocab),
    'hidden_size': 128,
    'output_size': len(train_dataset.vocab),
    'seq_len': SEQ_LEN,
    'lstm_layers': 2,
    'drop_rate': 0,
}
model = CharRNN(**model_params)
print(model)
h = model.init_hidden(BATCH_SIZE)

for x, y in train_loader:
    model(x, h)
    break

CharRNN(
  (lstm): LSTM(110, 128, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0)
  (fc): Linear(in_features=128, out_features=110, bias=True)
)


In [10]:
def check_val_accuracy(model, val_loader, criterion):
    losses = []
    h = model.init_hidden(batch_size=val_loader.batch_size)

    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            h = tuple([i.data for i in h])
            h = (h[0].to(device), h[1].to(device))
            
            if x.size()[0] != val_loader.batch_size:
                continue
            
            pred, h = model(x, h)
            
            loss = criterion(pred, y.view(-1))
            losses.append(loss.item())
    return np.mean(losses)

In [11]:
def train(model, train_loader, epochs, criterion, optimizer, val_loader=None, reduce_sch=None):
    model.train()
    h = model.init_hidden(batch_size=train_loader.batch_size)
    
    for epoch in range(epochs):
        train_losses = []
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            if x.size()[0] != train_loader.batch_size:
                continue
            h = ([i.data.to(device) for i in h])

            model.zero_grad()
            pred, h = model(x, h)
            loss = criterion(pred, y.view(-1))
            loss.backward()
            optimizer.step()
            
            train_losses.append(loss.item())
        avg_train_loss = np.mean(train_losses)
        if val_loader:
            avg_val_loss = check_val_accuracy(model, val_loader, criterion)
            if reduce_sch:
                reduce_sch.step(avg_val_loss)
            print('Epoch {}, train loss {:.5f}, val loss {:.5f}'.format(
                    epoch + 1, avg_train_loss, avg_val_loss))
        else:
            print('Epoch {}, train loss {:.5f}'.format(
                    epoch + 1, avg_train_loss))

In [12]:
model_params = {
    'input_size': len(train_dataset.vocab),
    'hidden_size': 256,
    'output_size': len(train_dataset.vocab),
    'seq_len': SEQ_LEN,
    'lstm_layers': 2,
    'drop_rate': 0.3,
}
epochs = 60

model = CharRNN(**model_params)
model.to(device)
criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optim, factor=0.5, patience=5, verbose=1)

In [13]:
train(model, train_loader, epochs, criterion, optim, val_loader, scheduler)

Epoch 1, train loss 3.39923, val loss 3.37938
Epoch 2, train loss 3.11769, val loss 2.96956
Epoch 3, train loss 2.80821, val loss 2.82081
Epoch 4, train loss 2.69381, val loss 2.69505
Epoch 5, train loss 2.59977, val loss 2.61458
Epoch 6, train loss 2.53313, val loss 2.55107
Epoch 7, train loss 2.47714, val loss 2.49522
Epoch 8, train loss 2.42682, val loss 2.44397
Epoch 9, train loss 2.38073, val loss 2.39907
Epoch 10, train loss 2.34166, val loss 2.35679
Epoch 11, train loss 2.29977, val loss 2.31792
Epoch 12, train loss 2.26048, val loss 2.27670
Epoch 13, train loss 2.22278, val loss 2.23638
Epoch 14, train loss 2.18552, val loss 2.20160
Epoch 15, train loss 2.15287, val loss 2.17668
Epoch 16, train loss 2.13412, val loss 2.14477
Epoch 17, train loss 2.09595, val loss 2.11012
Epoch 18, train loss 2.06715, val loss 2.08304
Epoch 19, train loss 2.04201, val loss 2.06168
Epoch 20, train loss 2.01785, val loss 2.03632
Epoch 21, train loss 1.99475, val loss 2.01444
Epoch 22, train loss 1

In [14]:
model_name = 'char_rnn_сру_60_epochs.tar'

torch.save({
    'epoch': 60,
    'model_state_dict': model.state_dict(),
    'model_params': model_params,
    'optimizer_state_dict': optim.state_dict()
}, model_name)

Predict one symbol from the current character and hidden state. Next char being got randomly from the top_k most probable symbols according to their probabilities (top_k simbols have different probabilities to be chosen)  

In [15]:
def predict(model, char, h, top_k):
    x = train_dataset._to_one_hot(char, train_dataset.char2int)
    x = torch.from_numpy(np.array([x])).to(device)
    h = ([i.data.to(device) for i in h])
    
    with torch.no_grad():
        pred, h = model(x, h)
        probs = F.softmax(pred, dim=1).data
        probs.cpu()
        top = probs.topk(top_k)
        
        probs = top[0].cpu().detach().numpy().squeeze()
        top_ch = top[1].cpu().numpy().squeeze()
        next_char = np.random.choice(top_ch, p=probs/probs.sum())
    
    return train_dataset.int2char[next_char], h

Predict symbols one by one. The initial hidden state is computed from characters given from a prime string

In [16]:
def sample(model, size, prime_string, top_k):
    model.eval()
    
    chars = [i for i in prime_string]
    
    h = model.init_hidden(1)
    for ch in prime_string:
        char, h = predict(model, ch, h, top_k)
        
    chars.append(char)
    
    for _ in range(size):
        char, h = predict(model, chars[-1], h, top_k)
        chars.append(char)
    
    print(''.join(chars))

In [17]:
for i in range(10):
    sample(model, 300, 'Одним зимнем холодным вечером', 5)
    print('-'*30)

Одним зимнем холодным вечером великой дела. пока солнце, не мой выбраться в портуп, а продавило пароводии перед девчумстом, которые обратные подродение, но переломали немного весельные стина про свой на сексу, подошёл просил испалик, что я пришёлся в собственному свете не в подушке, а не смотрела на коренно пришли на карманах.


------------------------------
Одним зимнем холодным вечером и весёлые мотоциобно в мире с костюм.


восточные водолей, а подушки в меня и в совершенно облаки на машенки начинать своего сложности, а нужно пытаются выправить себя приделом и спецердательных картов с проблемой с красная и скресим.


а спонсор этого дня  стал свои стольком и просимой свое встать.
------------------------------
Одним зимнем холодным вечером и подарить своего меня на столовой прославной какую предлезую стоит в просности представляющей красотом на волосину проступатетей.


а спонсор этого вечера  сосисочка видят невернулась.
парень в подверном. сосудик с пакетов по кленату.


первый с