# Char-based text generation with LSTM

In [1]:
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [2]:
# Load the whole training corpus into memory as one string.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used — confirm it matches the encoding of the corpus file.
corpus_path = 'corpus_ce.txt'
with open(corpus_path) as f:
    text = f.read()

def text_to_seq(text_sample):
    """Encode a string as an array of integer character ids.

    Ids are handed out by descending character frequency, so the most common
    character receives id 0; characters with equal counts keep the order in
    which they first occur in ``text_sample``.  The resulting vocabulary is
    printed as a side effect.

    Returns:
        A tuple ``(sequence, char_to_idx, idx_to_char)``: the encoded text as
        a NumPy array, followed by the forward and reverse lookup tables.
    """
    # most_common() without an argument lists every symbol, highest count first.
    vocabulary = [symbol for symbol, _ in Counter(text_sample).most_common()]
    print(vocabulary)

    idx_to_char = dict(enumerate(vocabulary))
    char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}
    encoded = np.array([char_to_idx[symbol] for symbol in text_sample])

    return encoded, char_to_idx, idx_to_char
 
# Encode the corpus once; `sequence` feeds get_batch() and the two lookup
# tables are passed to evaluate() when sampling text.
sequence, char_to_idx, idx_to_char = text_to_seq(text)

[' ', 'а', 'о', 'л', 'е', 'р', 'к', 'н', 'х', 'и', 'т', 'ь', 'у', 'с', 'м', 'д', 'г', 'в', 'й', 'б', 'ӏ', 'ш', 'ц', 'п', 'з', 'ч', 'ъ', 'я', 'ф', 'ю', 'ж', 'э', 'c', 'i', 'ы', 't', 'u', 'і', 'n', 'r', 's', 'e', 'h', 'd', 'a', 'ё', 'l', 'm', 'o', 'щ', 'p', 'g', 'b', 'y', 'َ', '\n', 'z', 'k', 'x', 'ِ', 'ا', 'v', 'ُ', 'ْ', 'ب', '\u202b', '\u202c', 'f', 'I', 'و', 'w', 'ل', 'ٌ', '²', 'à', 'j', 'q', 'ه', 'خ', 'ّ', '-', ',', 'أ', 'ك', '_', 'ٍ', 'ػ', 'ث', 'ٓ', 'ز', ']', '[', 'ë', 'ؼ', 'ر', 'ج', 'õ', '(', ')', 'ً', 'ع', 'ٞ', 'ي', 'í', 'ó', 'ؽ', 'ح', 'å', 'ٔ', 'ن', 'غ', 'ف', 'ط', 'ü', '٘', 'ٖ', 'ٕ', 'ؾ', 'ٛ', 'î', 'ى', 'ä', 'ٝ', '،', 'è', 'ٗ', 'ق', 'ð', '?', '٤', 'ت', 'é', 'ø', 'م', 'á', 'ئ', 'ê', 'ò', 'د', 'ö', 'ء', '.', 'ì', 'ة', 'â', 'ã', 'ٜ', 'ç', 'Х', '٣', 'ذ', '؟', 'ؿ', '!', 'ñ', 'ص', 'ә', 'С', 'ـ', '1', 'ظ', 'ٚ', '2', 'ú', '…', 'ٙ', 'ض', '3', 'Д', '4', 'ā', '5', 'ؤ', 'М', '٠', 'Т', 'К', 'ӑ', 'ӗ', '6', 'һ', 'ա', 'α', '٢', '8', 'آ', 'Б', 'س', '0', '7', 'А', 'ა', '9', '³', 'ҫ', 'ο', 'ý', 'Ш'

In [3]:
# Length of every sampled window (inputs and targets are one item shorter).
SEQ_LEN = 256
# Number of windows drawn per batch.
BATCH_SIZE = 16


def get_batch(sequence):
    """Sample a random batch of next-character training pairs.

    ``BATCH_SIZE`` windows of ``SEQ_LEN`` consecutive ids are drawn uniformly
    at random (with replacement) from ``sequence``.  Each window yields an
    input (all ids but the last) and a target (all ids but the first), so the
    target at position ``t`` is the id that follows the input at ``t``.

    Args:
        sequence: 1-D sequence of integer character ids, at least ``SEQ_LEN``
            items long.

    Returns:
        A tuple ``(inputs, targets)`` of ``torch.long`` tensors, each of shape
        ``(BATCH_SIZE, SEQ_LEN - 1, 1)``.

    Raises:
        ValueError: If ``sequence`` holds fewer than ``SEQ_LEN`` items.
    """
    last_start = len(sequence) - SEQ_LEN
    if last_start < 0:
        raise ValueError(
            f'sequence has {len(sequence)} items, need at least {SEQ_LEN}'
        )

    trains = []
    targets = []
    for _ in range(BATCH_SIZE):
        # randint's upper bound is exclusive: the +1 makes the final window
        # reachable and lets a sequence of exactly SEQ_LEN items work.
        batch_start = np.random.randint(0, last_start + 1)
        chunk = torch.as_tensor(
            sequence[batch_start: batch_start + SEQ_LEN], dtype=torch.long
        )
        trains.append(chunk[:-1].view(-1, 1))
        targets.append(chunk[1:].view(-1, 1))
    return torch.stack(trains, dim=0), torch.stack(targets, dim=0)

In [4]:
def evaluate(model, char_to_idx, idx_to_char, start_text=' ', prediction_len=200, temp=0.3):
    """Generate text by sampling one character at a time from ``model``.

    The first step feeds the whole ``start_text`` and samples from the logits
    of its last position; every later step feeds only the character that was
    just sampled, carrying the recurrent state forward.  The prompt's last
    character is therefore consumed exactly once.

    The model's train/eval mode is not changed here; callers that want
    dropout disabled should call ``model.eval()`` first.

    Args:
        model: Object providing ``init_hidden()`` (returning a tuple of state
            tensors) and callable as ``model(x, hidden)``, where ``x`` has
            shape ``(steps, 1, 1)`` and holds character ids.  The call must
            return ``(logits, hidden)`` with one row of logits per step.
        char_to_idx: Mapping from character to integer id.
        idx_to_char: Inverse of ``char_to_idx``.
        start_text: Non-empty prompt made of characters in ``char_to_idx``.
        prediction_len: Number of characters to sample.
        temp: Softmax temperature the logits are divided by; lower values
            make sampling greedier.  Must be non-zero.

    Returns:
        ``start_text`` followed by ``prediction_len`` sampled characters.

    Raises:
        ValueError: If ``start_text`` is empty.
        KeyError: If ``start_text`` contains a character that is missing from
            ``char_to_idx``.
    """
    if not start_text:
        raise ValueError('start_text must contain at least one character')

    hidden = model.init_hidden()
    # Put inputs wherever the model placed its state instead of depending on
    # a module-level `device` variable.
    target_device = hidden[0].device
    prompt_ids = [char_to_idx[char] for char in start_text]
    inp = torch.tensor(prompt_ids, dtype=torch.long, device=target_device).view(-1, 1, 1)
    generated = []

    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(prediction_len):
            output, hidden = model(inp, hidden)
            # Only the last position predicts the next character.
            output_logits = output[-1].view(-1).cpu().double()
            p_next = F.softmax(output_logits / temp, dim=-1).numpy()
            top_index = int(np.random.choice(len(char_to_idx), p=p_next))
            generated.append(idx_to_char[top_index])
            inp = torch.tensor([top_index], dtype=torch.long, device=target_device).view(-1, 1, 1)

    return start_text + ''.join(generated)

In [5]:
class TextRNN(nn.Module):
    """Character-level language model: embedding -> LSTM -> dropout -> linear.

    Args:
        input_size: Vocabulary size; used both for the embedding table and
            for the width of the output logits.
        hidden_size: Width of the LSTM hidden and cell states.
        embedding_size: Width of the character embeddings.
        n_layers: Number of stacked LSTM layers.
        dropout_p: Dropout probability applied to the LSTM output.
    """

    def __init__(self, input_size, hidden_size, embedding_size, n_layers=1, dropout_p=0.2):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, self.n_layers)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(self.hidden_size, self.input_size)

    def forward(self, x, hidden):
        """Run the network over a batch of id sequences.

        Args:
            x: Long tensor of character ids.  The LSTM is not batch-first and
                axis 2 is squeezed away after the embedding lookup, so the
                expected layout is ``(seq_len, batch, 1)``.
            hidden: ``(h, c)`` state tuple, e.g. from :meth:`init_hidden`.

        Returns:
            ``(logits, (h, c))`` where ``logits`` has one vocabulary-sized
            row per input position and ``(h, c)`` is the updated state.
        """
        # The embedding appends a feature axis; drop the size-1 axis before it.
        x = self.encoder(x).squeeze(2)
        out, (ht1, ct1) = self.lstm(x, hidden)
        out = self.dropout(out)
        x = self.fc(out)
        return x, (ht1, ct1)

    def init_hidden(self, batch_size=1):
        """Return a zero ``(h, c)`` state for ``batch_size`` sequences.

        The tensors are created on the device and with the dtype of the
        model's own weights, so the method does not depend on any
        module-level ``device`` variable.  No gradient is tracked for the
        initial state: it is a constant, not a trainable parameter.
        """
        weight = self.fc.weight
        shape = (self.n_layers, batch_size, self.hidden_size)
        return (torch.zeros(shape, device=weight.device, dtype=weight.dtype),
                torch.zeros(shape, device=weight.device, dtype=weight.dtype))

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = TextRNN(input_size=len(idx_to_char), hidden_size=128, embedding_size=128, n_layers=2)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, amsgrad=True)
# factor=0.5 halves the learning rate once the reported mean loss has failed
# to improve for more than `patience` consecutive scheduler steps.
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm
# against the installed version.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    patience=5, 
    verbose=True, 
    factor=0.5
)

# Each "epoch" here is a single optimisation step on one random batch.
n_epochs = 10000
loss_avg = []

for epoch in range(n_epochs):
    model.train()
    train, target = get_batch(sequence)
    # get_batch() is batch-first; the LSTM wants (seq_len, batch, 1).
    train = train.permute(1, 0, 2).to(device)
    target = target.permute(1, 0, 2).to(device)
    hidden = model.init_hidden(BATCH_SIZE)

    output, hidden = model(train, hidden)
    # CrossEntropyLoss takes (batch, classes, seq_len) logits and
    # (batch, seq_len) targets.
    loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
    
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    loss_avg.append(loss.item())
    if len(loss_avg) >= 50:
        # Report, step the scheduler and print a sample every 50 batches.
        mean_loss = np.mean(loss_avg)
        print(f'Loss: {mean_loss}')
        scheduler.step(mean_loss)
        loss_avg = []
        model.eval()
        # Sampling needs no gradients; avoid building an autograd graph that
        # is never used.
        with torch.no_grad():
            predicted_text = evaluate(model, char_to_idx, idx_to_char)
        print(predicted_text)

  return torch._C._cuda_getDeviceCount() > 0


Loss: 3.414594569206238
 нала р наш  ланаь налакада нахе икаката  каха каша  лай на нена  а ьалах лид й ни ра  ас г ран нанон ланана нан кака канараб  шан нан налана нк нан  н  й р камарил  хан кала каг к  ка  кана н танинак 
Loss: 2.945315942764282
 сера сара басха лера карка оль хе хор кков рер хела аркма кола мерор карта токар кара кар ктури конка кал хара карт кера талу ралетар кар дона как кал цатар какар катта кара халь ката калра кара кун к
Loss: 2.803053951263428
 декал дакха кехь ха кӏала ора га келора кола лахь дахь къакха кор хар карта кара кӏалела кхакха кхахь кор куль ков къала кала кӏела лера къер калала кӏара къала дула ке лоьр ха когхоль къала карта кха
Loss: 2.721577925682068
 далара къаь махь тӏаьр даьл дала бекхахь хьаькъ къехь мал къахь къархь сар къаьрна къахь къала мал кхьахь махь тера моли малар мар алин дора хола манха бехь мара кханан март мала халан кхалар ха тӏаьл
Loss: 2.6382189893722536
 керт макъолла къоволор киралов карго къара корт къаьна къов морахь къо

In [None]:
# Build a synthetic corpus: for every line of the original corpus, seed the
# model with every 200th word and let it sample 200 more characters.
with open(corpus_path) as f:
    texts = f.readlines()

# Do not rely on whichever mode the training cell left the model in: if
# training was interrupted mid-iteration, dropout would still be active.
model.eval()

generated_texts = []
# Sampling needs no gradients.
with torch.no_grad():
    # The loop variable is deliberately not called `text`, so the corpus
    # string loaded earlier under that name is not overwritten.
    for line in texts:
        seed_words = line.split()[::200]
        # Samples for one line are concatenated without a separator; a line
        # with no words yields an empty string.
        generated_texts.append(''.join(
            evaluate(model, char_to_idx, idx_to_char, temp=0.3, prediction_len=200, start_text=seed_word)
            for seed_word in seed_words
        ))

generated_path = 'generated_corpus_ce.txt'
with open(generated_path, 'w') as f:
    f.writelines(generated_text + '\n' for generated_text in generated_texts)