In [None]:
#!python3 -m spacy download ru_core_news_md

In [1]:
from abc import ABC, abstractmethod
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import spacy

from cryptography import utils, x509


In [2]:
import tqdm.auto as tqd

from ipywidgets import Output
from IPython.display import display

import matplotlib
import matplotlib.pyplot as plt

In [3]:
from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

In [4]:
from source.model import RNNModel, PretrainedSpacyEmbedding
from source.generator import ArgmaxGenerator, DistributionGenerator, BeamSearchGenerator
from source.utils import Lang, spacy_lemmatizer
from source.train_utils import SequenceNLLLoss, NewsDataset, padding_collater, split_dataset

In [32]:
!python3 train.py --help

usage: train.py [-h] [--data-path DATA_PATH] [--embedding-dim EMBEDDING_DIM]
                [--hidden-dim HIDDEN_DIM] [--device-name DEVICE_NAME]
                [--n-epochs N_EPOCHS] [--batch-size BATCH_SIZE]
                [--checkpoint-path CHECKPOINT_PATH]

Train RNN model for generation.

optional arguments:
  -h, --help            show this help message and exit
  --data-path DATA_PATH
                        path to file train sentences
  --embedding-dim EMBEDDING_DIM
                        embedding size
  --hidden-dim HIDDEN_DIM
                        hidden state size
  --device-name DEVICE_NAME
                        device name. to use cpu for training change value to
                        any
  --n-epochs N_EPOCHS
  --batch-size BATCH_SIZE
  --checkpoint-path CHECKPOINT_PATH
                        path to save trained model


## Data preparation

In [5]:
# Location of the raw corpus file.
news_data_path = "data/headers_full.txt"

# Read the whole file in one go (text mode, platform default encoding).
news_data_raw = Path(news_data_path).read_text()

# Every '. ' occurrence is treated as a sentence boundary.
news_sentences = news_data_raw.split('. ')

In [6]:
# Peek at the first entry to sanity-check the split.
news_sentences[0]

'рпцз призвала вынести ленина из мавзолея и начать декоммунизацию'

#### Spacy lemmatization

In [7]:
# Load the spaCy pipeline "ru_core_news_md"; it is used below for tokenization and lemmas.
nlp = spacy.load("ru_core_news_md")

In [8]:
# Index into the list of lemma counts that selects the `min_count` frequency cutoff.
N_WORDS = 20000
# Sentences with more spaCy tokens than this are skipped by the filtering cell.
MAX_LENGTH = 30

In [21]:
from collections import defaultdict

# Count how many times each lemma occurs across the raw sentences.
frequencies = defaultdict(int)

for sentence in news_sentences:
    doc = nlp(str(sentence))
    for word in doc:
        frequencies[word.lemma_] += 1

print('Had', len(news_sentences), 'sentences')

# Sort the counts themselves in descending order. The previous key function
# looked each count up as a dictionary key (`frequencies[count]`), which always
# produced 0 - so nothing was actually sorted - and, because `frequencies` is a
# defaultdict, also inserted a bogus integer key for every distinct count.
counts_desc = sorted(frequencies.values(), reverse=True)

# `min_count` is the count found at position N_WORDS of the descending ranking;
# the filtering cell keeps only sentences whose lemmas are all strictly more
# frequent than this. With N_WORDS or fewer distinct lemmas there is nothing to
# cut off, so fall back to 0 instead of raising IndexError.
min_count = counts_desc[N_WORDS] if len(counts_desc) > N_WORDS else 0
print(min_count)

Had 58604 sentences
2


In [30]:
def _keep_sentence(sentence):
    """Return True when `sentence` passes both the length and the frequency filter."""
    doc = nlp(str(sentence))
    # Too many tokens: reject before looking at frequencies.
    if len(doc) > MAX_LENGTH:
        return False
    # Every token's lemma must occur strictly more often than `min_count`.
    return all(frequencies[token.lemma_] > min_count for token in doc)


news_sentences_filtered_prep = [
    candidate for candidate in news_sentences if _keep_sentence(candidate)
]

print('Filtered', len(news_sentences_filtered_prep), 'sentences')

Filtered 38481 sentences


In [32]:
# Cache the filtered sentences on disk, one sentence per line.
with open("data/headers_filtered.txt", 'w') as filtered_file:
    for kept_sentence in news_sentences_filtered_prep:
        filtered_file.write(kept_sentence + '\n')

On subsequent launches, skip the filtering cells above and load the cached file instead:

In [9]:
# Reload the cached filtered sentences; every entry keeps its trailing newline.
with open("data/headers_filtered.txt", 'r') as filtered_file:
    news_sentences_filtered = list(filtered_file)

# Report how many sentences were loaded, then show the first one.
print(len(news_sentences_filtered), news_sentences_filtered[0], sep='\n')

38481
найдены тела пропавших моряков с американского эсминца



#### BPE

In [10]:
# Passed to learn_bpe as `num_symbols` when the BPE rules are built.
BPE_VOCAB_SIZE = 4000

In [154]:
tokenizer = WordPunctTokenizer()


def tokenize(x):
    """Lower-case `x`, split it with `tokenizer`, and re-join the tokens with single spaces."""
    lowered = x.lower()
    tokens = tokenizer.tokenize(lowered)
    return ' '.join(tokens)

In [155]:
# Tokenize every raw sentence and write it out, one sentence per line.
with open('data/train.ru', 'w') as f_dst:
    for line in news_sentences:
        line = line.strip()
        f_dst.write(tokenize(line) + '\n')

# Learn the BPE merge rules from the tokenized corpus. Both files are opened in
# a `with` block so the rules file is flushed and closed before it is read back;
# previously the handles were created inline and never closed explicitly.
with open('data/train.ru') as f_src, open('data/bpe_rules.ru', 'w') as f_rules:
    learn_bpe(f_src, f_rules, num_symbols=BPE_VOCAB_SIZE)

# BPE consumes the rules file while it is being constructed, so the handle can
# be released as soon as the object exists.
with open('data/bpe_rules.ru') as f_rules:
    bpe = BPE(f_rules)

# Apply the learned merges line by line and store the encoded corpus.
with open('data/train.ru') as f_src, open('data/train.bpe.ru', 'w') as f_out:
    for line in f_src:
        f_out.write(bpe.process_line(line.strip()) + '\n')

100%|██████████| 4000/4000 [00:07<00:00, 501.56it/s]


On subsequent launches, skip the BPE training cell above and load the cached BPE-encoded file instead:

In [10]:
# Reload the BPE-encoded corpus; every entry keeps its trailing newline.
with open('data/train.bpe.ru', 'r') as bpe_file:
    news_sentences_bpe = list(bpe_file)

# Report how many lines were loaded, then show the first one.
print(len(news_sentences_bpe), news_sentences_bpe[0], sep='\n')

58604
рп@@ ц@@ з призвала вы@@ не@@ сти лен@@ ина из м@@ ав@@ зо@@ ле@@ я и нач@@ ать де@@ коммуни@@ зацию



In [11]:
# Word-level lang: sentences are stripped and tokenized with spaCy. Note that the
# "lemmatizer" returns the raw token text (word.text), not word.lemma_, so the
# vocabulary presumably holds surface forms rather than lemmas.
lang = Lang(tokenizer=lambda x: nlp(x.strip()), lemmatizer=lambda word: word.text)
lang.addDocument(news_sentences_filtered)
lang.getStat()

Vocab size 38381
Most frequent [(19024, 'в'), (7703, 'на'), (4399, 'о'), (4262, 'с'), (3567, 'за'), (3481, '-'), (3192, 'и'), (2837, 'из'), (2333, 'по'), (2333, 'россии')]
Less frequent [(1, 'газзаева'), (1, 'зеркало'), (1, 'разделили'), (1, 'свердловскую'), (1, 'осу'), (1, 'двойню'), (1, 'пограничных'), (1, 'корпусу'), (1, 'боксерском'), (1, 'травм')]


In [12]:
# Char-level lang: tokenizer and lemmatizer are both identity functions
# (presumably Lang then iterates over each sentence string character by character).
# Built from the unfiltered `news_sentences`.
char_lang = Lang(tokenizer=lambda x: x, lemmatizer=lambda x: x)
char_lang.addDocument(news_sentences)
char_lang.getStat()

Vocab size 79
Most frequent [(402838, ' '), (285032, 'о'), (259369, 'а'), (255837, 'и'), (210645, 'е'), (177186, 'р'), (173345, 'с'), (172203, 'н'), (147323, 'в'), (146285, 'т')]
Less frequent [(90, 'q'), (100, '?'), (124, ':'), (153, 'j'), (193, '.'), (196, '!'), (246, 'z'), (392, 'x'), (516, 'v'), (630, '9')]


In [13]:
# BPE-level lang: each BPE-encoded line is split on whitespace and the resulting
# sub-word tokens are used unchanged (identity lemmatizer).
bpe_lang = Lang(tokenizer=lambda x: x.split(), lemmatizer=lambda x: x)
bpe_lang.addDocument(news_sentences_bpe)
bpe_lang.getStat()

Vocab size 4118
Most frequent [(28281, 'в'), (12327, 'на'), (8125, 'с'), (7772, '-'), (7667, 'о'), (7200, 'и'), (5832, 'за'), (4401, 'из'), (4067, 'у@@'), (3937, 'е')]
Less frequent [(1, 'goo@@'), (1, 'ook'), (1, 'ссажи@@'), (1, 'goog@@'), (1, 'суэ@@'), (1, ',@@'), (1, 'товал@@'), (1, 'илот@@'), (1, 'паци@@'), (1, 'ъяв@@')]


## Train

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
from train import train

In [21]:
# Experiment: randomly initialised embeddings, word-level vocabulary,
# first 5000 filtered sentences only.
from source.utils import PAD_TOKEN_INDEX

# Optimisation hyper-parameters.
n_epochs = 2
batch_size = 32
learning_rate = 1e-2

# Fresh nn.Embedding with one row per entry of the word-level vocabulary.
emb_dim = 128
NUM_EMBEDDINGS = len(lang.index2word)
empty_emb_layer = nn.Embedding(num_embeddings=NUM_EMBEDDINGS,
                               embedding_dim=emb_dim)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = RNNModel(empty_emb_layer,
                 hidden_dim=128).to(device)

# Text generators handed to train() together with the model.
gens = [ArgmaxGenerator(lang.index2word), 
        DistributionGenerator(lang.index2word, k_max=30),
        BeamSearchGenerator(lang.index2word, max_length=10, beam_width=3)]

# The loss is told to ignore the padding index.
loss = SequenceNLLLoss(ignore_index=PAD_TOKEN_INDEX)
opt = optim.Adam(model.parameters(), lr=learning_rate)

# Dataset over the first 5000 filtered sentences, reusing the word-level
# tokenizer / lemmatizer / index mapping of `lang`.
short_word_dataset = NewsDataset(
    sentences=news_sentences_filtered[:5000],
    tokenizer=lang.tokenizer,
    lemmatizer=lang.lemmatizer,
    word2index=lang.word2index
)
short_word_dataset_train, short_word_dataset_val = split_dataset(short_word_dataset)

# Only the training loader is shuffled; both use padding_collater to build batches.
train_dataloader = DataLoader(short_word_dataset_train, batch_size=batch_size, shuffle=True, 
                              collate_fn=padding_collater)
val_dataloader = DataLoader(short_word_dataset_val, batch_size=batch_size, shuffle=False, 
                            collate_fn=padding_collater)

In [22]:
# Run training for the short word-level experiment configured in the previous cell.
train(model, device, gens, loss, opt, train_dataloader, val_dataloader, n_epochs, visualize=True)

Output()

Epoch 1


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [24]:
# Experiment: randomly initialised embeddings, word-level vocabulary,
# all filtered sentences. Relies on `device` and `PAD_TOKEN_INDEX` already
# being defined by an earlier cell.

# Optimisation hyper-parameters.
n_epochs = 2
batch_size = 32
learning_rate = 1e-2

# Fresh nn.Embedding with one row per entry of the word-level vocabulary.
emb_dim = 64
NUM_EMBEDDINGS = len(lang.index2word)
empty_emb_layer = nn.Embedding(num_embeddings=NUM_EMBEDDINGS,
                               embedding_dim=emb_dim)
model = RNNModel(empty_emb_layer,
                 hidden_dim=64).to(device)

# Text generators handed to train() together with the model.
gens = [ArgmaxGenerator(lang.index2word, max_length=30), 
        DistributionGenerator(lang.index2word, k_max=30, max_length=30)]

# The loss is told to ignore the padding index.
loss = SequenceNLLLoss(ignore_index=PAD_TOKEN_INDEX)
opt = optim.Adam(model.parameters(), lr=learning_rate)

# Dataset over every filtered sentence, reusing the word-level settings of `lang`.
word_dataset = NewsDataset(
    sentences=news_sentences_filtered,
    tokenizer=lang.tokenizer,
    lemmatizer=lang.lemmatizer,
    word2index=lang.word2index
)
word_dataset_train, word_dataset_val = split_dataset(word_dataset)

# Only the training loader is shuffled; both use padding_collater to build batches.
train_dataloader = DataLoader(word_dataset_train, batch_size=batch_size, shuffle=True, 
                              collate_fn=padding_collater)
val_dataloader = DataLoader(word_dataset_val, batch_size=batch_size, shuffle=False, 
                            collate_fn=padding_collater)

In [25]:
# Run training for the full word-level experiment configured in the previous cell.
train(model, device, gens, loss, opt, train_dataloader, val_dataloader, n_epochs, visualize=True)

Output()

Epoch 1


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [29]:
# Experiment: embedding layer built from the spaCy pipeline, word-level
# vocabulary, all filtered sentences. Relies on `device` and `PAD_TOKEN_INDEX`
# already being defined by an earlier cell.
# NOTE(review): "fixed" presumably means the pretrained vectors are frozen -
# confirm in PretrainedSpacyEmbedding.

# Optimisation hyper-parameters. `val_freq` is not referenced by any visible cell.
n_epochs = 2
val_freq = 1
batch_size = 32
learning_rate = 1e-2


# Embedding layer constructed from the spaCy pipeline and the word-level vocabulary.
spacy_emb_layer = PretrainedSpacyEmbedding(nlp, lang.index2word)
model = RNNModel(spacy_emb_layer, 
                 hidden_dim=64).to(device)

# Text generators handed to train() together with the model.
gens = [ArgmaxGenerator(lang.index2word), 
        DistributionGenerator(lang.index2word, k_max=30),
        BeamSearchGenerator(lang.index2word, beam_width=2)]

# The loss is told to ignore the padding index.
loss = SequenceNLLLoss(ignore_index=PAD_TOKEN_INDEX)
opt = optim.Adam(model.parameters(), lr=learning_rate)

# The inline lemmatizer (word.text) is the same function `lang` was created with.
word_dataset = NewsDataset(
    sentences=news_sentences_filtered,
    tokenizer=lang.tokenizer,
    lemmatizer=lambda word: word.text,
    word2index=lang.word2index
)
word_dataset_train, word_dataset_val = split_dataset(word_dataset)

# Only the training loader is shuffled; both use padding_collater to build batches.
train_dataloader = DataLoader(word_dataset_train, batch_size=batch_size, shuffle=True, 
                              collate_fn=padding_collater)
val_dataloader = DataLoader(word_dataset_val, batch_size=batch_size, shuffle=False, 
                            collate_fn=padding_collater)

In [30]:
# Run training for the spaCy-embedding experiment configured in the previous cell.
train(model, device, gens, loss, opt, train_dataloader, val_dataloader, n_epochs, visualize=True)

Output()

Epoch 1


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [184]:
# Draw ten samples from the current word-level `model` and print one per line.
gen = DistributionGenerator(
    index2word=lang.index2word,
    k_max=20,
    max_length=30,
)
generated_samples = [gen.generate(model, device) for _ in range(10)]
print(*generated_samples, sep='\n')

путин рассказал от возможности с на
суд львова рассказала в планах к сша в с - лет
в - рассказали о отсутствии с в в россии и сирии
в сети сообщили подробности смерти подготовке газа
на германии рассказали о освобождении в россии
в киеве сообщили в падении гибели двух москве
в сети возмутились подорожание и - за - на -
сми военные сообщили о планах с трампа
в сети обратили сроки с новой в россией в россией
в россии учредили в сша


In [None]:
# Experiment: character-level model with randomly initialised embeddings.
# Relies on `device` and `PAD_TOKEN_INDEX` already being defined by an earlier cell.

# Optimisation hyper-parameters. `val_freq` is not referenced by any visible cell.
n_epochs = 10
val_freq = 1
batch_size = 32
learning_rate = 1e-2

# Fresh nn.Embedding with one row per entry of the character vocabulary.
emb_dim = 256
NUM_EMBEDDINGS = len(char_lang.index2word)
empty_emb_layer = nn.Embedding(num_embeddings=NUM_EMBEDDINGS,
                               embedding_dim=emb_dim)
# Use the character embedding created just above. The cell previously passed
# `spacy_emb_layer`, which was built for the word-level vocabulary of `lang`
# and therefore does not match the character indices produced by `char_lang`.
model = RNNModel(empty_emb_layer,
                 hidden_dim=512).to(device)

# Text generators handed to train() together with the model.
gens = [ArgmaxGenerator(char_lang.index2word),
        DistributionGenerator(char_lang.index2word, k_max=30),
        BeamSearchGenerator(char_lang.index2word, beam_width=2)]

# The loss is told to ignore the padding index.
loss = SequenceNLLLoss(ignore_index=PAD_TOKEN_INDEX)
opt = optim.Adam(model.parameters(), lr=learning_rate)

# NOTE(review): `char_lang` was built from `news_sentences`, while the dataset
# uses `news_sentences_filtered` (lines read back with a trailing newline) -
# confirm every character, including '\n', is present in char_lang.word2index.
char_dataset = NewsDataset(
    sentences=news_sentences_filtered,
    tokenizer=char_lang.tokenizer,
    lemmatizer=char_lang.lemmatizer,
    word2index=char_lang.word2index
)
char_dataset_train, char_dataset_val = split_dataset(char_dataset)

# Only the training loader is shuffled; both use padding_collater to build batches.
train_dataloader = DataLoader(char_dataset_train, batch_size=batch_size, shuffle=True,
                              collate_fn=padding_collater)
val_dataloader = DataLoader(char_dataset_val, batch_size=batch_size, shuffle=False,
                            collate_fn=padding_collater)

In [12]:
# Experiment: BPE-level model with randomly initialised embeddings.
from source.utils import PAD_TOKEN_INDEX

# `val_freq` is not referenced by any visible cell.
val_freq = 1
batch_size = 32

# Fresh nn.Embedding with one row per entry of the BPE vocabulary.
emb_dim = 256
hid_dim = 1024
NUM_EMBEDDINGS = len(bpe_lang.index2word)
empty_emb_layer = nn.Embedding(num_embeddings=NUM_EMBEDDINGS,
                               embedding_dim=emb_dim)
# Fall back to the CPU when no GPU is present, as the other cells do, instead of
# unconditionally requesting 'cuda'.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = RNNModel(empty_emb_layer,
                 hidden_dim=hid_dim).to(device)

# Text generators handed to train() together with the model.
gens = [ArgmaxGenerator(bpe_lang.index2word, max_length=100),
        DistributionGenerator(bpe_lang.index2word, max_length=100, k_max=20)]

# The loss is told to ignore the padding index. The optimizer is created in the
# training cells that follow.
loss = SequenceNLLLoss(ignore_index=PAD_TOKEN_INDEX)

# Dataset over the BPE-encoded lines, reusing the settings of `bpe_lang`.
bpe_dataset = NewsDataset(
    sentences=news_sentences_bpe,
    tokenizer=bpe_lang.tokenizer,
    lemmatizer=bpe_lang.lemmatizer,
    word2index=bpe_lang.word2index
)
bpe_dataset_train, bpe_dataset_val = split_dataset(bpe_dataset)

# Only the training loader is shuffled; both use padding_collater to build batches.
train_dataloader = DataLoader(bpe_dataset_train, batch_size=batch_size, shuffle=True,
                              collate_fn=padding_collater)
val_dataloader = DataLoader(bpe_dataset_val, batch_size=batch_size, shuffle=False,
                            collate_fn=padding_collater)

In [13]:
# First training stage for the BPE model: 3 epochs at lr=1e-3.
opt = optim.Adam(model.parameters(), lr=1e-3)
# `device` is passed as the second argument, matching the other train() calls in
# this notebook; it was missing here.
train(model, device, gens, loss, opt, train_dataloader, val_dataloader, 3)

Output()

Epoch 1


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 3


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [213]:
# Second training stage for the BPE model: 1 more epoch with a fresh optimizer at lr=1e-4.
opt = optim.Adam(model.parameters(), lr=1e-4)
# `device` is passed as the second argument, matching the other train() calls in
# this notebook; it was missing here.
train(model, device, gens, loss, opt, train_dataloader, val_dataloader, 1)

Output()

Epoch 1


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [250]:
# Draw a single sample from the BPE model and print it.
gen = DistributionGenerator(bpe_lang.index2word, max_length=150, k_max=20)
bpe_samples = [gen.generate(model, device) for _ in range(1)]
print(*bpe_samples, sep='\n')

в москве задержали 286 - го полицейских за несколько недель


In [251]:
# Display the module structure of the current model.
model

RNNModel(
  (embedding): Embedding(4118, 256)
  (lstm): LSTM(256, 1024, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (index_projection): Linear(in_features=1024, out_features=4118, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [17]:
# File the BPE model's state_dict is saved to and later loaded from.
path = 'bpe_4000_1024'

In [253]:
# Persist only the model's state_dict (not the whole module object) to `path`.
torch.save(model.state_dict(), path)

In [19]:
# Rebuild the BPE model skeleton so a saved state_dict can be loaded into it.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Same sizes as the cell that trained the model being restored.
emb_dim = 256
hid_dim = 1024
NUM_EMBEDDINGS = len(bpe_lang.index2word)
empty_emb_layer = nn.Embedding(num_embeddings=NUM_EMBEDDINGS,
                               embedding_dim=emb_dim)
# Construct the model the same way as every other cell in this notebook:
# embedding layer plus `hidden_dim`. The extra NUM_EMBEDDINGS / embedding_dim
# arguments were inconsistent with those calls; both sizes are already carried
# by `empty_emb_layer`.
model = RNNModel(empty_emb_layer,
                 hidden_dim=hid_dim).to(device)

In [20]:
# Restore the saved weights. `map_location=device` remaps tensors that were
# saved from a GPU onto whichever device is active now, so the checkpoint also
# loads on a CPU-only machine.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(path, map_location=device))
# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

RNNModel(
  (embedding): Embedding(4118, 256)
  (lstm): LSTM(256, 1024, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (index_projection): Linear(in_features=1024, out_features=4118, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [21]:
# Generator over the BPE vocabulary, used with the restored model in the next cell.
gen = DistributionGenerator(bpe_lang.index2word, max_length=150, k_max=20)

In [22]:
# Generate one sample from the restored model; the returned value is displayed as the cell output.
gen.generate(model, device)

'в совфеде оценили шансы на идею путина и трампа про борьбу с россией'