<a href="https://colab.research.google.com/github/DmitriyValetov/nlp_course_project/blob/master/ria_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

In [0]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate this Colab session with Google, then build a PyDrive client
# from the session's application-default credentials.
# This only needs to be done once per notebook.
# NOTE: the name `drive` bound here is rebound by the later cell that runs
# `from google.colab import drive`; run the download cell before that one.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [2]:
# Download a file from Google Drive based on its file ID.
# Known file IDs for the preprocessed corpus variants (labels as given by
# the author):
# 1-UtATnzLE809Vi6RLgy3GRHX2TXRzhd6  # norm
# 1bhsdkXYEe4qixPddK9DkaQ-7z0jAn5Bi  # stop
# 1-40QXRckYZIfJTPiAhtQ8LAeBKks8ITx  # lem
# 1m7unmZmh0B3DJ-hiLm8-a5WoXECMc4cV  # snow
file_id = '1-40QXRckYZIfJTPiAhtQ8LAeBKks8ITx'
downloaded = drive.CreateFile({'id': file_id})
# Metadata must be fetched first so the Drive-side title can be reused as
# the local file name.
downloaded.FetchMetadata()
fn = downloaded.metadata['title']
print(f'downloading: {fn}')
# Writes the file content into the current working directory under `fn`.
downloaded.GetContentFile(fn)

downloading: stop_lem_norm_sents_ria.json.gz


In [0]:
import gzip
import json
from tqdm.notebook import tqdm

# The corpus is a gzip file with one JSON object per line.
fn = 'stop_lem_norm_sents_ria.json.gz'
# fn = 'norm_sents_ria.json.gz'

# First check: parse and print only the first record to eyeball the schema.
with gzip.open(fn, 'rb') as f:
  print(json.loads(next(f)))

# Parse-only pass over the whole file (disabled; superseded by the pass below).
# with gzip.open(fn, 'rb') as f:
#   for line in tqdm(f):
#     json.loads(line)

# Full pass: parse every record and report those whose 'text' or 'title'
# is empty, together with their 1-based line number.
cnt = 0      # records read so far
bad_cnt = 0  # records with an empty 'text' or 'title'
with gzip.open(fn, 'rb') as f:
  for line in tqdm(f):
    cnt += 1
    n = json.loads(line)
    if len(n['text']) == 0 or len(n['title']) == 0:
      bad_cnt += 1
      print(cnt)
      print(n)
# Summary printed as "<bad records>/<total records>".
print(f'{bad_cnt}/{cnt}')

In [0]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import os
import json
import gzip
from pprint import pprint
from collections import Counter
from tqdm.notebook import tqdm
import numpy as np

class RiaDataset(Dataset):
  """In-memory dataset of (text sentences, title sentences) encoded as id arrays.

  The input file holds one JSON object per line with a 'text' and a 'title'
  field; each field is a sequence of whitespace-tokenised sentence strings.
  Records whose 'text' or 'title' is empty are skipped.

  Attributes:
    itos: list mapping token id -> token string. Ids 0..3 are the special
      tokens '<pad>', '<unk>', '<go>', '<eos>'.
    stoi: dict mapping token string -> token id (inverse of `itos`).
    samples: list of `[x1, x2]` pairs where `x1` / `x2` are lists of 1-D
      numpy id arrays (one array per text / title sentence).
  """

  def __init__(self, path, max_vocab=10000, max_samples=500):
    """Load up to `max_samples` lines from `path` and encode them.

    Args:
      path: JSON-lines file; opened with gzip when the extension is '.gz'.
      max_vocab: number of most frequent words kept in the vocabulary
        (the four special tokens are added on top of this).
      max_samples: upper bound on the number of lines read. A file with
        fewer lines is read to its end instead of raising StopIteration.
    """
    # Zero-argument super(): the one-argument form `super(RiaDataset)` builds
    # an unbound super object, so the base initialiser was never invoked.
    super().__init__()
    samples = []
    words = Counter()
    bad_ids = []  # 0-based line numbers of skipped records
    _, ext = os.path.splitext(path)
    with gzip.open(path, 'rb') if ext == '.gz' else open(path) as f:
      # zip() ends at whichever runs out first: the line budget or the file.
      numbered_lines = zip(range(max_samples), f)
      for cnt, line in tqdm(numbered_lines, total=max_samples,
                            desc='loading samples'):
        n = json.loads(line)
        if len(n['text']) != 0 and len(n['title']) != 0:
          samples.append(n)
          words.update(w for s in n['text'] for w in s.split())
          words.update(w for s in n['title'] for w in s.split())
        else:
          bad_ids.append(cnt)
    print(f'bad texts: {len(bad_ids)} {bad_ids}')
    print('making vocabulary')
    print(f'unique words: {len(words)}, total words: {sum(words.values())}')
    self.itos = ['<pad>', '<unk>', '<go>', '<eos>']
    vocab_words = words.most_common(max_vocab)
    self.itos += [w for w, _ in vocab_words]
    self.stoi = {x: i for i, x in enumerate(self.itos)}
    # Fails if a corpus word collides with one of the special token strings.
    assert len(self.itos) == len(self.stoi)
    print(f'vocabulary: {len(self.stoi)}, words in vocabulary: {sum(c for _, c in vocab_words)}')
    print('encoding samples')
    self.samples = []
    for s in tqdm(samples, desc='encoding samples'):
      x1 = [self.encode(x) for x in s['text']]
      x2 = [self.encode(x) for x in s['title']]
      self.samples.append([x1, x2])
    print(f'samples: {len(self.samples)}')

  def __len__(self):
    """Number of kept (non-empty) samples."""
    return len(self.samples)

  def __getitem__(self, i):
    """Return the `[x1, x2]` pair of encoded sentence lists for sample `i`."""
    return self.samples[i]

  def encode(self, s):
    """Encode one sentence string as `<go> ids... <eos>`; unknown words map to `<unk>` (1)."""
    return np.array([2] + [self.stoi.get(x, 1) for x in s.split()] + [3])

  def decode(self, s):
    """Map an iterable of token ids back to their token strings."""
    return [self.itos[x] for x in s]

class Collate():
  """Batch collator pairing selected text sentences with every title sentence.

  For each sample, up to `n_x1` text sentences are taken, skipping the first
  sentence whenever more than one sentence is available. Each selected text
  sentence is paired with each title sentence; both sides are then padded
  into rectangular tensors.
  """

  def __init__(self, n_x1=1):
    self.n_x1 = n_x1

  def __call__(self, batch):
    """Return `[bx1, bx2]`: zero-padded, batch-first tensors of paired sentences."""
    text_rows, title_rows = [], []
    for sentences, titles in batch:
      # Upper bound is one past the requested count because sentence 0
      # (usually date and place) is dropped when there is anything after it.
      stop = min(len(sentences), self.n_x1 + 1)
      start = 0 if stop <= 1 else 1
      chosen = sentences[start:stop]
      pairs = ((sent, title) for sent in chosen for title in titles)
      for sent, title in pairs:
        text_rows.append(torch.as_tensor(sent))
        title_rows.append(torch.as_tensor(title))
    pad = torch.nn.utils.rnn.pad_sequence
    padded_text = pad(text_rows, batch_first=True)
    padded_title = pad(title_rows, batch_first=True)
    return [padded_text, padded_title]

# Build the dataset from the downloaded corpus: keep the 50000 most frequent
# words and read up to 1003869 lines (the recorded run output below reports
# 1003723 kept samples and 146 skipped ones, i.e. the whole file).
ds = RiaDataset('stop_lem_norm_sents_ria.json.gz', max_vocab=50000, 
                max_samples=1003869)  # 1003869
# torch.manual_seed(0)
# torch.cuda.manual_seed(0)
# dl = DataLoader(ds, batch_size=1, num_workers=1, shuffle=False, 
#                 drop_last=False, collate_fn=Collate())
# for s in tqdm(dl, desc='dataset'):
#   bx1, bx2 = s
#   assert bx1.size()[0] == bx2.size()[0]
#   # print(bx1.size(), bx2.size())
#   # for x1, x2 in zip(bx1, bx2):
#   #   print(ds.decode(x1), ds.decode(x2))
#   # print([ds.decode(x) for x in bx1])
#   # print([ds.decode(x) for x in bx2])
# 70% / 20% / 10% train / test / validation split; the validation part takes
# whatever remains after integer truncation so the three lengths sum to
# len(ds).
train_len = int(0.7*len(ds))
test_len = int(0.2*len(ds))
val_len = len(ds) - train_len - test_len
lens = [train_len, test_len, val_len]
print(lens, sum(lens), len(ds))
# NOTE: no seed is set before random_split (the manual_seed calls above are
# commented out), so the split differs between kernel runs.
train_ds, test_ds, val_ds = random_split(ds, lens)
# Each loader uses its own Collate(n): up to n text sentences per sample are
# paired with the title sentences.
train_dl = DataLoader(train_ds, batch_size=2, num_workers=1, 
                      shuffle=True, drop_last=False, 
                      collate_fn=Collate(3))
test_dl = DataLoader(test_ds, batch_size=3, num_workers=1, 
                     shuffle=True, drop_last=False, 
                     collate_fn=Collate(4))
val_dl = DataLoader(val_ds, batch_size=3, num_workers=1, 
                    shuffle=True, drop_last=False, 
                    collate_fn=Collate(2))
# for s in tqdm(train_dl, desc='train'):
#   bx1, bx2 = s
#   assert bx1.size()[0] == bx2.size()[0]
#   # print(bx1.size(), bx2.size())
# for s in tqdm(test_dl, desc='test'):
#   bx1, bx2 = s
#   assert bx1.size()[0] == bx2.size()[0]
#   # print(bx1.size(), bx2.size())
# for s in tqdm(val_dl, desc='val'):
#   bx1, bx2 = s
#   assert bx1.size()[0] == bx2.size()[0]
#   # print(bx1.size(), bx2.size())

HBox(children=(FloatProgress(value=0.0, description='loading samples', max=1003869.0, style=ProgressStyle(desc…


bad texts: 146 [85124, 121073, 167715, 176024, 191679, 196491, 233761, 240737, 248487, 260013, 285129, 292992, 293089, 293116, 301604, 305587, 312586, 314213, 316134, 317722, 318000, 318655, 325713, 362998, 366397, 382262, 397968, 400780, 406416, 411736, 431649, 437910, 453488, 463202, 463257, 467341, 471837, 477691, 480756, 483624, 487196, 493816, 495050, 495318, 495338, 495705, 495898, 498775, 499475, 501329, 503532, 503880, 504118, 504486, 504600, 508333, 510257, 517735, 518394, 519404, 519432, 519506, 520672, 521438, 521730, 524884, 525492, 526160, 526173, 527055, 527102, 527562, 529467, 529469, 530575, 531190, 532894, 533960, 533982, 537247, 540254, 540565, 542038, 542044, 542769, 544866, 545213, 550244, 552215, 557118, 570332, 571466, 572783, 572920, 573314, 574084, 575678, 575691, 583560, 612454, 666619, 680547, 681071, 690771, 693126, 778882, 781772, 794032, 795809, 804124, 819613, 822100, 858471, 868287, 906565, 908647, 909081, 909141, 911907, 912070, 914943, 915590, 916612, 

HBox(children=(FloatProgress(value=0.0, description='encoding samples', max=1003723.0, style=ProgressStyle(des…


samples: 1003723
[702606, 200744, 100373] 1003723 1003723


In [0]:
# Force a garbage-collection pass; as the last expression of the cell, the
# number of unreachable objects found is displayed as the cell output.
import gc
gc.collect()

3011173

# Model

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class EncoderRNN(nn.Module):
  """Embedding + recurrent encoder (RNN / GRU / LSTM).

  Args:
    num_embeddings: vocabulary size of the embedding table.
    embedding_dim: embedding width E (also the RNN input size).
    padding_idx: token id used for padding; passed to nn.Embedding and used
      to derive sequence lengths when `pack` is True.
    rnn_type: one of 'RNN', 'GRU', 'LSTM' (KeyError otherwise).
    hidden_size: RNN hidden width H.
    num_layers: number of stacked recurrent layers NL.
    rnn_dropout: dropout between recurrent layers (nn.RNN `dropout`).
    bidirectional: run the RNN in both directions (ND = 2) when True.
    dropout: dropout applied to the final hidden (and cell) state.
    pack: when True the embedded input is packed by true sequence length,
      and the per-step output is returned as a PackedSequence.
  """

  def __init__(self, num_embeddings, embedding_dim, padding_idx,
               rnn_type, hidden_size, num_layers=1, rnn_dropout=0,
               bidirectional=False, dropout=0, pack=False):
    super(EncoderRNN, self).__init__()
    rnn_map = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}
    self.embedding = nn.Embedding(num_embeddings=num_embeddings,
                                  embedding_dim=embedding_dim,
                                  padding_idx=padding_idx)
    self.rnn = rnn_map[rnn_type](input_size=embedding_dim,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 batch_first=True,
                                 dropout=rnn_dropout,
                                 bidirectional=bidirectional)
    self.dropout = nn.Dropout(dropout)
    self.pack = pack

  def forward(self, x1):  # [B, L]
    """Encode a batch of token ids.

    Returns:
      `(ht, hn)` for RNN/GRU or `(ht, (hn, cn))` for LSTM, where `ht` is the
      per-step output ([B, L, ND*H], or a PackedSequence when `pack` is True)
      and `hn` / `cn` are the dropout-regularised final states [NL*ND, B, H].
    """
    x = self.embedding(x1)  # [B, L] -> [B, L, E]
    if self.pack:  # [B, L, E] -> Packed [B, L, E]
      # Count non-pad tokens with the configured pad id (0 when none is set,
      # which is what the code assumed before).
      pad = self.embedding.padding_idx
      if pad is None:
        pad = 0
      # pack_padded_sequence requires the lengths tensor to live on the CPU.
      ls = torch.sum(x1 != pad, dim=1).cpu()
      x = pack_padded_sequence(x, ls, batch_first=True, enforce_sorted=False)
    # [B, L, E] -> (Packed) [B, L, ND*H], ([NL*ND, B, H], ([NL*ND, B, H]))
    if isinstance(self.rnn, nn.LSTM):
      ht, (hn, cn) = self.rnn(x)
      hn, cn = self.dropout(hn), self.dropout(cn)
      return ht, (hn, cn)
    else:
      ht, hn = self.rnn(x)
      hn = self.dropout(hn)
      return ht, hn


class AttentionDecoderRNN(nn.Module):
  """Recurrent decoder with optional attention over the encoder outputs.

  Args:
    num_embeddings: target vocabulary size (also the output width).
    embedding_dim: embedding width E (also the RNN input size).
    padding_idx: target pad id; passed to nn.Embedding and used to derive
      target lengths when `pack` is True.
    rnn_type: one of 'RNN', 'GRU', 'LSTM' (KeyError otherwise).
    hidden_size: RNN hidden width H.
    num_layers, rnn_dropout, bidirectional: forwarded to the RNN.
    dropout: dropout applied to the features fed to the output layers.
    out_hidden: width of an optional extra linear layer before the output
      projection (0 disables it).
    attn_type: one of `ATTN_TYPES`; 'none' disables attention.
    pack: when True, target embeddings are packed and `h1[0]` is expected to
      be a PackedSequence (as produced by an encoder with pack=True).

  Raises:
    ValueError: if `attn_type` is not one of `ATTN_TYPES`.
  """

  ATTN_TYPES = ('dot', 'cos', 'dist', 'soft_dot', 'soft_cos', 'soft_dist', 'none')

  def __init__(self, num_embeddings, embedding_dim, padding_idx,
               rnn_type, hidden_size, num_layers=1, rnn_dropout=0,
               bidirectional=False, dropout=0, out_hidden=0,
               attn_type='soft_dot', pack=False):
    super(AttentionDecoderRNN, self).__init__()
    rnn_map = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}
    # Reject unknown attention types here; previously they only surfaced in
    # forward() as an unbound local variable.
    if attn_type not in self.ATTN_TYPES:
      raise ValueError(f'unknown attn_type {attn_type!r}, '
                       f'expected one of {self.ATTN_TYPES}')
    self.attn_type = attn_type
    self.embedding = nn.Embedding(num_embeddings=num_embeddings,
                                  embedding_dim=embedding_dim,
                                  padding_idx=padding_idx)
    self.rnn = rnn_map[rnn_type](input_size=embedding_dim,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 batch_first=True,
                                 dropout=rnn_dropout,
                                 bidirectional=bidirectional)
    # With attention the decoder state is concatenated with the attended
    # encoder state, doubling the feature width.
    out_input = hidden_size if self.attn_type == 'none' else 2*hidden_size
    if out_hidden > 0:
      self.out_hidden = nn.Linear(out_input, out_hidden)
      self.out = nn.Linear(out_hidden, num_embeddings)
    else:
      self.out_hidden = None
      self.out = nn.Linear(out_input, num_embeddings)
    self.dropout = nn.Dropout(dropout)
    self.softmax = nn.LogSoftmax(dim=2)
    self.pack = pack

  @staticmethod
  def _merge_directions(ht):
    """Average the two direction halves: [B, L, 2*H] -> [B, L, H]."""
    B, L, NDH = ht.size()
    return ht.view(B, L, 2, NDH // 2).mean(2)

  def _project(self, feats):
    """Dropout -> optional hidden layer -> vocabulary log-probabilities."""
    x = self.dropout(feats)
    if self.out_hidden is not None:
      x = self.out_hidden(x)   # [B, L2, F] -> [B, L2, OH]
    x = self.out(x)  # [B, L2, F] or [B, L2, OH] -> [B, L2, D2]
    return self.softmax(x)  # log-softmax over the vocabulary axis

  def _attention(self, ht2, ht1, pad_mask):
    """Attention weights [B, L2, L1] between decoder and encoder states.

    `pad_mask` is True at encoder pad positions; those positions receive
    zero weight in every attention variant.
    """
    kind = self.attn_type
    if kind in ('dot', 'soft_dot'):
      attn = torch.einsum('bih,bjh->bij', ht2, ht1)  # dot product
    elif kind in ('cos', 'soft_cos'):
      ht1n = F.normalize(ht1, p=2, dim=2)  # normalize to length = 1
      ht2n = F.normalize(ht2, p=2, dim=2)  # normalize to length = 1
      attn = torch.einsum('bih,bjh->bij', ht2n, ht1n)  # cosine similarity
    else:  # 'dist', 'soft_dist'
      attn = torch.cdist(ht2.contiguous(), ht1.contiguous())  # euclidean
    if kind in ('dot', 'cos'):
      return attn.masked_fill(pad_mask, 0)
    if kind in ('soft_dot', 'soft_cos'):
      return F.softmax(attn.masked_fill(pad_mask, float('-inf')), 2)
    # Distance based: pads are pushed infinitely far away.
    attn = attn.masked_fill(pad_mask, float('inf'))
    if kind == 'soft_dist':
      return F.softmin(attn, 2)
    attn = F.threshold(attn, threshold=1e-6, value=1e-6)  # avoid 1/0
    attn = 1/attn  # inverse distance; 1/inf == 0 for pads
    return F.normalize(attn, p=1, dim=2)  # rows sum to 1

  def forward(self, x2, h1, x1):
    """Decode `x2` given the encoder result `h1` for source ids `x1`.

    Args:
      x2: target token ids [B, L2].
      h1: encoder result `(ht1, hn1)`; `hn1` (a tensor, or an `(hn, cn)`
        tuple for LSTM) initialises the decoder RNN.
      x1: source token ids [B, L1]; positions equal to 0 are treated as
        padding and masked out of the attention.

    Returns:
      `(y2, attn)`: log-probabilities [B, L2, num_embeddings] and the
      attention weights [B, L2, L1] (None when attn_type == 'none').
    """
    x = self.embedding(x2)  # [B, L2] -> [B, L2, E]
    if self.pack:  # [B, L2, E] -> Packed [B, L2, E]
      pad = self.embedding.padding_idx
      if pad is None:
        pad = 0
      # pack_padded_sequence requires the lengths tensor to live on the CPU.
      lengths = torch.sum(x2 != pad, dim=1).cpu()
      x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
    ht1, hns1 = h1  # [B, L1, ND*H], ([NL*ND, B, H], [NL*ND, B, H])
    ht2, _ = self.rnn(x, hns1)
    if self.pack:  # Packed -> padded, keeping the width of the id tensors
      # total_length keeps the time axis aligned with x1 / x2 even when every
      # row of the batch ends in padding.
      ht1, _ = pad_packed_sequence(ht1, batch_first=True,
                                   total_length=x1.size(1))
      ht2, _ = pad_packed_sequence(ht2, batch_first=True,
                                   total_length=x2.size(1))
    if self.rnn.bidirectional:
      # [B, L, ND*H] -> [B, L, ND, H] -> [B, L, H]
      ht2 = self._merge_directions(ht2)
      if self.attn_type != 'none':
        ht1 = self._merge_directions(ht1)
    if self.attn_type == 'none':
      return self._project(ht2), None
    # Mask only where the x1 token is <pad>; broadcast over the L2 axis.
    pad_mask = (x1 == 0).unsqueeze(1)  # [B, 1, L1]
    attn = self._attention(ht2, ht1, pad_mask)  # [B, L2, L1]
    hw1 = torch.einsum('bij,bjh->bih', attn, ht1)  # weighted h1 [B, L2, H]
    ha = torch.cat((ht2, hw1), 2)  # [B, L2, H], [B, L2, H] -> [B, L2, H+H]
    return self._project(ha), attn


class EncoderDecoder(nn.Module):
  """Sequence-to-sequence model wiring an EncoderRNN to an AttentionDecoderRNN.

  The `enc_*` / `dec_*` arguments configure the encoder and the decoder
  respectively; `rnn_type`, `hidden_size`, `num_layers`, `bidirectional` and
  `pack` are shared by both, while `out_hidden` and `attn_type` apply to the
  decoder only.
  """

  def __init__(self,
               enc_num_embeddings, enc_embedding_dim, enc_padding_idx,
               dec_num_embeddings, dec_embedding_dim, dec_padding_idx,
               rnn_type, hidden_size, num_layers=1, out_hidden=0,
               enc_rnn_dropout=0, dec_rnn_dropout=0,
               bidirectional=False, enc_dropout=0, dec_dropout=0,
               attn_type='dot', pack=False):
    super().__init__()
    # Settings both halves must agree on so the encoder's final state can
    # initialise the decoder RNN.
    shared = dict(rnn_type=rnn_type,
                  hidden_size=hidden_size,
                  num_layers=num_layers,
                  bidirectional=bidirectional,
                  pack=pack)
    # The encoder is created first, then the decoder (parameter
    # initialisation order is kept).
    self.encoder = EncoderRNN(num_embeddings=enc_num_embeddings,
                              embedding_dim=enc_embedding_dim,
                              padding_idx=enc_padding_idx,
                              dropout=enc_dropout,
                              rnn_dropout=enc_rnn_dropout,
                              **shared)
    self.decoder = AttentionDecoderRNN(num_embeddings=dec_num_embeddings,
                                       embedding_dim=dec_embedding_dim,
                                       padding_idx=dec_padding_idx,
                                       dropout=dec_dropout,
                                       rnn_dropout=dec_rnn_dropout,
                                       out_hidden=out_hidden,
                                       attn_type=attn_type,
                                       **shared)

  def forward(self, x1, x2):
    """Encode source ids `x1` and decode target ids `x2`.

    Returns whatever the decoder returns: a `(log_probs, attn)` pair.
    """
    # [B, L1] -> [B, L1, ND*H], ([NL*ND, B, H], [NL*ND, B, H])
    encoded = self.encoder(x1)
    # The decoder also receives x1 so it can mask source padding.
    return self.decoder(x2, encoded, x1)


def external_attn(x1, x2, attn_dict=None):
    """Build a rule-based attention tensor from a target->source token map.

    For target step `j` the reference token is `x2[i, j + 1]` (decoder
    shift) while `j + 2 < L2`, and 0 for the last two steps. Source position
    `k` gets weight 1 when `x1[i, k]` is listed in `attn_dict[reference]`,
    else 0; each row is then L1-normalised (all-zero rows stay zero).

    Args:
      x1: source token ids, integer tensor [B, L1].
      x2: target token ids, integer tensor [B, L2].
      attn_dict: dict mapping a target token id to the list of source token
        ids it may attend to. None selects the built-in toy mapping
        `{1: [1], 2: [2], 4: [4, 3]}`.

    Returns:
      Float tensor [B, L2, L1], placed on CUDA when available, else on CPU.
    """
    # The argument used to be overwritten by a hard-coded dict and then read
    # through a non-existent `self`; the caller's mapping is honoured now.
    if attn_dict is None:
      attn_dict = {1: [1], 2: [2], 4: [4, 3]}
    B1, L1 = x1.size()
    B2, L2 = x2.size()
    src_rows = x1.tolist()
    tgt_rows = x2.tolist()
    attn = []
    for i in range(B1):
      b = []
      for j in range(L2):
        x2t = tgt_rows[i][j + 1] if j + 2 < L2 else 0  # decoder shift
        x1ts = attn_dict.get(x2t, [])
        b.append([1. if x1t in x1ts else 0. for x1t in src_rows[i]])
      attn.append(b)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # reshape keeps the tensor 3-D even when a dimension is empty.
    attn = torch.tensor(attn, dtype=torch.float).reshape(B1, L2, L1).to(device)
    attn = F.normalize(attn, p=1, dim=2)  # to [0, 1]
    return attn


from IPython.display import Image
# Image(make_dot(loss).render('loss', format='png'))
from tqdm.notebook import tqdm
from torch import optim
import numpy as np

def plot_attention(a, x1, x2, shift=True, mask=True, 
                   suptitle='Attention', figsize=None, 
                   decoder_x1=None, decoder_x2=None, tight=False,
                   labels=True):
  """Draw one grayscale attention heat-map per batch element.

  Args:
    a: attention array indexed as [batch, x2 step, x1 step].
    x1, x2: token-id arrays indexed as [batch, step]; used for tick labels.
    shift: drop the first token of both sequences and the last attention
      row, so rows line up with the tokens being predicted.
    mask: hide rows/columns whose token id is 0.
    suptitle: figure title.
    figsize: forwarded to `plt.subplots`.
    decoder_x1, decoder_x2: optional callables turning id arrays into tick
      labels.
    tight: call `plt.tight_layout()` at the end.
    labels: write the 'x1' / 'x2' axis labels.
  """
  # %matplotlib inline
  import matplotlib.pyplot as plt
  from mpl_toolkits.axes_grid1 import make_axes_locatable

  n_panels = a.shape[0]
  fig, axes = plt.subplots(1, n_panels, figsize=figsize)
  # plt.subplots returns a bare Axes (not an array) for a single panel.
  axes = axes if isinstance(axes, np.ndarray) else [axes]
  fig.suptitle(suptitle)
  for panel in range(n_panels):
    ax = axes[panel]
    weights, src, tgt = a[panel], x1[panel], x2[panel]
    if shift:
      weights, src, tgt = weights[:-1, 1:], src[1:], tgt[1:]
    if mask:
      keep_src = np.flatnonzero(src)
      keep_tgt = np.flatnonzero(tgt)
      weights = weights[keep_tgt, :][:, keep_src]
      src, tgt = src[keep_src], tgt[keep_tgt]
    im = ax.imshow(weights, cmap='gray')
    ax.set_xticks(np.arange(len(src)))
    ax.set_yticks(np.arange(len(tgt)))
    if labels:
      ax.set_xlabel('x1')
      ax.set_ylabel('x2')
    # Source tokens run along the top edge.
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')
    x_names = src if decoder_x1 is None else decoder_x1(src)
    y_names = tgt if decoder_x2 is None else decoder_x2(tgt)
    ax.set_xticklabels(x_names, rotation=90)
    ax.set_yticklabels(y_names)
    fig.colorbar(im, ax=ax, fraction=0.05, pad=0.05)
  if tight:
    plt.tight_layout()

# torch.backends.cudnn.enabled=False
# torch.backends.cudnn.deterministic=True
# torch.autograd.set_detect_anomaly(True)

# enc_dec_config = {
#   'enc_num_embeddings': 6,
#   'enc_embedding_dim': 2,
#   'enc_padding_idx': 0,
#   'dec_num_embeddings': 6,
#   'dec_embedding_dim': 2,
#   'dec_padding_idx': 0,
#   'rnn_type': 'RNN',
#   'hidden_size': 2,
#   'num_layers': 1,
#   'bidirectional': False,
#   'enc_rnn_dropout': 0,
#   'dec_rnn_dropout': 0,
#   'enc_dropout': 0,
#   'dec_dropout': 0,
#   'attn_type': 'soft_dist',
#   'out_hidden': 16,
#   'pack': True
# }
# # <pad>, <unk>, <go>, <eos>
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# seed = 0
# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# model = EncoderDecoder(**enc_dec_config).to(device)
# x1 = torch.tensor([[2, 1, 4, 3, 0], [2, 1, 4, 1, 3]]).to(device)
# x2 = torch.tensor([[2, 4, 1, 5, 3, 0], [2, 4, 5, 1, 1, 3]]).to(device)
# opt = optim.Rprop(model.parameters(), lr=0.01)
# loss_fn = torch.nn.NLLLoss(ignore_index=0)  # 0 is <PAD>
# pbar = tqdm(range(200))
# for i in pbar:
#   opt.zero_grad()
#   t = x2[:,1:].flatten(start_dim=0)
#   y2, attn = model(x1, x2)
#   p = y2[:,:-1,:].flatten(start_dim=0, end_dim=1)
#   loss = loss_fn(p, t)
#   loss.backward()
#   opt.step()
#   with torch.no_grad():
#     idx = torch.nonzero(t).view(-1)
#     acc = torch.sum(torch.argmax(p, 1)[idx] == t[idx]).float()/idx.size()[0]
#     ps = torch.argmax(y2[:,:-1,:], 2).detach().cpu().numpy()
#     ts = x2[:,1:].detach().cpu().numpy()
#     for sp, st in zip(ps, ts):
#       t_set = set(st) - {0, 1, 2, 3}
#       eos = sp[sp == 3][0] if 3 in sp else len(sp)
#       p_eos = sp[:eos]
#       p_set = set(p_eos) - {0, 1, 2, 3}
#       i_set = t_set.intersection(p_set)
#       pre = len(i_set)/len(p_set) if len(p_set) != 0 else 0
#       rec = len(i_set)/len(t_set) if len(t_set) != 0 else 0
#       f1 = 2*pre*rec/(pre + rec) if pre + rec != 0 else 0
#       # print(st, sp, p_eos)
#       # print(t_set, p_set)
#       # print(f'precision: {pre}, recall: {rec}, f1: {f1}')
#     if i % 25 == 0:
#       # print(attn)
#       # print(p)
#       # print(torch.argmax(p, 1))
#       # print(t)
#       if attn is not None:
#          plot_attention(attn.detach().cpu().numpy(), 
#                         x1.detach().cpu().numpy(), 
#                         x2.detach().cpu().numpy(),
#                         labels=True, tight=False,
#                         shift=False, mask=False, figsize=(10, 5))
#   pbar.set_description(f'loss: {loss:.3f}, acc: {acc:.3f}')
#   if acc == 1:
#     break
# # Image(make_dot(attn).render('attn', format='png'))

# Train

In [0]:
# Install optuna into the Colab runtime and mount Google Drive under
# /content/drive.
# NOTE: this import rebinds the global name `drive`, which the dataset cells
# above use for the PyDrive client.
!pip install optuna
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Symlink nvidia-smi into /usr/bin so it can be invoked by name, then print
# its report (the output element of the (status, output) pair).
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
import subprocess
print(subprocess.getstatusoutput('nvidia-smi')[1])

Fri May 15 23:34:03 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    31W / 250W |    715MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [0]:
# Packages used by mem_report() below (imported there as GPUtil, psutil and
# humanize). Versions are not pinned.
!pip install gputil
!pip install psutil
!pip install humanize



In [0]:
# Import packages
import os, sys, humanize, psutil, GPUtil

# Memory reporting helper
def mem_report():
  """Print the available CPU RAM and, for every GPU, free/total memory and utilisation."""
  available_ram = psutil.virtual_memory().available
  print("CPU RAM Free: " + humanize.naturalsize(available_ram))

  gpu_line = 'GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'
  for index, gpu in enumerate(GPUtil.getGPUs()):
    # memoryUtil is scaled by 100 to print a percentage.
    used_percent = gpu.memoryUtil*100
    print(gpu_line.format(index, gpu.memoryFree, gpu.memoryTotal, used_percent))

# Report once right away
mem_report()

CPU RAM Free: 19.5 GB
GPU 0 ... Mem Free: 15565MB / 16280MB | Utilization   4%


In [0]:
import torch
import optuna
from tqdm.notebook import tqdm
from pprint import pprint
import random
import numpy as np
from shutil import copyfile

def _bag_prf(pred, target, eos_id=3, special=(0, 1, 2, 3)):
  """Bag-of-tokens precision / recall / F1 of one predicted sequence.

  `pred` is truncated at its first `eos_id` (kept whole when there is none);
  the ids in `special` are ignored on both sides.
  """
  special = set(special)
  # Position of the first <eos>. The previous code took the matched *value*
  # (always 3), which truncated every prediction to its first three tokens.
  eos_hits = np.flatnonzero(pred == eos_id)
  cut = int(eos_hits[0]) if eos_hits.size else len(pred)
  p_set = set(pred[:cut].tolist()) - special
  t_set = set(target.tolist()) - special
  i_set = t_set.intersection(p_set)
  pre = len(i_set)/len(p_set) if len(p_set) != 0 else 0
  rec = len(i_set)/len(t_set) if len(t_set) != 0 else 0
  f1 = 2*pre*rec/(pre + rec) if pre + rec != 0 else 0
  return pre, rec, f1


def run(run_type, device, model, opt, loss_fn, dl, ds, print_triples=False, 
        plot_attn=False, inference='forced'):
  """Run one pass over `dl`, training when run_type == 'train', else evaluating.

  Targets are `x2` shifted left by one token; predictions are the model
  output without its last step. Token id 0 is treated as padding when
  computing accuracy.

  Args:
    run_type: 'train' enables gradient updates; any other value evaluates.
    device: device the batches are moved to.
    model: callable `model(x1, x2) -> (log_probs [B, L2, V], attn)`.
    opt: optimizer (used only when training).
    loss_fn: loss applied to flattened predictions and targets.
    dl: iterable of `(x1, x2)` batches.
    ds: object with a `decode` method; used for `print_triples` / `plot_attn`.
    print_triples: print source, target and prediction of the first item of
      the last batch.
    plot_attn: plot the attention of the first item of the last batch.
    inference: evaluation decoding mode: 'forced' (teacher forcing),
      'greedy', or the experimental 'beam'.

  Returns:
    Means of loss, accuracy, precision, recall and F1. Loss and accuracy are
    averaged over batches; precision, recall and F1 over individual samples.

  Raises:
    ValueError: on an unknown `inference` mode during evaluation.
    ZeroDivisionError: when `dl` yields no batches.
  """
  losses, accs, pres, recs, f1s = [], [], [], [], []
  pbar = tqdm(dl)

  def teacher_loss(y2, x2):
    # predictions for steps 0..L2-2 against tokens 1..L2-1
    return loss_fn(y2[:,:-1,:].flatten(0, 1), x2[:,1:].flatten(0))

  def log_batch(loss, y2, x2):
    """Record loss/accuracy/P/R/F1 of one batch and refresh the progress bar."""
    t = x2[:,1:].flatten(0)
    p = y2[:,:-1,:].flatten(0, 1)
    losses.append(loss.item())
    idx = torch.nonzero(t).view(-1)  # non-pad target positions
    acc = torch.sum(torch.argmax(p, 1)[idx] == t[idx]).float()/idx.size()[0]
    accs.append(acc.item())
    ps = torch.argmax(y2[:,:-1,:], 2).detach().cpu().numpy()
    ts = x2[:,1:].detach().cpu().numpy()
    pre = rec = f1 = 0
    for sp, st in zip(ps, ts):
      pre, rec, f1 = _bag_prf(sp, st)
      pres.append(pre)
      recs.append(rec)
      f1s.append(f1)
    # The bar shows the batch loss/accuracy and the last sample's P/R/F1.
    pbar.set_description(f'{run_type} l: {loss.item():.2f}, a: {acc:.2f}, p: {pre:.2f}, r: {rec:.2f}, f1: {f1:.2f}')

  if run_type == 'train':
    model.train()
    for x1, x2 in pbar:
      opt.zero_grad()
      x1 = x1.to(device)
      x2 = x2.to(device)
      y2, attn = model(x1, x2)
      loss = teacher_loss(y2, x2)
      loss.backward()
      opt.step()
      log_batch(loss, y2, x2)
  else:
    model.eval()
    with torch.no_grad():
      for x1, x2 in pbar:
        x1 = x1.to(device)
        x2 = x2.to(device)
        if inference == 'forced':
          y2, attn = model(x1, x2)
        elif inference == 'greedy':
          # Start from the first target token and feed back the arg-max
          # prediction for as many steps as the reference has tokens.
          cx2 = x2[:,:1]
          for i in range(x2.size()[1]):
            y2, attn = model(x1, cx2)
            cy2 = y2[:,i,:]
            nx2 = torch.argmax(cy2, 1)
            nx2 = nx2.unsqueeze(1)
            cx2 = torch.cat((cx2, nx2), 1)
        elif inference == 'beam':
          # NOTE(review): experimental and unfinished. `y2` stays at a single
          # step here, so the scoring below does not line up with the targets
          # - confirm before relying on this mode. Debug prints are kept.
          i = 0
          cx2 = x2[:,:1]
          y2, attn = model(x1, cx2)
          cy2 = y2[:,i,:]
          nx2 = torch.argmax(cy2, 1)
          nx2 = nx2.unsqueeze(1)
          emb_size = y2.size()[2]
          batch_size = y2.size()[0]
          beam_size = 2
          beam_depth = 4
          print('before')
          print(cy2[:,0])
          print(torch.max(cy2, 1))
          top = torch.topk(cy2, beam_size)
          bx2 = top[1].view(beam_size*batch_size, -1)
          print(bx2)
          cxx2 = torch.cat((cx2.repeat(beam_size, 1), nx2.repeat(beam_size, 1), bx2), 1)
          for j in range(beam_depth):
            print(cxx2)
            yy2, attn = model(x1.repeat(beam_size, 1), cxx2)
            cyy2 = yy2[:,-1,:]
            nxx2 = torch.argmax(cyy2, 1)
            nxx2 = nxx2.unsqueeze(1)
            cxx2 = torch.cat((cxx2, nxx2), 1)
          # (An earlier draft accumulated per-token score sums / a Viterbi
          # style update here; it was disabled by the author.)
          print('after')
          print(cy2[:,0])
          print(torch.max(cy2, 1))
        else:
          raise ValueError(f'unknown inference mode: {inference!r}')
        loss = teacher_loss(y2, x2)
        log_batch(loss, y2, x2)
  m_loss = sum(losses)/len(losses)
  m_acc = sum(accs)/len(accs)
  m_pre = sum(pres)/len(pres)
  m_rec = sum(recs)/len(recs)
  m_f1 = sum(f1s)/len(f1s)
  # The optional reports below use the tensors of the last processed batch.
  if print_triples:
    dy2 = map(ds.decode, torch.argmax(y2[:,:-1,:], 2).detach().cpu().numpy())
    dx2 = map(ds.decode, x2[:,1:].detach().cpu().numpy())
    dx1 = map(ds.decode, x1[:,1:].detach().cpu().numpy())
    triples = list(zip(dx1, dx2, dy2))[:1]
    for triple in triples:
      print(triple[0])
      print(triple[1])
      print(triple[2])
  if plot_attn:
    plot_attention(attn.detach().cpu().numpy()[:1], 
                   x1.detach().cpu().numpy()[:1], 
                   x2.detach().cpu().numpy()[:1],
                   decoder_x1 = ds.decode,
                   decoder_x2 = ds.decode,
                   figsize=(10, 10),
                   shift=True,
                   mask=True,
                   suptitle=None)
  return m_loss, m_acc, m_pre, m_rec, m_f1


def objective(trial):
  """Optuna objective: train one EncoderDecoder configuration and return its loss.

  Samples the training and model configuration from ``trial`` (or takes them
  from ``checkpoint`` when one is set), seeds the RNGs, splits the global
  dataset ``ds`` 70/20/10 into train/test/validation subsets, trains for
  ``n_epoches`` epochs, saves a checkpoint after every epoch (and tries to copy
  it to Google Drive), evaluates on the test split and stores the final
  train/val/test metrics as user attributes of the trial.

  Args:
    trial: the optuna trial used to sample hyper-parameters, to report the
      per-epoch loss and to store the final metrics.

  Returns:
    The training loss of the last epoch.
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  mem_report()
  torch.cuda.empty_cache()
  mem_report()
  trial.set_user_attr('device', str(device))
  opt_fn_map = {
    'SGD': optim.SGD,
    'Adam': optim.Adam,
    'Adagrad': optim.Adagrad,
    'ASGD': optim.ASGD,
    'Adamax': optim.Adamax,
    'SparseAdam': optim.SparseAdam,
    'AdamW': optim.AdamW,
    'Adadelta': optim.Adadelta,
    'LBFGS': optim.LBFGS,
    'RMSprop': optim.RMSprop,
    'Rprop': optim.Rprop
  }
  checkpoint = None
  # fn = 'model.pth'
  fn = f'model_{trial.study.study_name}_{trial.number}.pth'
  print(fn)
  # checkpoint = torch.load('model.pth')
  if checkpoint is None:
    # Ranges with equal bounds pin a parameter while still recording it in the trial.
    train_config = {
      'seed': trial.suggest_int('seed', 0, 0),
      'n_epoches': trial.suggest_int('n_epoches', 5, 5),
      'lr': trial.suggest_loguniform('lr', 1e-4, 1e-2),
      # 'opt_fn': trial.suggest_categorical('opt_fn', ['SGD', 'Adam', 'Adagrad',
      #                                           'ASGD', 'Adamax',
      #                                           'AdamW', 'Adadelta',
      #                                           'RMSprop', 'Rprop']),
      'opt_fn': trial.suggest_categorical('opt_fn', ['Adam']),
      'batch_size': trial.suggest_int('batch_size', 300, 300),
      'weight_decay': trial.suggest_loguniform('weight_decay', 1e-6, 1e-6),
      'nx1': trial.suggest_int('nx1', 1, 2)
    }
    model_config = {
      'enc_num_embeddings': len(ds.stoi),
      'enc_embedding_dim': trial.suggest_int('enc_embedding_dim', 300, 300),
      'enc_padding_idx': 0,
      'dec_num_embeddings': len(ds.stoi),
      'dec_embedding_dim': trial.suggest_int('dec_embedding_dim', 300, 300),
      'dec_padding_idx': 0,
      'rnn_type': trial.suggest_categorical('rnn_type', ['RNN']),
      'hidden_size': trial.suggest_int('hidden_size', 300, 300),
      'num_layers': trial.suggest_int('num_layers', 1, 1),
      'bidirectional': trial.suggest_categorical('bidirectional', [False]),
      # 'bidirectional': trial.suggest_categorical('bidirectional', [True]),
      'enc_dropout': trial.suggest_uniform('enc_dropout', 0.0, 0.0),
      'dec_dropout': trial.suggest_uniform('dec_dropout', 0.0, 0.0),
      'enc_rnn_dropout': trial.suggest_uniform('enc_rnn_dropout', 0.0, 0.0),
      'dec_rnn_dropout': trial.suggest_uniform('dec_rnn_dropout', 0.0, 0.0),
      'attn_type': trial.suggest_categorical('attn_type', ['soft_dist']),
      # 'attn_type': trial.suggest_categorical('attn_type', ['dot', 'dist', 'soft_dot', 'soft_dist', 'cos', 'soft_cos', 'none']),
      'out_hidden': trial.suggest_int('out_hidden', 600, 600),
      'pack': trial.suggest_categorical('pack', [True]),
    }
  else:
    model_config = checkpoint['model_config']
    train_config = checkpoint['train_config']
  # train
  seed = train_config['seed']
  opt_fn = train_config['opt_fn']
  lr = train_config['lr']
  n_epoches = train_config['n_epoches']
  batch_size = train_config['batch_size']
  weight_decay = train_config['weight_decay']
  # Seed every RNG in use and force deterministic cuDNN kernels.
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  random.seed(seed)
  np.random.seed(seed)
  mem_report()
  model = EncoderDecoder(**model_config).to(device)
  if checkpoint is not None:
    model.load_state_dict(checkpoint['model_state_dict'])
  mem_report()
  # Rprop, LBFGS and SparseAdam do not accept a weight_decay argument.
  if opt_fn not in ['Rprop', 'LBFGS', 'SparseAdam']:
    opt = opt_fn_map[opt_fn](model.parameters(), lr=lr,
                             weight_decay=weight_decay)
  else:
    opt = opt_fn_map[opt_fn](model.parameters(), lr=lr)
  # Index 0 is the padding index (see enc_padding_idx / dec_padding_idx above).
  loss_fn = torch.nn.NLLLoss(ignore_index=0)
  trial.set_user_attr('n_samples', len(ds))
  train_len = int(0.7*len(ds))
  test_len = int(0.2*len(ds))
  val_len = len(ds) - train_len - test_len  # remainder, so the lengths sum to len(ds)
  lens = [train_len, test_len, val_len]
  # print(lens)
  train_ds, test_ds, val_ds = random_split(ds, lens)
  train_dl = DataLoader(train_ds, batch_size=batch_size, num_workers=1,
                        shuffle=True, drop_last=False,
                        collate_fn=Collate(train_config['nx1']))
  test_dl = DataLoader(test_ds, batch_size=batch_size, num_workers=1,
                       shuffle=True, drop_last=False,
                       collate_fn=Collate(train_config['nx1']))
  val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=1,
                      shuffle=True, drop_last=False,
                      collate_fn=Collate(train_config['nx1']))
  pprint(trial.params)
  pbar_epoch = tqdm(range(n_epoches))
  # Iterate the bar itself: wrapping it in a second tqdm() drew a duplicate bar.
  for i in pbar_epoch:
    # train
    train_scores = run('train', device, model, opt, loss_fn, train_dl, ds,
                       print_triples=True,
                       plot_attn=False)
    loss, acc, pre, rec, f1 = train_scores
    # validation
    val_scores = run('val', device, model, opt, loss_fn, val_dl, ds,
                     print_triples=True,
                     plot_attn=False)
    val_loss, val_acc, val_pre, val_rec, val_f1 = val_scores
    pbar_epoch.set_description(f'l: {loss:.2f}/{val_loss:.2f} a: {int(acc*100)}/{int(val_acc*100)} p: {int(pre*100)}/{int(val_pre*100)} r: {int(rec*100)}/{int(val_rec*100)} f1: {int(f1*100)}/{int(val_f1*100)}')
    trial.report(loss, step=i+1)
    torch.save({'epoch': i,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': opt.state_dict(),
                'loss': loss,
                'model_config': model_config,
                'train_config': train_config},
                fn)
    # Best-effort backup: a missing Drive mount must not abort the trial.
    try:
      copyfile(fn, '/content/drive/My Drive/' + fn)
    except Exception as e:
      print(e)
  # test
  test_scores = run('test', device, model, opt, loss_fn, test_dl, ds,
                    print_triples=True,
                    plot_attn=model_config['attn_type'] != 'none')
  test_loss, test_acc, test_pre, test_rec, test_f1 = test_scores
  print(f'           train\tval\ttest')
  print(f'     loss: {loss:.3f}\t{val_loss:.3f}\t{test_loss:.3f}')
  print(f' accuracy: {acc:.3f}\t{val_acc:.3f}\t{test_acc:.3f}')
  print(f'precision: {pre:.3f}\t{val_pre:.3f}\t{test_pre:.3f}')
  print(f'   recall: {rec:.3f}\t{val_rec:.3f}\t{test_rec:.3f}')
  print(f'       f1: {f1:.3f}\t{val_f1:.3f}\t{test_f1:.3f}')
  # Store the last-epoch train/val scores and the test scores on the trial as
  # '<split>_<metric>' user attributes (train_loss, ..., test_f1).
  metric_names = ('loss', 'acc', 'pre', 'rec', 'f1')
  final_scores = (('train', train_scores), ('val', val_scores), ('test', test_scores))
  for split, scores in final_scores:
    for metric, value in zip(metric_names, scores):
      trial.set_user_attr(f'{split}_{metric}', value)
  # NOTE(review): the study minimizes the *training* loss; val_loss may be the
  # intended target for hyper-parameter selection -- confirm before changing.
  return loss

# The study is stored in a local SQLite file; with load_if_exists=True an
# existing study of the same name in that file is reused.
fn = 'optuna.db'
study = optuna.create_study(
    storage=f'sqlite:///{fn}',
    study_name='1',
    direction='minimize',
    load_if_exists=True,
)
study.optimize(objective, n_trials=10)

# Best-effort backup of the study database to the Drive folder.
try:
  copyfile(fn, '/content/drive/My Drive/' + fn)
except Exception as err:
  print(err)

In [0]:
# try:
    
# except KeyboardInterrupt:
#   print(e)
# except Exception as e:
#   print(e)
# finally:
#   torch.save({'epoch': i,
#               'model_state_dict': model.state_dict(),
#               'optimizer_state_dict': opt.state_dict(),
#               'loss': loss,
#               'model_config': model_config,
#               'train_config': train_config},
#               'model_exception.pth')
#   try:
#     copyfile('model.pth', '/content/drive/My Drive/' + 'model.pth')
#   except Exception as e:
#     print(e)
# raise KeyboardInterrupt

In [0]:
from pprint import pprint
# Summarize the study: best hyper-parameters, best objective value, the best
# trial object and the optimization direction.
pprint(study.best_params)
pprint(study.best_value)
pprint(study.best_trial)
pprint(study.direction)
# Presumably enables Colab's interactive table rendering for the DataFrame
# shown below -- only available inside Google Colab.
%load_ext google.colab.data_table
# Last expression of the cell: one row per trial, displayed as the cell output.
study.trials_dataframe()

In [0]:
# Other optuna visualizations, currently disabled:
# optuna.visualization.plot_contour(study, params=['out_hidden', 'attn_type'])
# optuna.visualization.plot_optimization_history(study)
# optuna.visualization.plot_slice(study)
# optuna.visualization.plot_parallel_coordinate(study)  # BUG
# Plot the intermediate values of every trial, i.e. the per-epoch losses that
# `objective` passes to trial.report().
optuna.visualization.plot_intermediate_values(study)

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime can also be restored on a CPU-only one.
# NOTE(review): `objective` saves checkpoints as 'model_<study>_<trial>.pth';
# 'model.pth' has to be provided separately -- confirm which file is intended.
checkpoint = torch.load('model.pth', map_location=device)

# Rebuild the network from the configuration stored next to the weights.
model = EncoderDecoder(**checkpoint['model_config']).to(device)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [0]:
import torch

def inference(model, ds, s1, max_len=10):
  """Greedily decode an output sequence for the input string ``s1``.

  Starting from the ``<go>`` token, the model is called repeatedly on the
  encoded input and the tokens produced so far; the highest-scoring token at
  the last position is appended each time. Decoding stops when ``<eos>`` has
  been produced or the output holds ``max_len`` tokens (``<go>`` included).

  Args:
    model: callable torch module returning ``(y2, attn)`` for ``(x1, x2)``;
      it is switched to eval mode and left there.
    ds: object providing ``stoi``, ``encode`` and ``decode``.
    s1: input text passed to ``ds.encode``.
    max_len: maximum number of output tokens, ``<go>`` included.

  Returns:
    ``ds.decode`` applied to the generated token ids (starting with ``<go>``).
  """
  # Use the device the model lives on instead of a notebook-global `device`.
  device = next(model.parameters()).device
  eos = ds.stoi['<eos>']
  ex2 = [ds.stoi['<go>']]
  ex1 = ds.encode(s1)
  model.eval()
  cnt = 1
  with torch.no_grad():
    # The encoder input never changes, so build it once outside the loop.
    x1 = torch.tensor([ex1], device=device)
    while ex2[-1] != eos and cnt < max_len:
      x2 = torch.tensor([ex2], device=device)
      y2, attn = model(x1, x2)
      # Scores of the last decoder position (the batch holds a single sample).
      p = y2[:, -1, :].flatten(0, 1)
      ex2.append(torch.argmax(p).item())
      cnt += 1
  return ds.decode(ex2)

In [0]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Reference headline and the article sentence it is generated from.
title = 'Агент Жиго отреагировал на новость об интересе «Лиона» к футболисту «Спартака»'
text = 'Мохамед Беншенафи, агент защитника московского «Спартака» Самуэля Жиго, опроверг информацию об интересе к игроку со стороны французского футбольного клуба «Лион».'
# One string per sentence: lower-cased, alphanumeric tokens only, space-joined.
ss = [
    ' '.join(w.lower() for w in word_tokenize(s) if w.isalnum())
    for s in sent_tokenize(text)
]
# Generate a headline from the first sentence and show it next to the reference.
pred_title = inference(model, ds, ss[0])
print(ss)
print(title)
print(pred_title)

In [0]:
from pprint import pprint
import gensim.downloader as api
# Query the gensim downloader catalogue and list the names of the available models.
info = api.info()
pprint(info['models'].keys())
# Load the 'word2vec-ruscorpora-300' model through the downloader.
vec_model = api.load('word2vec-ruscorpora-300')

In [0]:
# https://github.com/akutuzov/webvectors/blob/master/preprocessing/rusvectores_tutorial.ipynb

# The word vectors are bound to `vec_model`; in this notebook `model` is only
# assigned the EncoderDecoder network and the UDPipe model, so inspecting it
# here looked at the wrong object.
# NOTE(review): `.vocab` is the gensim 3.x attribute -- confirm the installed version.
# print(vec_model.most_similar("кто"))
print(len(vec_model.vocab))
print(list(vec_model.vocab)[:100])
print(dir(vec_model))

In [0]:
import spacy

In [0]:
!pip install ufal.udpipe

https://universaldependencies.org/u/pos/

In [0]:
import sys
import os
# import wget
import re
from ufal.udpipe import Model, Pipeline

'''
Этот скрипт принимает на вход необработанный русский текст 
(одно предложение на строку или один абзац на строку).
Он токенизируется, лемматизируется и размечается по частям речи с использованием UDPipe.
На выход подаётся последовательность разделенных пробелами лемм с частями речи 
("зеленый_NOUN трамвай_NOUN").
Их можно непосредственно использовать в моделях с RusVectōrēs (https://rusvectores.org).
Примеры запуска:
echo 'Мама мыла раму.' | python3 rus_preprocessing_udpipe.py
zcat large_corpus.txt.gz | python3 rus_preprocessing_udpipe.py | gzip > processed_corpus.txt.gz
'''


def num_replace(word):
    """Return a string of 'x' characters as long as ``word``."""
    return 'x' * len(word)


def clean_token(token, misc):
    """
    Remove surrounding whitespace and all spaces from a token.

    :param token: token (string)
    :param misc: contents of the "MISC" field in CONLLU (string)
    :return: cleaned token (string); None when the token is exactly 'Файл'
             and MISC contains 'SpaceAfter=No'
    """
    if token == 'Файл' and 'SpaceAfter=No' in misc:
        return None
    return token.strip().replace(' ', '')


def clean_lemma(lemma, pos):
    """
    Normalize a lemma: strip it, drop spaces and underscores, lower-case it.

    For everything except punctuation, one leading and one trailing guillemet
    and then one trailing '!', '?', ',' or '.' are removed as well.

    :param lemma: lemma (string)
    :param pos: part-of-speech tag (string)
    :return: cleaned lemma (string); None if it contains '|' or ends with
             '.jpg' / '.png'
    """
    cleaned = lemma.strip().replace(' ', '').replace('_', '').lower()
    if '|' in cleaned or cleaned.endswith(('.jpg', '.png')):
        return None
    if pos == 'PUNCT':
        # Punctuation lemmas are kept as they are.
        return cleaned
    if cleaned.startswith(('«', '»')):
        cleaned = cleaned[1:]
    if cleaned.endswith(('«', '»')):
        cleaned = cleaned[:-1]
    if cleaned.endswith(('!', '?', ',', '.')):
        cleaned = cleaned[:-1]
    return cleaned


def list_replace(search, replacement, text):
    """Replace every element of ``search`` that occurs in ``text`` by ``replacement``.

    Which elements occur is decided once, on the incoming text, before any
    replacement is made.
    """
    present = tuple(filter(lambda sym: sym in text, search))
    for sym in present:
        text = text.replace(sym, replacement)
    return text


def unify_sym(text):  # takes a unicode string
    """Unify typographic variants in ``text`` and drop characters outside a whitelist.

    Quote variants become '"', dash variants become '--' padded with U+2003,
    hyphen variants become '-', a set of space-like characters becomes U+2002,
    bullet/dot-like characters become '.', the ellipsis becomes '...', and a
    few accented Latin letters lose their diacritics. Finally every character
    that is neither in the currency list nor in the alphabet below is removed.

    :param text: input string
    :return: cleaned string
    """
    text = list_replace \
        ('\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019', '\u0022', text)

    text = list_replace \
        ('\u2012\u2013\u2014\u2015\u203E\u0305\u00AF', '\u2003\u002D\u002D\u2003', text)

    text = list_replace('\u2010\u2011', '\u002D', text)

    text = list_replace \
            (
            '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
            '\u2002', text)

    # Collapse doubled em spaces and doubled tabs (single pass).
    text = re.sub('\u2003\u2003', '\u2003', text)
    text = re.sub('\t\t', '\t', text)

    text = list_replace \
            (
            '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
            '.', text)

    text = list_replace('\u2217', '\u002A', text)

    text = list_replace('…', '...', text)

    text = list_replace('\u2241\u224B\u2E2F\u0483', '\u223D', text)

    # Latin letters with diacritics -> plain Latin letters.
    latin_pairs = (
        ('\u00C4', 'A'), ('\u00E4', 'a'),
        ('\u00CB', 'E'), ('\u00EB', 'e'),
        ('\u1E26', 'H'), ('\u1E27', 'h'),
        ('\u00CF', 'I'), ('\u00EF', 'i'),
        ('\u00D6', 'O'), ('\u00F6', 'o'),
        ('\u00DC', 'U'), ('\u00FC', 'u'),
        ('\u0178', 'Y'), ('\u00FF', 'y'),
        ('\u00DF', 's'), ('\u1E9E', 'S'),
    )
    for accented, plain in latin_pairs:
        text = list_replace(accented, plain, text)

    currencies = list \
            (
            '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
        )

    # The backslash is written as '\\': the former '\|' was an invalid escape
    # sequence (same string value, but deprecated syntax).
    alphabet = list \
            (
            '\t\n\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯ,.[]{}()=+-−*&^%$#@!?~;:0123456789§/\\|"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')

    alphabet.append("'")

    allowed = set(currencies + alphabet)

    # Keep only whitelisted characters.
    cleaned_text = ''.join(sym for sym in text if sym in allowed)

    return cleaned_text


def process(pipeline, text='Строка', keep_pos=True, keep_punct=False):
    """Lemmatize and POS-tag ``text`` and return a list of 'lemma_POS' strings.

    ``pipeline.process(text)`` must return CoNLL-U text. Consecutive proper
    nouns (PROPN) that share Case and Number are merged into one token whose
    lemmas are joined by '::'. Numerals made of digits are replaced by a run of
    'x' of the same length.

    :param pipeline: object whose ``process(text)`` returns a CoNLL-U string
    :param text: raw text to process
    :param keep_pos: if False, only the lemmas are returned (no '_POS' suffix)
    :param keep_punct: if False (default), punctuation tokens are removed
    :return: list of strings
    """
    entities = {'PROPN'}
    named = False       # True while a proper-noun group is being collected
    memory = []         # lemmas of the proper-noun group collected so far
    mem_case = None
    mem_number = None
    tagged_propn = []

    # Process the text; the result is in CoNLL-U format.
    processed = pipeline.process(text)

    # Skip the lines carrying service information.
    content = [l for l in processed.split('\n') if not l.startswith('#')]

    # Split every remaining line into its tab-separated CoNLL-U fields.
    tagged = [w.split('\t') for w in content if w]

    for t in tagged:
        if len(t) != 10:
            continue
        (word_id, token, lemma, pos, xpos, feats, head, deprel, deps, misc) = t
        token = clean_token(token, misc)
        lemma = clean_lemma(lemma, pos)
        if not lemma or not token:
            continue
        if pos in entities:
            if '|' not in feats:
                tagged_propn.append('%s_%s' % (lemma, pos))
                continue
            morph = {el.split('=')[0]: el.split('=')[1] for el in feats.split('|')}
            if 'Case' not in morph or 'Number' not in morph:
                tagged_propn.append('%s_%s' % (lemma, pos))
                continue
            if not named:
                # First proper noun of a group: remember its case and number.
                named = True
                mem_case = morph['Case']
                mem_number = morph['Number']
            if morph['Case'] == mem_case and morph['Number'] == mem_number:
                memory.append(lemma)
                # A line break after the token ends the group. '\\s' spells the
                # same characters as the former invalid escape '\s'.
                if 'SpacesAfter=\\n' in misc or 'SpacesAfter=\\s\\n' in misc:
                    named = False
                    past_lemma = '::'.join(memory)
                    memory = []
                    tagged_propn.append(past_lemma + '_PROPN')
            else:
                # Case/number mismatch: emit the collected group, then this
                # proper noun on its own.
                named = False
                past_lemma = '::'.join(memory)
                memory = []
                tagged_propn.append(past_lemma + '_PROPN')
                tagged_propn.append('%s_%s' % (lemma, pos))
        else:
            if not named:
                if pos == 'NUM' and token.isdigit():  # replace numbers by xxxxx of the same length
                    lemma = num_replace(token)
                tagged_propn.append('%s_%s' % (lemma, pos))
            else:
                # A non-proper-noun token ends the group collected so far.
                named = False
                past_lemma = '::'.join(memory)
                memory = []
                tagged_propn.append(past_lemma + '_PROPN')
                tagged_propn.append('%s_%s' % (lemma, pos))

    # Emit a proper-noun group that is still pending when the input ends;
    # previously such trailing names were silently dropped.
    if memory:
        tagged_propn.append('::'.join(memory) + '_PROPN')

    if not keep_punct:
        tagged_propn = [word for word in tagged_propn if word.split('_')[1] != 'PUNCT']
    if not keep_pos:
        tagged_propn = [word.split('_')[0] for word in tagged_propn]
    return tagged_propn

import requests
import os 

# URL of the UDPipe model
udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
udpipe_filename = udpipe_model_url.split('/')[-1]

# Download the model only when it is not already present in the working directory.
if not os.path.isfile(udpipe_filename):
    print('UDPipe model not found. Downloading...', file=sys.stderr)
    r = requests.get(udpipe_model_url)
    # Fail on an HTTP error instead of saving the error page as the model file.
    r.raise_for_status()
    with open(udpipe_filename, 'wb') as f:
      f.write(r.content)

print('\nLoading the model...', file=sys.stderr)
# NOTE(review): this rebinds the notebook-level name `model`, which other cells
# use for the EncoderDecoder network -- mind the execution order.
model = Model.load(udpipe_filename)
if model is None:
    # Model.load signals a failed load by returning None.
    raise RuntimeError('Cannot load UDPipe model from ' + udpipe_filename)
process_pipeline = Pipeline(model, 'tokenize',
                            Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

# print('Processing input...', file=sys.stderr)
# for line in sys.stdin:
#     res = unify_sym(line.strip())
#     output = process(process_pipeline, text=res)
#     print(' '.join(output))

In [0]:
# from torch.utils.data import Dataset, IterableDataset, DataLoader, random_split
# import json

# from tqdm.notebook import tqdm

# news = []
# n_news = 1000
# with gzip.open(comp_fn, 'rb') as gz_file:
#   for _ in tqdm(range(n_news)):
#     news.append(json.loads(next(gz_file)))
# print(news[0])
# print(len(news))

# from collections import Counter

# words = Counter()
# for n in news:
#   words.update(w for s in n['text'] for w in s.split())
#   words.update(w for s in n['title'] for w in s.split())
# print(words.most_common(10))
# print(len(words))

# from torchtext.vocab import Vocab, SubwordVocab
# min_freq = 0
# max_size = 40000
# voc = Vocab(counter=words, max_size=max_size, specials=['<pad>', '<unk>', '<go>', '<eos>'])
# # voc = SubwordVocab(counter=words, max_size=max_size, specials=['<pad>', '<unk>', '<go>', '<eos>'])
# print(len(voc))
# print(dir(voc))
# print(voc.stoi)

# class RiaDataset(IterableDataset):
#   def __init__(self, path, start, end, vocab):
#     super(RiaDataset).__init__()
#     self.path = path
#     self.start = start
#     self.end = end
#     self.vocab = vocab

#   def __len__(self):
#     return self.end - self.start

#   def __iter__(self):
#     worker_info = torch.utils.data.get_worker_info()
#     # print(worker_info.id, id(self), self.start, self.end)
#     with open(self.path) as f:
#       for _ in range(self.start):
#         next(f)
#       for _ in range(self.start, self.end):
#         yield self.preprocess(next(f))

#   def preprocess(self, data):
#     data = json.loads(data)
#     enc_text = [[self.vocab.stoi.get(w, 1) for w in s.split()] for s in data['text']]
#     enc_title = [[self.vocab.stoi.get(w, 1) for w in s.split()] for s in data['title']]
#     # dec_text = [[self.vocab.itos[w] for w in s] for s in enc_text]
#     # enc_text = [[w for w in s.split()] for s in data['text']]
#     # enc_text = [w for s in data['text'] for w in s.split()]
#     # enc_title = [w for s in data['title'] for w in s.split()]
#     enc_title = [self.vocab.stoi.get(w, 1) for s in data['title'] for w in s.split()]
#     enc_text = [self.vocab.stoi.get(w, 1) for s in data['text'] for w in s.split()][:len(enc_title)*3]
#     return [enc_text, enc_title]

# def worker_init_fn(worker_id):
#   worker_info = torch.utils.data.get_worker_info()
#   s, e = worker_info.dataset.start, worker_info.dataset.end
#   n = worker_info.num_workers
#   q, r = (e - s) // n, (e - s) % n
#   s = s + worker_id * q
#   e = s + q if worker_id < n - 1 else s + q + r
#   worker_info.dataset.start, worker_info.dataset.end = s, e

# def collate_fn(batch):
#   bx1, bx2 = [], []
#   for b in batch:
#     bx1.append(torch.tensor(b[0]))
#     bx2.append(torch.tensor(b[1]))
#   bx1 = torch.nn.utils.rnn.pad_sequence(bx1, batch_first=True)
#   bx2 = torch.nn.utils.rnn.pad_sequence(bx2, batch_first=True)
#   batch = [bx1, bx2]
#   return batch

# ds = RiaDataset('norm_sents_ria_1k.json', 0, 100, voc)
# dl = DataLoader(ds, batch_size=3, num_workers=1, shuffle=False, drop_last=False, 
#                                  collate_fn=collate_fn, worker_init_fn=worker_init_fn)
# for b in tqdm(dl):
#   print(b[0].size(), b[1].size())
#   print(len(b))
# # train_len = int(0.7*len(ds))
# # test_len = int(0.2*len(ds))
# # val_len = len(ds) - train_len - test_len
# # lens = [train_len, test_len, val_len]
# # print(lens)
# # train_ds, test_ds, val_ds = random_split(ds, lens)
# # dl = DataLoader(train_ds, batch_size=2, num_workers=1, shuffle=False, drop_last=False, 
# #                                  collate_fn=collate_fn, worker_init_fn=worker_init_fn)
# # # print(dir(train_ds))
# # # print(train_ds.indices)
# # for b in tqdm(dl):
# #   print(b[0].size(), b[1].size())
# #   print(len(b))

http://wiki.nlpl.eu/index.php/Home

In [0]:
import zipfile
import os

# `import gensim.downloader as api` binds only `api`; the bare name `gensim`
# used below needs its own import, and `requests` is imported here so the cell
# does not depend on another cell having run first.
import gensim
import requests

model_url = 'http://vectors.nlpl.eu/repository/20/184.zip'  # Russian News
fn = os.path.basename(model_url)
# Download the archive only when it is not already in the working directory.
if not os.path.exists(fn):
  r = requests.get(model_url)
  # Fail on an HTTP error instead of saving the error page as the archive.
  r.raise_for_status()
  with open(fn, 'wb') as f:
    f.write(r.content)

# Read the binary word2vec file straight from the archive; both the archive
# and the member stream are closed when the blocks exit.
with zipfile.ZipFile(fn) as f:
  with f.open('model.bin') as stream:
    vec_model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

In [0]:
# Load the 'word2vec-ruscorpora-300' vectors through `api` (gensim.downloader)
# and bind them to `vec_model`.
vec_model = api.load('word2vec-ruscorpora-300')

In [0]:
# Tag one sentence with the UDPipe pipeline and show the input and the result.
# NOTE(review): in the visible cells `news` is only built inside commented-out
# code; it presumably has to come from an earlier cell -- confirm before a
# fresh Restart & Run All.
text = news[0]['text'][1]
print(text)
# `tagged` is the list of 'lemma_POS' strings returned by process().
tagged = process(process_pipeline, text)
print(tagged)

In [0]:
vec_model = api.load('word2vec-ruscorpora-300')
# Shapes of the vectors of those tagged tokens that the embedding model knows.
vs = [vec_model[x].shape for x in tagged if x in vec_model]
# Show the vector of the first token; guard the lookup so that an empty token
# list or an out-of-vocabulary first token does not abort the cell.
if tagged and tagged[0] in vec_model:
  print(vec_model[tagged[0]])
else:
  print('first token is missing or out of vocabulary')
# Number of tagged tokens vs. number of tokens covered by the vocabulary.
print(len(tagged), len(vs))

In [0]:
import multiprocessing
import os

# Number of CPUs in the machine.
print(multiprocessing.cpu_count())
# Number of CPUs this process is allowed to run on. os.sched_getaffinity does
# not exist on every platform, so fall back to the plain CPU count there.
if hasattr(os, 'sched_getaffinity'):
  print(len(os.sched_getaffinity(0)))
else:
  print(os.cpu_count())

http://vectors.nlpl.eu/repository/