In [0]:
from google.colab import drive
import pandas as pd

drive.mount('/content/gdrive')
!unzip "/content/gdrive/My Drive/Colab Notebooks/corpus.zip"

df2 = pd.read_csv('corpus.csv')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
Archive:  /content/gdrive/My Drive/Colab Notebooks/corpus.zip
  inflating: corpus.csv              


In [0]:
# Drop the 'story', 'url' and 'descr' columns, then remove rows that are
# exact duplicates of an earlier row.
df = df2.drop(columns=['story', 'url', 'descr']).drop_duplicates()

In [0]:
# Length of the longest title, in characters; every sequence is padded to it.
max_length = df['title'].map(len).max()

# Target sequence: the title shifted left by one character and closed with the
# end marker "︱", then right-padded with "〜" up to max_length.
# It is built from the still-unpadded title, before that column is overwritten.
df['y_title'] = (df['title'].str[1:] + "︱").str.ljust(max_length, "〜")
# Input sequence: the title itself, right-padded with "〜" up to max_length.
df['title'] = df['title'].str.ljust(max_length, "〜")

In [0]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

# Concatenate every row of the 'title' / 'y_title' columns into one long string each.
text=df['title'].str.cat()
text_y=df['y_title'].str.cat()

# Vocabulary: every distinct character of the inputs plus the end marker "︱".
# Sorting pins the character -> index mapping; iterating a bare set of strings
# can change order between interpreter sessions, which would silently make
# previously trained weights and encoded arrays inconsistent with the vocabulary.
chars = tuple(sorted(set(text+'︱')))
vocab = dict(enumerate(chars))                  # index -> character
to_int = {ch: ii for ii, ch in vocab.items()}   # character -> index

# Integer-encoded input and target streams, one entry per character.
encoded_x = np.array([to_int[ch] for ch in text])
encoded_y = np.array([to_int[ch] for ch in text_y])

In [0]:
def one_hot(arr, n_labels):
  """One-hot encode an integer array along a new trailing axis.

  Args:
    arr: integer array (or array-like) of any shape whose values are label
      indices in the range [0, n_labels).
    n_labels: size of the one-hot axis.

  Returns:
    A float32 NumPy array of shape (*arr.shape, n_labels) holding 1.0 at the
    position given by each label and 0.0 everywhere else.

  Raises:
    IndexError: if a label falls outside the one-hot axis.
  """
  arr = np.asarray(arr)
  # Work on the flattened labels so the input may have any number of
  # dimensions, not just two.
  flat = arr.reshape(-1)
  encoded = np.zeros((flat.size, n_labels), dtype=np.float32)
  encoded[np.arange(flat.size), flat] = 1.
  # Restore the original layout and append the one-hot axis.
  return encoded.reshape((*arr.shape, n_labels))

In [0]:
def batches(arr, arr_y, batch_size, seq_length=None):
  """Yield aligned (input, target) mini-batches cut from two flat arrays.

  Both arrays are truncated to a whole number of batches, reshaped to
  `batch_size` rows, and then sliced column-wise into consecutive windows of
  `seq_length` steps.

  Args:
    arr: 1-D array of encoded inputs.
    arr_y: 1-D array of encoded targets, indexed in step with `arr`.
    batch_size: number of rows (sequences) per batch.
    seq_length: number of steps per sequence. When omitted, the notebook-level
      `max_length` is used, which keeps existing three-argument calls working.

  Yields:
    Tuples `(x, y)` of arrays with shape (batch_size, seq_length).
    Nothing is yielded when the data does not fill a single batch.
  """
  if seq_length is None:
    seq_length = max_length

  batch_size_total = batch_size * seq_length
  n_batches = len(arr) // batch_size_total
  if n_batches == 0:
    # Reshaping an empty array with an inferred dimension raises; an empty
    # generator is the clearer result for "not enough data".
    return

  # Keep only enough characters to make full batches
  usable = n_batches * batch_size_total
  arr = arr[:usable].reshape((batch_size, -1))
  arr_y = arr_y[:usable].reshape((batch_size, -1))

  for n in range(0, arr.shape[1], seq_length):
    yield arr[:, n:n+seq_length], arr_y[:, n:n+seq_length]

In [0]:
class LSTM(nn.Module):
  """Character-level language model: stacked LSTM, dropout, linear read-out.

  Attributes:
    chars: the token sequence passed in; its order defines the indices.
    vocab: index -> token mapping built from `chars`.
    to_int: token -> index mapping (inverse of `vocab`).
    n_hidden, n_layers, drop_prob, lr: stored constructor arguments.
  """

  def __init__(self, tokens, n_hidden, n_layers, drop_prob=0.5, lr=0.001):
    """Build the network.

    Args:
      tokens: sequence of distinct tokens; its length is both the one-hot
        input width and the number of output classes.
      n_hidden: hidden size of each LSTM layer.
      n_layers: number of stacked LSTM layers.
      drop_prob: dropout probability used between LSTM layers and on the
        LSTM output. Defaults to 0.5.
      lr: learning rate stored on the module for reference; it is not used
        inside this class. Defaults to 0.001.
    """
    super().__init__()
    self.drop_prob = drop_prob
    self.n_layers = n_layers
    self.n_hidden = n_hidden
    self.lr = lr
    self.chars = tokens
    self.vocab = dict(enumerate(self.chars))
    self.to_int = {ch: ii for ii, ch in self.vocab.items()}

    n_tokens = len(self.chars)
    self.lstm = nn.LSTM(n_tokens, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
    self.dropout = nn.Dropout(drop_prob)
    self.fc = nn.Linear(n_hidden, n_tokens)

  def forward(self, x, hidden):
    """Run one forward pass.

    Args:
      x: input passed to the batch-first LSTM; its last dimension must equal
        len(self.chars).
      hidden: (h, c) state tuple, e.g. from `hidden_state`.

    Returns:
      (out, hidden): `out` holds one row of unnormalised class scores per
      input step, with the batch and step dimensions flattened together;
      `hidden` is the updated LSTM state.
    """
    # get the outputs and the new hidden state from the lstm
    r_output, hidden = self.lstm(x, hidden)
    out = self.dropout(r_output)
    # Flatten (batch, step) so the linear layer scores every step at once.
    out = out.contiguous().view(-1, self.n_hidden)
    out = self.fc(out)
    return out, hidden

  def hidden_state(self, batch_size):
    """Return a zero-initialised (h, c) state for `batch_size` sequences.

    The tensors are created with the dtype and device of the model's own
    parameters, so the state matches the model wherever it currently lives.
    """
    weight = next(self.parameters())
    shape = (self.n_layers, batch_size, self.n_hidden)
    return (weight.new_zeros(shape), weight.new_zeros(shape))

In [0]:
def train(net, x_data, y_data, epochs, batch_size, max_length, lr, clip=5, val_percentage=0.1, print_every=100):
  """Train `net` with Adam and cross-entropy, reporting validation loss periodically.

  Args:
    net: model exposing `chars`, `hidden_state(batch_size)` and a forward
      pass `net(inputs, hidden) -> (scores, hidden)`.
    x_data: 1-D NumPy array of encoded input tokens.
    y_data: 1-D NumPy array of encoded target tokens, aligned with `x_data`.
    epochs: number of passes over the training part of the data.
    batch_size: number of sequences per mini-batch.
    max_length: number of steps per sequence; used to align the
      train/validation split to a sequence boundary.
    lr: Adam learning rate.
    clip: maximum gradient norm applied before each optimizer step; pass
      None to disable clipping.
    val_percentage: fraction of the data (taken from the end) held out for
      validation.
    print_every: report training and validation loss every this many steps.

  Returns:
    None. Progress is printed; `net` is updated in place.
  """
  # Use the GPU when one is present; otherwise stay on the CPU.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  # Move the model before creating the optimizer, as the torch.optim docs advise.
  net.to(device)
  net.train()
  opt = torch.optim.Adam(net.parameters(), lr=lr)
  loss_obj = nn.CrossEntropyLoss()

  val_ids = int(len(x_data)*(1-val_percentage))
  # Round the split point down to a whole number of sequences so the
  # validation part does not begin in the middle of one.
  val_ids -= val_ids % max_length
  x_data, val_x = x_data[:val_ids], x_data[val_ids:]
  y_data, val_y = y_data[:val_ids], y_data[val_ids:]

  counter = 0
  n_chars = len(net.chars)
  for e in range(epochs):
    h = net.hidden_state(batch_size)

    for x, y in batches(x_data, y_data, batch_size):
      counter += 1

      inputs = torch.from_numpy(one_hot(x, n_chars)).to(device)
      targets = torch.from_numpy(y).to(device)
      # Cut the graph so backprop does not reach into earlier batches.
      h = tuple(each.detach() for each in h)
      net.zero_grad()

      output, h = net(inputs, h)

      # reshape (not view): the target batch is a strided slice and is not
      # guaranteed to be contiguous.
      loss = loss_obj(output, targets.reshape(-1).long())
      loss.backward()
      if clip is not None:
        # Guard against exploding gradients in the recurrent layers.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
      opt.step()

      if counter % print_every == 0:
        val_h = net.hidden_state(batch_size)
        val_losses = []
        net.eval()
        # No gradients are needed while scoring the held-out data.
        with torch.no_grad():
          for val_in, val_out in batches(val_x, val_y, batch_size):
            val_inputs = torch.from_numpy(one_hot(val_in, n_chars)).to(device)
            val_targets = torch.from_numpy(val_out).to(device)
            val_h = tuple(each.detach() for each in val_h)
            val_output, val_h = net(val_inputs, val_h)
            val_loss = loss_obj(val_output, val_targets.reshape(-1).long())
            val_losses.append(val_loss.item())

        net.train()

        # An empty validation set has no mean; report NaN rather than warn.
        mean_val_loss = np.mean(val_losses) if val_losses else float('nan')
        print("Epoch: {}/{}...".format(e+1, epochs),
              "Step: {}...".format(counter),
              "Loss: {:.4f}...".format(loss.item()),
              "Val Loss: {:.4f}".format(mean_val_loss))
In [0]:
n_hidden=512
n_layers=2

# Dropout and learning rate are passed explicitly; 0.5 matches the dropout
# shown in the printed model summary.
net = LSTM(chars, n_hidden, n_layers, drop_prob=0.5, lr=0.001)
print(net)

batch_size = 128
n_epochs = 20
# Every training hyper-parameter is spelled out at the call site so the run
# configuration is visible here; clip=5 and val_percentage=0.1 are the values
# chosen for this run.
train(net, encoded_x, encoded_y, epochs=n_epochs, batch_size=batch_size, max_length=max_length, lr=0.001, clip=5, val_percentage=0.1, print_every=100)

LSTM(
  (lstm): LSTM(181, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=512, out_features=181, bias=True)
)
Epoch: 1/20... Step: 100... Loss: 1.9232... Val Loss: 1.3955
Epoch: 1/20... Step: 200... Loss: 1.8015... Val Loss: 1.2957
Epoch: 1/20... Step: 300... Loss: 1.5018... Val Loss: 1.0855
Epoch: 1/20... Step: 400... Loss: 1.3636... Val Loss: 1.0310
Epoch: 1/20... Step: 500... Loss: 1.3304... Val Loss: 0.9899
Epoch: 1/20... Step: 600... Loss: 1.2822... Val Loss: 0.9522
Epoch: 2/20... Step: 700... Loss: 1.2721... Val Loss: 0.9316
Epoch: 2/20... Step: 800... Loss: 1.2276... Val Loss: 0.8927
Epoch: 2/20... Step: 900... Loss: 1.2046... Val Loss: 0.8655
Epoch: 2/20... Step: 1000... Loss: 1.0590... Val Loss: 0.8388
Epoch: 2/20... Step: 1100... Loss: 1.1144... Val Loss: 0.8214
Epoch: 2/20... Step: 1200... Loss: 1.0601... Val Loss: 0.8016
Epoch: 2/20... Step: 1300... Loss: 1.0663... Val Loss: 0.7802
Epoch: 3/20... Step: 1400... Loss: 1

In [0]:
def predict(net, char, h=None, top_k=5):
  """Feed one character to the model and sample the next one.

  Args:
    net: model exposing `to_int`, `vocab`, `chars`, `hidden_state` and a
      forward pass `net(inputs, hidden) -> (scores, hidden)`.
    char: the input character; must be a key of `net.to_int`.
    h: (h, c) hidden state from a previous step. When None, a fresh zero
      state for a single sequence is created.
    top_k: number of most probable candidates to sample from. Capped at the
      vocabulary size.

  Returns:
    (next_char, h): the sampled character and the updated hidden state.

  Raises:
    KeyError: if `char` is not in the model's vocabulary.
  """
  # Run on whichever device the model's parameters live on.
  device = next(net.parameters()).device
  if h is None:
    h = net.hidden_state(1)

  x = one_hot(np.array([[net.to_int[char]]]), len(net.chars))
  inputs = torch.from_numpy(x).to(device)

  # Inference only: no autograd bookkeeping is needed.
  with torch.no_grad():
    h = tuple(each.detach() for each in h)
    out, h = net(inputs, h)

  # get the character probabilities
  p = F.softmax(out, dim=1).cpu()

  k = min(top_k, p.shape[1])
  p, top_ch = p.topk(k)
  # reshape(-1) keeps both arrays 1-D even when k == 1.
  top_ch = top_ch.numpy().reshape(-1)
  p = p.numpy().reshape(-1)

  # select the next character among the top-k, weighted by renormalised probability
  next_idx = np.random.choice(top_ch, p=p/p.sum())

  return net.vocab[int(next_idx)], h

In [0]:
def make_text(net, start='Судь', max_chars=1000):
  """Generate text from `net`, primed with `start`, until the end marker "︱".

  Args:
    net: trained model accepted by `predict`.
    start: non-empty priming string; every character must be in the model's
      vocabulary.
    max_chars: upper bound on the length of the returned string, so that a
      model which never emits the end marker cannot loop forever. Pass None
      for no limit.

  Returns:
    The priming string followed by the generated characters. The end marker
    is included when it was produced before the length limit was reached.

  Raises:
    ValueError: if `start` is empty.
  """
  if not start:
    raise ValueError("start must contain at least one character")

  # Use the GPU when one is available; otherwise leave the model where it is.
  if torch.cuda.is_available():
    net.cuda()
  net.eval()

  chars = list(start)
  h = net.hidden_state(1)
  # Prime the hidden state with the prefix; only the prediction made after
  # its last character is kept.
  for ch in start:
    char, h = predict(net, ch, h)

  chars.append(char)

  while char != "︱" and (max_chars is None or len(chars) < max_chars):
    char, h = predict(net, chars[-1], h)
    chars.append(char)

  return ''.join(chars)

In [0]:
# Sample one headline primed with the prefix 'Судь' and print it.
print(make_text(net=net, start='Судь'))

Судья и еще 3 причины проблемы с «Барселоной»︱


In [0]:
# Sample one headline primed with the prefix 'Роналд' and print it.
print(make_text(net=net, start='Роналд'))

Роналду признан лучшим игроком месяца в НХЛ︱


In [0]:
# Sample one headline primed with the prefix 'Месс' and print it.
print(make_text(net=net, start='Месс'))

Месси и Карлос Тевес подписал контракт с «Барселоной»︱


In [0]:
# Sample one headline primed with the prefix 'Барселон' and print it.
print(make_text(net=net, start='Барселон'))

Барселона» проиграла «Вальядолидо»︱


In [0]:
# Sample one headline primed with the single letter 'Ф' and print it.
print(make_text(net=net, start='Ф'))

Федерер обыграл Давида Феррера и вышел в четвертый круг︱


In [0]:
# Sample one headline primed with the prefix 'Хокке' and print it.
print(make_text(net=net, start='Хокке'))

Хоккейный полуфинал Кубка Стэнли в последнем матче подряд︱


In [0]:
# Sample one headline primed with the single letter 'В' and print it.
print(make_text(net=net, start='В'))

Вилья Кербер выиграла спринт в Майами︱
