In [2]:
# import stuff
from io import open
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Every tensor and module created later in the notebook is moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
# Build the RuNNE dataset with its builder class and load all splits.
# NOTE(review): `RuNNE` looks like a local dataset script rather than an
# installed package -- confirm it is on the import path before running.
from RuNNE import RuNNEBuilder
builder = RuNNEBuilder()
# Downloads/generates the splits (the cell output reports train, test and dev).
builder.download_and_prepare()
# `dataset` is indexed by split name further down ('train', 'test', 'dev').
dataset = builder.as_dataset()

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

In [4]:
# Entity type names, one per line in ent_types.txt.
#
# 'NONE' is inserted at index 0 *before* the lookup tables are built, so id 0
# is reserved for "no entity". Id 0 is what preprocess_function uses to pad the
# label sequence and what evaluate treats as the stop symbol; building the
# tables first (as before) gave the first real entity type id 0 as well, making
# it indistinguishable from padding, and left the highest output index without
# an entry in id2class.
with open('ent_types.txt', encoding='utf-8') as file:
    classes = ['NONE'] + [line.rstrip() for line in file]

# Name -> id and id -> name tables over the full class list (including 'NONE').
class2id = {class_: idx for idx, class_ in enumerate(classes)}
id2class = {idx: class_ for class_, idx in class2id.items()}

# Size of the classifier output layer: all entity types plus 'NONE'.
NUM_CLASSES = len(classes)

In [5]:
from transformers import RobertaTokenizerFast

# Pretrained Russian RoBERTa tokenizer. Only its token ids are used in this
# notebook; the encoder below learns its own embedding table.
tokenizer = RobertaTokenizerFast.from_pretrained('blinoff/roberta-base-russian-v0', max_len=512)

# Vocabulary size; passed to EncoderRNN as the embedding table size.
NUM_TOKENS = len(tokenizer.get_vocab())

In [5]:
# Raw dataset splits.
# NOTE: the name `train` is reused later in the notebook for the training
# function, and `test` is rebound to the dev split before inference, so this
# cell must run before those cells when executing top to bottom.
train = dataset['train']
test = dataset['test']
dev = dataset['dev']

# Token-id sequences are padded/truncated to this many ids.
MAX_LENGTH = 512
# Label sequences are padded/truncated to this many entries; it is also the
# number of decoding steps both decoders run.
MAX_LABELS = 128

def pad_or_truncate(some_list, target_len, default=0):
    """Return a copy of `some_list` that is exactly `target_len` items long.

    Longer lists are cut after `target_len` items; shorter lists are extended
    on the right with `default`.
    """
    kept = some_list[:target_len]
    # A negative count multiplies to an empty list, so over-long inputs get no filler.
    missing = target_len - len(some_list)
    filler = [default] * missing
    return kept + filler

def preprocess_function(example):
    """Turn one raw example into fixed-size model inputs and targets.

    Reads `example['text']` and `example['entities']`; each entity string is
    split on whitespace into start offset, end offset and class name.

    Returns a dict with:
      * 'tokens'  - tokenizer ids, padded with 0 / truncated to MAX_LENGTH;
      * 'bounds'  - (start, end) pairs, each offset divided by len(text);
      * 'classes' - class ids looked up in class2id.
    Bounds and classes are sorted by (start, end, id) and padded/truncated to
    MAX_LABELS; padding slots are (-1, -1) with class id 0.
    """
    text = example['text']
    token_ids = tokenizer(text)['input_ids']

    parsed = [raw_label.split() for raw_label in example['entities']]
    spans = []
    for fields in parsed:
        # Offsets are stored as fractions of the text length.
        rel_start = int(fields[0]) / len(text)
        rel_end = int(fields[1]) / len(text)
        spans.append((rel_start, rel_end, class2id[fields[2]]))
    spans.sort()

    token_ids = pad_or_truncate(token_ids, MAX_LENGTH)
    spans = pad_or_truncate(spans, MAX_LABELS, (-1, -1, 0))

    return {
        'tokens': token_ids,
        'bounds': [(span[0], span[1]) for span in spans],
        'classes': [span[2] for span in spans],
    }
    
def onehot(example):
    """Replace the integer class ids of an example with one-hot rows of width NUM_CLASSES."""
    encoded = F.one_hot(example['classes'], NUM_CLASSES)
    return {'classes': encoded}
    
    
    
# Preprocess the train and test splits one example at a time
# (preprocess_function handles a single example, hence batched=False) and
# expose the resulting columns as torch tensors.
train_processed = train.map(preprocess_function, batched=False).with_format("torch")
test_processed = test.map(preprocess_function, batched=False).with_format("torch")

# The dev split is not preprocessed here, and one-hot encoding of the class
# ids is disabled; the alternatives are kept for reference:
#dev_processed = dev.map(preprocess_function, batched=True)
#train_processed = train_processed.map(onehot, batched=False)
#test_processed = test_processed.map(onehot, batched=False)

In [6]:
def get_dataloader(dataset, batch_size=1, shuffle=False):
    """Wrap the 'tokens', 'bounds' and 'classes' columns of `dataset` in a DataLoader.

    Args:
        dataset: mapping-like object whose 'tokens', 'bounds' and 'classes'
            entries are tensors or nested lists with one row per example.
        batch_size: number of examples per batch.
        shuffle: reshuffle the examples every epoch. Defaults to False, which
            keeps the original fixed iteration order.

    Returns:
        A DataLoader yielding (tokens, bounds, classes) batches that already
        live on the global `device`.
    """
    # torch.as_tensor with an explicit dtype accepts lists as well as tensors of
    # any numeric dtype; the legacy torch.LongTensor / torch.FloatTensor
    # constructors raise a TypeError when handed a tensor of a different dtype.
    tokens = torch.as_tensor(dataset['tokens'], dtype=torch.long, device=device)
    bounds = torch.as_tensor(dataset['bounds'], dtype=torch.float, device=device)
    class_ids = torch.as_tensor(dataset['classes'], dtype=torch.long, device=device)

    tensor_dataset = TensorDataset(tokens, bounds, class_ids)
    return DataLoader(tensor_dataset, batch_size=batch_size, shuffle=shuffle)

In [6]:
class EncoderRNN(nn.Module):
    """Token encoder: embedding lookup, dropout, then a batch-first GRU."""

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        """
        Args:
            input_size: number of rows in the embedding table (vocabulary size).
            hidden_size: width of both the embeddings and the GRU state.
            dropout_p: dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_size = hidden_size

        # Attribute names are part of the checkpoint format (state_dict keys).
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a batch of token ids; returns the GRU's (output, hidden) pair."""
        token_vectors = self.embedding(input)
        token_vectors = self.dropout(token_vectors)
        return self.gru(token_vectors)

In [7]:
class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Attend over `keys` with `query`.

        Both inputs are 3-D (torch.bmm below requires it); the callers in this
        notebook pass a (batch, 1, hidden) query and (batch, seq, hidden) keys.

        Returns:
            (context, weights): the weighted sum of the keys and the softmax
            weights over the key positions.
        """
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)
        energy = self.Va(torch.tanh(projected_query + projected_keys))

        # Move the size-1 score axis in front of the position axis.
        scores = energy.transpose(1, 2)
        weights = F.softmax(scores, dim=-1)

        return torch.bmm(weights, keys), weights

class ClassifierDecoder(nn.Module):
    """Attention GRU decoder that emits one class distribution per label slot."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        """
        Args:
            hidden_size: width of the embeddings and the GRU state.
            output_size: number of classes (embedding rows and output logits).
            dropout_p: dropout probability applied to the input embedding.
        """
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Run MAX_LABELS decoding steps.

        Args:
            encoder_outputs: encoder states the attention looks at.
            encoder_hidden: encoder final state, used as the initial decoder state.
            target_tensor: optional gold class ids; when given, step i is
                followed by feeding target_tensor[:, i] (teacher forcing),
                otherwise the decoder feeds back its own top-1 prediction.

        Returns:
            (log-probabilities concatenated over steps, final hidden state,
            attention weights concatenated over steps).
        """
        n_examples = encoder_outputs.size(0)
        # The first input is class id 0 for every example.
        step_input = torch.zeros(n_examples, 1, dtype=torch.long, device=device)
        state = encoder_hidden
        teacher_forcing = target_tensor is not None

        step_logits = []
        step_attentions = []
        for step in range(MAX_LABELS):
            logits, state, attn_weights = self.forward_step(step_input, state, encoder_outputs)
            step_logits.append(logits)
            step_attentions.append(attn_weights)

            if teacher_forcing:
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Greedy choice, cut off from the autograd history.
                best = logits.topk(1)[1]
                step_input = best.squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)
        attentions = torch.cat(step_attentions, dim=1)

        return log_probs, state, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """One decoding step: embed the input, attend, advance the GRU, project to logits."""
        embedded = self.dropout(self.embedding(input))

        # The GRU state has the layer axis first; the attention wants batch first.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)

        gru_input = torch.cat((embedded, context), dim=2)
        gru_output, hidden = self.gru(gru_input, hidden)

        return self.out(gru_output), hidden, attn_weights

In [8]:
class RegressionDecoder(nn.Module):
    """Attention GRU decoder that emits `output_size` non-negative numbers per label slot."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        """
        Args:
            hidden_size: width of the projected input and the GRU state.
            output_size: how many values are regressed per step.
            dropout_p: dropout probability applied to the projected input.
        """
        super().__init__()
        self.output_size = output_size
        # Attribute names are part of the checkpoint format (state_dict keys).
        # The previous step's values are projected linearly (no lookup table).
        self.embedding = nn.Linear(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Run MAX_LABELS decoding steps.

        Args:
            encoder_outputs: encoder states the attention looks at.
            encoder_hidden: encoder final state, used as the initial decoder state.
            target_tensor: optional gold values; when given, step i is followed
                by feeding target_tensor[:, i] (teacher forcing), otherwise the
                decoder feeds back its own detached output.

        Returns:
            (outputs concatenated over steps, final hidden state,
            attention weights concatenated over steps).
        """
        n_examples = encoder_outputs.size(0)
        # The first input is an all-zero vector for every example.
        step_input = torch.zeros(n_examples, self.output_size, dtype=torch.float, device=device)
        state = encoder_hidden
        teacher_forcing = target_tensor is not None

        step_outputs = []
        step_attentions = []
        for step in range(MAX_LABELS):
            prediction, state, attn_weights = self.forward_step(step_input, state, encoder_outputs)
            step_outputs.append(prediction)
            step_attentions.append(attn_weights)

            if teacher_forcing:
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                step_input = prediction.detach()

        outputs = torch.cat(step_outputs, dim=1)
        attentions = torch.cat(step_attentions, dim=1)

        return outputs, state, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """One decoding step: project the input, attend, advance the GRU, apply ReLU to the output."""
        projected = self.dropout(self.embedding(input))
        if projected.dim() == 2:
            # The initial all-zero input has no step axis yet.
            projected = projected.unsqueeze(1)

        # The GRU state has the layer axis first; the attention wants batch first.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)

        gru_input = torch.cat((projected, context), dim=2)
        gru_output, hidden = self.gru(gru_input, hidden)

        return F.relu(self.out(gru_output)), hidden, attn_weights

In [10]:
def train_epoch(dataloader, encoder, decoder1, decoder2, encoder_optimizer,
          decoder1_optimizer, decoder2_optimizer, criterion1, criterion2):
    """Run one pass over `dataloader`, updating the encoder and both decoders.

    Args:
        dataloader: iterable of (input_tensor, target_bounds, target_classes) batches.
        encoder: module returning (encoder_outputs, encoder_hidden).
        decoder1: bounds decoder, trained with `criterion1` against target_bounds.
        decoder2: class decoder, trained with `criterion2` against target_classes.
        encoder_optimizer, decoder1_optimizer, decoder2_optimizer: one optimizer
            per module.
        criterion1: loss applied to the flattened bounds predictions and targets.
        criterion2: loss applied to the per-slot class scores and class ids.

    Returns:
        (mean bounds loss, mean class loss) over the batches of this epoch;
        (0.0, 0.0) when the dataloader yields no batches.
    """
    modules = (encoder, decoder1, decoder2)
    optimizers = (encoder_optimizer, decoder1_optimizer, decoder2_optimizer)

    # Make sure dropout is active even if the modules were left in eval mode.
    for module in modules:
        module.train()

    total_loss1 = 0.0
    total_loss2 = 0.0
    num_batches = 0
    for input_tensor, target_bounds, target_classes in dataloader:
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        # Both decoders are teacher-forced with their own targets.
        decoder_outputs1, _, _ = decoder1(encoder_outputs, encoder_hidden, target_bounds)
        decoder_outputs2, _, _ = decoder2(encoder_outputs, encoder_hidden, target_classes)

        # reshape (rather than view) also accepts non-contiguous tensors.
        loss1 = criterion1(
            decoder_outputs1.reshape(-1),
            target_bounds.reshape(-1)
        )
        loss2 = criterion2(
            decoder_outputs2.reshape(-1, decoder_outputs2.size(-1)),
            target_classes.reshape(-1)
        )

        # One backward pass over the summed loss accumulates the same gradients
        # as two separate passes, without keeping the graph alive via
        # retain_graph=True between them.
        (loss1 + loss2).backward()

        for optimizer in optimizers:
            optimizer.step()

        total_loss1 += loss1.item()
        total_loss2 += loss2.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing to average; avoid dividing by zero on an empty loader.
        return 0.0, 0.0
    return total_loss1 / num_batches, total_loss2 / num_batches

In [11]:
from tqdm.notebook import trange

def train(train_dataloader, encoder, decoder1, decoder2, n_epochs, learning_rate=0.001):
    """Train the encoder and both decoders for `n_epochs` epochs.

    Each module gets its own Adam optimizer with the same learning rate. The
    first decoder is trained with an L1 loss and the second with a negative
    log-likelihood loss. The progress bar description shows the two mean
    epoch losses returned by train_epoch.
    """
    encoder_optimizer, decoder1_optimizer, decoder2_optimizer = (
        optim.Adam(module.parameters(), lr=learning_rate)
        for module in (encoder, decoder1, decoder2)
    )
    bounds_criterion = nn.L1Loss()
    classes_criterion = nn.NLLLoss()

    progress = trange(1, n_epochs + 1)
    for _ in progress:
        loss1, loss2 = train_epoch(
            train_dataloader, encoder, decoder1, decoder2,
            encoder_optimizer, decoder1_optimizer, decoder2_optimizer,
            bounds_criterion, classes_criterion,
        )
        progress.set_description(f"loss1={loss1} loss2={loss2}")

In [12]:
# Mini-batch size shared by the train and test loaders.
batch_size = 24

# Wrap the preprocessed splits in DataLoaders; get_dataloader moves the
# tensors of the whole split to `device` up front.
train_loader = get_dataloader(train_processed, batch_size)
test_loader = get_dataloader(test_processed, batch_size)

In [13]:
# Width of the embeddings and GRU states in all three modules.
hidden_size = 128

# One shared encoder feeds two decoders: decoder1 regresses the two span
# bounds per label slot, decoder2 predicts the entity class per label slot.
encoder = EncoderRNN(NUM_TOKENS, hidden_size).to(device)
decoder1 = RegressionDecoder(hidden_size, 2).to(device)
decoder2 = ClassifierDecoder(hidden_size, NUM_CLASSES).to(device)

# Train all three modules jointly for 100 epochs on the train loader.
train(train_loader, encoder, decoder1, decoder2, 100)

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
import os

# torch.save does not create missing parent directories; make sure the target
# directory exists so a long training run is not lost to a failed save.
os.makedirs('dualnorm3', exist_ok=True)

# Store only the weights (state dicts) of the three trained modules.
torch.save(encoder.state_dict(), 'dualnorm3/encoder.ckpt')
torch.save(decoder1.state_dict(), 'dualnorm3/decoder1.ckpt')
torch.save(decoder2.state_dict(), 'dualnorm3/decoder2.ckpt')

In [9]:
# Rebuild the three modules with the same sizes used for training, then load
# the saved weights into them.
hidden_size = 128
encoder = EncoderRNN(NUM_TOKENS, hidden_size).to(device)
decoder1 = RegressionDecoder(hidden_size, 2).to(device)
decoder2 = ClassifierDecoder(hidden_size, NUM_CLASSES).to(device)

# map_location remaps the stored tensors onto the current device, so weights
# saved on a GPU machine can also be loaded on a CPU-only one.
encoder.load_state_dict(torch.load('dualnorm3/encoder.ckpt', map_location=device))
decoder1.load_state_dict(torch.load('dualnorm3/decoder1.ckpt', map_location=device))
decoder2.load_state_dict(torch.load('dualnorm3/decoder2.ckpt', map_location=device))

<All keys matched successfully>

In [10]:
def evaluate(encoder, decoder1, decoder2, sentence, tokenizer):
    """Predict the entities of one text.

    Args:
        encoder: trained encoder module.
        decoder1: trained bounds decoder; its outputs are fractions of the
            text length and are scaled back to character offsets here.
        decoder2: trained class decoder; class id 0 ends the prediction list.
        sentence: the raw text.
        tokenizer: callable returning a mapping with an 'input_ids' list.

    Returns:
        A list of [start, end, class_name] entries, one per predicted entity,
        in decoding order.
    """
    modules = (encoder, decoder1, decoder2)
    # Inference must run in eval mode: torch.no_grad() alone leaves dropout
    # active, which makes the predictions random. The previous modes are
    # restored afterwards so a training loop is not affected.
    previous_modes = [module.training for module in modules]
    for module in modules:
        module.eval()

    try:
        with torch.no_grad():
            # Mirror the training-time preprocessing: the networks only ever
            # saw sequences of exactly MAX_LENGTH ids, right-padded with 0.
            token_ids = list(tokenizer(sentence)['input_ids'])[:MAX_LENGTH]
            token_ids = token_ids + [0] * (MAX_LENGTH - len(token_ids))
            input_tensor = torch.LongTensor(token_ids).view(1, -1).to(device)

            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder1_outputs, _, _ = decoder1(encoder_outputs, encoder_hidden)
            decoder2_outputs, _, _ = decoder2(encoder_outputs, encoder_hidden)

            # Best class per label slot; reshape (unlike squeeze) keeps a
            # proper 1-D list even when there is a single slot.
            _, topi = decoder2_outputs.topk(1)
            class_ids = topi.reshape(-1).tolist()

            # Scale the relative bounds back to character offsets.
            scaled = (decoder1_outputs * len(sentence)).round().type(torch.int64)
            bounds = scaled.reshape(-1, scaled.size(-1)).tolist()
    finally:
        for module, was_training in zip(modules, previous_modes):
            module.train(was_training)

    strings = []
    for class_id, span in zip(class_ids, bounds):
        if class_id == 0:
            # Class 0 marks the end of the predicted label sequence.
            break
        class_name = id2class.get(class_id)
        if class_name is None:
            # An id without a name cannot be reported; skip this slot but keep
            # the remaining slots aligned with their bounds.
            continue
        strings.append([span[0], span[1], class_name])
    return strings

In [19]:
# Peek at the first raw document of the test split.
dataset['test']['text'][0]

'Владелец «Бирмингема» получил шесть лет тюрьмы\nмини|слева|«Сент-Эндрюс» — домашний стадион футбольного клуба «Бирмингем Сити»\nВ пятницу, 7 марта суд Гонконга приговорил владельца футбольного клуба «Бирмингем Сити» Карсона Ёнга (Carson Yeung, также в некоторых источниках — Карсон Юнг; Карсон Ён) к шести годам тюремного заключения за мошенничество.\n\n54-летний бизнесмен был признан виновным в отмывании 55 миллионов фунтов стерлингов через его банковские счета в период с 2001 по 2007 годы.\n\nКарсон Ёнг стал владельцем «Бирмингема» в 2009 году, приобретя его за 81,5 миллионов фунтов стерлингов.\n'

In [20]:
from tqdm.notebook import trange

# Predictions are produced for the dev split; the name `test` is kept because
# this cell has always rebound it.
test = dataset['dev']
# Redefined here so the inference cells can run without the preprocessing cell.
MAX_LENGTH = 512
MAX_LABELS = 128

# Read each column once: indexing the dataset by column name inside the loop
# would fetch the whole column again for every single example.
dev_texts = test['text']
dev_ids = test['id']

results = []
for i in trange(len(test)):
    sentence = dev_texts[i]
    example_id = dev_ids[i]  # not named `id`, which would shadow the builtin
    res = evaluate(encoder, decoder1, decoder2, sentence, tokenizer)
    results.append({'id': example_id, 'sentences': sentence, 'ners': res})

  0%|          | 0/65 [00:00<?, ?it/s]

In [16]:
import json

# Dump the predictions as JSON Lines: one JSON object per line, non-ASCII
# characters escaped (the json module's default).
with open('test.jsonl', 'w', encoding='utf-8') as f:
    for res in results:
        line = json.dumps(res)
        f.write(line + '\n')