In [1]:
!git clone https://github.com/ArtemNechaev/stepik_nnets 
!pip install youtokentome

Cloning into 'stepik_nnets'...
remote: Enumerating objects: 159, done.[K
remote: Counting objects: 100% (159/159), done.[K
remote: Compressing objects: 100% (128/128), done.[K
remote: Total 159 (delta 76), reused 64 (delta 24), pack-reused 0[K
Receiving objects: 100% (159/159), 2.38 MiB | 3.38 MiB/s, done.
Resolving deltas: 100% (76/76), done.
Collecting youtokentome
  Downloading youtokentome-1.0.6-cp37-cp37m-manylinux2010_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
Installing collected packages: youtokentome
Successfully installed youtokentome-1.0.6
[0m

In [4]:
import pandas as pd
import numpy as np
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

from collections import OrderedDict, Counter

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, FastText, vocab
from torchtext.data.metrics import bleu_score
from torchtext.datasets import Multi30k
from typing import Iterable, List

from stepik_nnets.sec2sec.data import Sec2SecDataset, sequential_transforms, TensorTransform, myFastText
from stepik_nnets.sec2sec.models import OnlyGRU, Seq2SeqTransformer

from stepik_nnets.sec2sec.engine import train, evaluate, predict_with_model, data_to_device
from stepik_nnets.sec2sec.beam_search import  beam_search_transformer

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import youtokentome as yttm

import random
import math
import time
from tqdm import tqdm
import pickle

import zipfile


In [25]:
from typing import Sequence
import torch

def beam_search(dec_out, indeces, scores, beam_size, pad_idx, eos_idx):
    batch_size = int(dec_out.shape[1]/beam_size)
    
    # 1 x batch_size*beam_size x vocab_size
    new_score = dec_out.log_softmax(-1) 

    # 1 x batch_size*beam_size x beam_size
    new_score, new_idx = new_score.topk(beam_size)

    new_score[:, torch.where(indeces[-1].flatten() == eos_idx)[0]] = 0
    scores = scores.reshape(1, batch_size*beam_size, 1) + new_score

    lens_hyp = ((indeces != eos_idx) & (indeces != pad_idx)).sum(0, keepdim=True).reshape(1, batch_size*beam_size, 1)
    scores = scores/(lens_hyp+1)**0.5

    ######scores, new_idx = scores.topk(beam_size) #1 x  batch_size*beam_size x beam_size
    flatten_scores = scores.reshape(1,batch_size, beam_size ** 2)

    # 1 x batch_size x beam_size. order_scores [0 .. beam_size**2 - 1]
    scores, order_scores = flatten_scores.topk(beam_size) 

    
    # calc beam indexes that got top beam_size scores. beam_ids [0 .. beam_size - 1]
    beam_ids = torch.div(order_scores, beam_size, rounding_mode='floor') 

    #select beams with top scores from indeces and concat new idx.
    #start of sentences could repeat.

    indeces = torch.cat([
                         torch.gather(indeces, -1, beam_ids.repeat(indeces.shape[0], 1, 1)),
                         torch.gather(new_idx.view(1, batch_size, -1),-1,order_scores)
    ])
    
    return indeces, scores, beam_ids

def beam_search_rnn( model, src, beam_size=5, max_len=20):
  """
  src - src_len x batch_size
  return tensor max_len x batch_size x beam_size
  """
  model.eval()
  device = model.device

  src = src.to(model.device)
  batch_size = src.shape[1]

  # src_len x batch_size x hidden_size
  encoder_outputs, h_0 = model.encode(src) 

  # 1 x batch_size
  input_trg = src[0].unsqueeze(0) 
  mask = model.create_pad_mask(src, input_trg)

  # 1 x batch_size x vocab_size
  first_decode, att, h_0 = model.decode(encoder_outputs, input_trg, mask, h_0=h_0 ) 
  first_decode = first_decode.log_softmax(-1)

  # 1 x batch_size x beam_size
  scores, indeces = first_decode.topk(beam_size) 

  # decoder hidden_state. num_layers * D, batch_size*beam_size, hidden
  h_0 = h_0.unsqueeze(2).repeat(1,1,beam_size,1).reshape(h_0.shape[0], batch_size*beam_size, -1) 

  # src_len x batch_size*beam_size x hidden_size
  encoder_outputs = encoder_outputs.unsqueeze(2).repeat(1,1,beam_size,1).reshape(src.shape[0], batch_size*beam_size, -1)

  # src_len x batch_size*beam_size
  src = src.unsqueeze(2).repeat(1,1,beam_size).reshape(src.shape[0], batch_size*beam_size)

  for i in range(1, max_len):
    input_trg = indeces[-1].view(1, batch_size * beam_size )
    mask = model.create_pad_mask(src, input_trg)
    dec_out , att, h_0 = model.decode(encoder_outputs, input_trg, mask, h_0=h_0)
    
    indeces, scores, beam_ids = beam_search(dec_out, indeces, scores, beam_size, model.pad_idx, model.eos_idx)

    #select beams with top scores from decoder hidden_state
    h_0 = h_0.reshape(h_0.shape[0], batch_size, beam_size, -1)
    h_0 = torch.gather(h_0, 2, beam_ids.unsqueeze(3).repeat(h_0.shape[0], 1, 1, h_0.shape[3]))
    h_0 = h_0.reshape(h_0.shape[0], batch_size*beam_size, -1)


  return indeces, scores


def beam_search_transformer( model, src, beam_size=5, max_len=20):
  """
  src - src_len x batch_size
  return tensor max_len x batch_size x beam_size
  """
  model.eval()
  device = model.device

  src = src.to(model.device)
  input_trg = src[0].unsqueeze(0) 
  batch_size = src.shape[1]
  
  src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = model.create_mask(src, input_trg)
    
  # src_len x batch_size x hidden_size
  encoder_outputs = model.encode(src, src_mask, src_padding_mask) 

  # 1 x batch_size x vocab_size
  first_decode = model.decode(input_trg, encoder_outputs,  tgt_mask) 
  first_decode = first_decode.log_softmax(-1)

  # 1 x batch_size x beam_size
  scores, indeces = first_decode.topk(beam_size) 

  # src_len x batch_size*beam_size x hidden_size
  encoder_outputs = encoder_outputs.unsqueeze(2).repeat(1,1,beam_size,1).reshape(src.shape[0], batch_size*beam_size, -1)

  # src_len x batch_size*beam_size
  src = src.unsqueeze(2).repeat(1,1,beam_size).reshape(src.shape[0], batch_size*beam_size)

  for i in range(1, max_len):
    input_trg = indeces.view(indeces.shape[0], batch_size * beam_size )
    
    tgt_mask = (model.generate_square_subsequent_mask(indeces.shape[0])
                    .type(torch.bool)).to(device)
    dec_out = model.decode( input_trg, encoder_outputs, tgt_mask)

    indeces, scores, beam_ids = beam_search(dec_out[-1].unsqueeze(0), indeces, scores, beam_size, model.pad_idx, model.eos_idx)

  return indeces, scores

In [22]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math


# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 pad_idx: int,
                 eos_idx: int,
                 device,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.device = device
        self.pad_idx = pad_idx
        self.eos_idx = eos_idx
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,

                *args):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = self.create_mask(src, trg)
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, src_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor, src_padding_mask: Tensor = None):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask, src_padding_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor, tgt_padding_mask: Tensor = None):
        outs =  self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask, tgt_padding_mask)
        return self.generator(outs)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones((sz, sz), device=self.device)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask


    def create_mask(self, src, tgt):
        src_seq_len = src.shape[0]
        tgt_seq_len = tgt.shape[0]

        tgt_mask = self. generate_square_subsequent_mask(tgt_seq_len)
        src_mask = torch.zeros((src_seq_len, src_seq_len),device=self.device).type(torch.bool)

        src_padding_mask = (src == self.pad_idx).transpose(0, 1)
        tgt_padding_mask = (tgt == self.pad_idx).transpose(0, 1)
        return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask


In [7]:
SEED = 0


random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

input = '/kaggle/input/title-generation'
output ='/kaggle/working'

train_data_path = "/train_data.txt"
model_path = "/yttm.model"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [8]:
df = pd.read_csv(f'{input}/train.csv')
df.to_csv(f'{output}{train_data_path}', header=None, index=None, sep=' ', mode='a')
_, all = zip(*df.iterrows())
all = list(all)
TRAIN_SPLIT = int(len(all) * 0.9)
np.random.shuffle(all)
train_iter = all[:TRAIN_SPLIT]
val_iter = all[TRAIN_SPLIT:]
all = None


In [9]:
# Training model
yttm.BPE.train(data=f'{output}{train_data_path}', vocab_size=20000, model=f'{output}{model_path}')

# Loading model
bpe = yttm.BPE(model=f'{output}{model_path}')

Training parameters
  input: /kaggle/working/train_data.txt
  model: /kaggle/working/yttm.model
  vocab_size: 20000
  n_threads: 2
  character_coverage: 1
  pad: 0
  unk: 1
  bos: 2
  eos: 3

reading file...
learning bpe...
number of unique characters in the training data: 69
number of deleted characters: 0
number of unique characters left: 69
id: 1000=5+182                freq: 12976       subword: eff=e+ff
id: 2000=1101+94              freq: 5056        subword: ▁basis=▁bas+is
id: 3000=8+20                 freq: 2749        subword: af=a+f
id: 4000=10+6                 freq: 1743        subword: nt=n+t
id: 5000=85+455               freq: 1234        subword: enced=en+ced
id: 6000=219+3792             freq: 925         subword: ▁seems=▁se+ems
id: 7000=681+110              freq: 705         subword: ▁valuation=▁valu+ation
id: 8000=1293+27              freq: 562         subword: ▁task,=▁task+,
id: 9000=88+22                freq: 459         subword: ▁pb=▁p+b
id: 10000=329+9008          

In [10]:
from typing import List

SRC_LANGUAGE = 'abs'
TGT_LANGUAGE = 'title'

ln_pair = (SRC_LANGUAGE, TGT_LANGUAGE)

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

transform = sequential_transforms(
    bpe.encode,
    TensorTransform(BOS_IDX, EOS_IDX)
)

text_transform = {}
for ln in ln_pair:
    text_transform[ln] = transform



In [11]:
train_dataset = Sec2SecDataset( train_iter, text_transform, PAD_IDX, ln_pair = ln_pair)
train_iterator = DataLoader(train_dataset, batch_size=64, shuffle = True, collate_fn = train_dataset.pad_collate_fn)

val_dataset = Sec2SecDataset( val_iter, text_transform, PAD_IDX, ln_pair = ln_pair)
valid_iterator = DataLoader(val_dataset, batch_size=64, collate_fn = train_dataset.pad_collate_fn)

In [12]:
#Процент встречаемости слов из заголовка в исходном тексте
res = [sum(1 for t in title if t in a) /len(title) for a, title in  train_dataset]
sum(res)/len(res)

0.7902344797268247

In [34]:
INPUT_DIM = bpe.vocab_size()
OUTPUT_DIM = INPUT_DIM

EMB_SIZE =300
N_HEAD = 2

model = Seq2SeqTransformer(
                 2,
                 2,
                 EMB_SIZE,
                 N_HEAD,
                 INPUT_DIM,
                 OUTPUT_DIM,
                 PAD_IDX,
                 EOS_IDX,
                 device,
                 dim_feedforward = 512,
                 dropout= 0.1).to(device)

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [35]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Модель содержит {count_parameters(model):,} параметров')

Модель содержит 21,426,448 параметров


In [36]:
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer)


In [37]:
N_EPOCHS = 12
CLIP = float('inf')
best_valid_loss = float('inf')
model.forward_mode = 'next_word'

for epoch in tqdm(range(N_EPOCHS)):


    train_loss = train(model, train_iterator, optimizer, criterion, CLIP, device)
    valid_loss = evaluate(model, valid_iterator, criterion, device)
    
    
    sched.step(valid_loss)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'translate_model.pt')
    
    print(f'Перплексия (обучение): {math.exp(train_loss):7.3f}')
    print(f'Перплексия (валидация): {math.exp(valid_loss):7.3f}')

  8%|▊         | 1/12 [04:26<48:47, 266.09s/it]

Перплексия (обучение): 778.618
Перплексия (валидация): 395.535


 17%|█▋        | 2/12 [08:52<44:19, 266.00s/it]

Перплексия (обучение): 319.875
Перплексия (валидация): 236.348


 25%|██▌       | 3/12 [13:18<39:57, 266.36s/it]

Перплексия (обучение): 201.577
Перплексия (валидация): 162.606


 33%|███▎      | 4/12 [17:45<35:31, 266.45s/it]

Перплексия (обучение): 137.516
Перплексия (валидация): 119.283


 42%|████▏     | 5/12 [22:11<31:04, 266.38s/it]

Перплексия (обучение):  98.658
Перплексия (валидация):  94.817


 50%|█████     | 6/12 [26:38<26:38, 266.47s/it]

Перплексия (обучение):  74.139
Перплексия (валидация):  78.969


 58%|█████▊    | 7/12 [31:04<22:12, 266.53s/it]

Перплексия (обучение):  57.869
Перплексия (валидация):  68.135


 67%|██████▋   | 8/12 [35:31<17:46, 266.57s/it]

Перплексия (обучение):  46.413
Перплексия (валидация):  60.671


 75%|███████▌  | 9/12 [39:58<13:20, 266.74s/it]

Перплексия (обучение):  38.056
Перплексия (валидация):  54.624


 83%|████████▎ | 10/12 [44:25<08:53, 266.82s/it]

Перплексия (обучение):  31.793
Перплексия (валидация):  50.719


 92%|█████████▏| 11/12 [48:52<04:26, 266.75s/it]

Перплексия (обучение):  26.976
Перплексия (валидация):  47.028


100%|██████████| 12/12 [53:18<00:00, 266.54s/it]

Перплексия (обучение):  23.210
Перплексия (валидация):  44.371





In [38]:
model.load_state_dict(torch.load('translate_model.pt', map_location=device))

<All keys matched successfully>

## Titiling

In [41]:

def predict_with_model(model, iterator, vocab,  device = None):
    model.eval()
    device = model.device
    condidate_corpus = []
    ref_corpus = []

    for batch in iterator:
      with torch.no_grad():
          src, trg = data_to_device(batch, device)
          pred_trg, scores = beam_search_transformer(model,src, beam_size=5, max_len=20)
          pred_trg = pred_trg[:,:,0]

          #pred_trg = model(src, trg, 0)
          #pred_trg = pred_trg.argmax(-1)
          
          for i in range(src.shape[1]):
              candidat = pred_trg[:,i][(pred_trg[:,i] != model.eos_idx) & (pred_trg[:,i] != model.pad_idx) ]
              candidat = vocab.decode([list(candidat.cpu().numpy())])[0].split()

              ref = trg[1:,i][(trg[1:,i] != model.eos_idx) & (trg[1:,i] != model.pad_idx) ]
              ref = vocab.decode([list(ref.cpu().numpy())])[0].split()

              condidate_corpus.append(candidat)
              ref_corpus.append([ref])
    return condidate_corpus, ref_corpus


In [42]:
model.forward_mode = 'greedy'
condidat_corpus, ref_corpus = predict_with_model(model, valid_iterator, bpe)

In [43]:
bleu_score(condidat_corpus, ref_corpus, max_n=3, weights=[0.34, 0.33, 0.33])

0.10517405718564987

In [44]:
for i in range(10):
    print(' '. join(ref_corpus[i][0]))
    print(' '.join(condidat_corpus[i]))
    print('')
    

nonconvex proximal splitting: batch and incremental algorithms
global optimization of nonconvex composite optimization and

a time-varying shared frailty model with application to infectious diseases
a parametric model for network analysis with applications in network epidemiology

empirical quantile clts for time dependent data
empirical quantile processes for random measures of a stochastic process for

asymptotic normality of the quasi maximum likelihood estimator for multidimensional causal processes
quasi maximum likelihood estimation for multidimensional causal processes processes

multi-reference video coding using stillness detection
group coding for classification of video and

microbial mutualism at a distance: the role of geometry in diffusive exchanges
the role of competition in a model of microbial communities during the development of a species

visualization of gene expression information within the context of the mouse anatomy
visualizations of the atlas and sunspot are

In [None]:
submission_data = pd.read_csv('/kaggle/input/title-generation/test.csv')
abstracts = submission_data['abstract'].values
test_iter = [(a, 'a') for a in abstracts]
test_dataset = Sec2SecDataset( test_iter, text_transform, PAD_IDX, ln_pair = ln_pair)
test_iterator = DataLoader(test_dataset, batch_size=30, collate_fn = test_dataset.pad_collate_fn)

test_preds, _ = predict_with_model(model, test_iterator, bpe)

In [None]:

titles = []
for t_pred in test_preds:

    #title, _ = translate_sentence(model, abstract.split())
    titles.append(' '.join(t_pred).replace('<unk>', ''))
    
submission_df = pd.DataFrame({'abstract': abstracts, 'title': titles})
submission_df.to_csv('predicted_titles.csv', index=False)

In [None]:
import string
from nltk.util import ngrams
import numpy as np
import pandas as pd
import pickle


def generate_csv(input_file='predicted_titles.csv',
                 output_file='submission.csv',
                 voc_file='/kaggle/input/title-generation/vocs.pkl'):
    '''
    Generates file in format required for submitting result to Kaggle
    
    Parameters:
        input_file (str) : path to csv file with your predicted titles.
                           Should have two fields: abstract and title
        output_file (str) : path to output submission file
        voc_file (str) : path to voc.pkl file
    '''
    data = pd.read_csv(input_file)
    with open(voc_file, 'rb') as voc_file:
        vocs = pickle.load(voc_file)

    with open(output_file, 'w') as res_file:
        res_file.write('Id,Predict\n')
        
    output_idx = 0
    for row_idx, row in data.iterrows():
        try:
            trg = row['title']
            trg = trg.translate(str.maketrans('', '', string.punctuation)).lower().split()
            trg.extend(['_'.join(ngram) for ngram in list(ngrams(trg, 2)) + list(ngrams(trg, 3))])

            VOCAB_stoi = vocs[row_idx]
            trg_intersection = set(VOCAB_stoi.keys()).intersection(set(trg))
            trg_vec = np.zeros(len(VOCAB_stoi))    

            for word in trg_intersection:
                trg_vec[VOCAB_stoi[word]] = 1

            with open(output_file, 'a') as res_file:
                for is_word in trg_vec:
                    res_file.write('{0},{1}\n'.format(output_idx, int(is_word)))
                    output_idx += 1
        except:
            continue


generate_csv()