<a href="https://colab.research.google.com/github/ArtemNechaev/stepik_nnets/blob/main/task8_translate_de2en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the repository that provides the `stepik_nnets` package imported by the next cell.
!git clone https://github.com/ArtemNechaev/stepik_nnets 

Cloning into 'stepik_nnets'...
remote: Enumerating objects: 129, done.[K
remote: Counting objects: 100% (129/129), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 129 (delta 56), reused 68 (delta 24), pack-reused 0[K
Receiving objects: 100% (129/129), 2.37 MiB | 2.76 MiB/s, done.
Resolving deltas: 100% (56/56), done.


In [45]:
import math
import pickle
import random
import time
import zipfile
from collections import OrderedDict, Counter
from typing import Iterable, List, Sequence

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtext.data.metrics import bleu_score
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import Multi30k
from torchtext.vocab import build_vocab_from_iterator, FastText, Vocab, vocab
from tqdm import tqdm
#import gensim

from stepik_nnets.sec2sec.data import Sec2SecDataset, sequential_transforms, TensorTransform, myFastText
from stepik_nnets.sec2sec.engine import train, evaluate, predict_with_model, data_to_device
from stepik_nnets.sec2sec.models.only_gru import OnlyGRU
#from stepik_nnets.sec2sec.beam_search import  beam_search_rnn


ModuleNotFoundError: No module named 'stepik_nnets.sec2sec.beam_search'

In [14]:
# Global configuration and reproducibility settings.
SEED = 0
# When True, the vocabulary and source embedding are built from FastText vectors.
pretrained_embed = False

# Seed every random number generator the notebook draws from. NumPy must be
# seeded too: the train/validation split is shuffled with np.random.shuffle,
# so without it the split changes on every run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [8]:
# Load the training pairs and split them 90% / 10% into train and validation.
df = pd.read_csv('/kaggle/input/title-generation/train.csv')

# One plain positional tuple per CSV row. Later cells read the two text fields
# as t[0] and t[1]; tuples support that directly, whereas positional integer
# indexing of the label-indexed Series produced by iterrows() is deprecated.
# A local name other than `all` is used so the builtin all() stays usable.
rows = list(df.itertuples(index=False, name=None))
TRAIN_SPLIT = int(len(rows) * 0.9)
np.random.shuffle(rows)  # in-place shuffle; uses NumPy's global RNG
train_iter = rows[:TRAIN_SPLIT]
val_iter = rows[TRAIN_SPLIT:]
del rows  # the two slices above are independent copies


In [9]:
import re


# Matches maximal runs of word characters (letters, digits and underscore).
TOKEN_RE = re.compile(r'[\w\d]+')


def tokenize_text_simple_regex(txt, min_token_size=1):
    """Split ``txt`` into lower-cased word tokens.

    Parameters:
        txt (str): text to tokenize.
        min_token_size (int): tokens shorter than this are dropped.

    Returns:
        list[str]: the tokens in their order of appearance.
    """
    kept = []
    for match in TOKEN_RE.finditer(txt.lower()):
        word = match.group()
        if len(word) < min_token_size:
            continue
        kept.append(word)
    return kept

In [10]:

# Keys that identify the two sides of each training pair. They index the
# transform dictionaries below and are passed to the dataset as `ln_pair`.
SRC_LANGUAGE = 'abs'
TGT_LANGUAGE = 'title'

ln_pair = (SRC_LANGUAGE, TGT_LANGUAGE)

# Per-side registries, filled in below: raw text -> tokens, tokens -> ids.
token_transform = {}
vocab_transform = {}


# Both sides use the same regex tokenizer. The trailing comments keep the
# spaCy-based alternative that was used before.

token_transform[TGT_LANGUAGE] = tokenize_text_simple_regex#get_tokenizer('spacy', language='en_core_web_lg')
token_transform[SRC_LANGUAGE] = tokenize_text_simple_regex#get_tokenizer('spacy', language='en_core_web_lg')


# Special symbols and the vocabulary indices they must receive.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# The list order has to match the indices above, because the symbols are
# inserted at the front of the vocabulary in this order.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']


# Lazily tokenize every training row: element 0 is treated as the source text
# and element 1 as the target text.
tokenized_train = ((token_transform[SRC_LANGUAGE](t[0]) , token_transform[TGT_LANGUAGE](t[1]))
    for t in train_iter)

# Consume the generator and regroup it into two parallel tuples of token lists.
# `train_iter_trg` is not used again in this cell.
train_iter_src, train_iter_trg = zip(*tokenized_train)


if pretrained_embed:
  # Re-number the FastText vocabulary so the four special symbols occupy
  # ids 0..3 and the FastText words follow in their original order.
  ft_embed = myFastText(language='en')
  s_dict = {s:i for i, s in enumerate( special_symbols)} 
  ft_embed.stoi = { w: i for i, (w, _) in  enumerate(list(s_dict.items()) + list(ft_embed.stoi.items()))}
  # NOTE(review): torchtext's `vocab` factory is documented as taking a
  # token -> frequency mapping; here it receives token -> index. Confirm the
  # resulting vocabulary keeps the intended order.
  vocab_transform[SRC_LANGUAGE] = vocab(ft_embed.stoi, min_freq=0)
else:
  # Build the vocabulary from the source-side training tokens only.
  vocab_transform[SRC_LANGUAGE] = build_vocab_from_iterator(train_iter_src,
                                                      min_freq=1,
                                                      specials=special_symbols,
                                                      special_first=True)

# The target side shares the very same vocabulary object as the source side.
vocab_transform[TGT_LANGUAGE] = vocab_transform[SRC_LANGUAGE]



# Set UNK_IDX as the default index. This index is returned when the token is not found.
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary.
# Both entries refer to the same object, so the second call repeats the first.

vocab_transform[SRC_LANGUAGE].set_default_index(UNK_IDX)
vocab_transform[TGT_LANGUAGE].set_default_index(UNK_IDX)


# Per-side pipelines that turn a raw string into a tensor of token indices.
text_transform = {}
for ln in ln_pair:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               TensorTransform(BOS_IDX, EOS_IDX)) # Add BOS/EOS and create tensor



In [12]:
# Number of examples per mini-batch, shared by the training and validation loaders.
BATCH_SIZE = 30

# Training data is reshuffled every epoch.
train_dataset = Sec2SecDataset(train_iter, text_transform, PAD_IDX, ln_pair=ln_pair)
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                            collate_fn=train_dataset.pad_collate_fn)

# Validation data keeps its order. Each loader collates with its own dataset's
# pad_collate_fn, so the validation loader no longer depends on train_dataset.
val_dataset = Sec2SecDataset(val_iter, text_transform, PAD_IDX, ln_pair=ln_pair)
valid_iterator = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            collate_fn=val_dataset.pad_collate_fn)

In [23]:
class Model(OnlyGRU):
    """OnlyGRU whose outputs are re-weighted towards tokens that occur in the source.

    For every batch element, the output entries of vocabulary ids that appear
    in its source sequence are kept as they are; all other entries are
    multiplied by ``OTHER_TOKEN_SCALE``.

    Parameters:
        *args, **kwargs: forwarded unchanged to ``OnlyGRU``.
        src2trg (optional, keyword-only): 1-D integer tensor or sequence that
            maps a source vocabulary id to the matching target vocabulary id.
            ``None`` (the default) means the ids are used as they are, which is
            correct when source and target share one vocabulary, as they do in
            this notebook. With separate vocabularies, pass a mapping such as
            ``[tgt_stoi.get(w, 0) for w in src_vocab.get_itos()]``.

    The mapping used to be read from a global ``src2trg`` variable whose
    definition was commented out, so ``forward`` failed with NameError on a
    fresh kernel.
    """

    # Multiplier applied to output entries of tokens absent from the source.
    OTHER_TOKEN_SCALE = 0.3

    def __init__(self, *args, src2trg=None, **kwargs):
        super().__init__(*args, **kwargs)
        if src2trg is not None:
            src2trg = torch.as_tensor(src2trg, dtype=torch.long)
        # Non-persistent buffer: it follows the module through .to(device) but
        # stays out of state_dict, so previously saved checkpoints still load.
        self.register_buffer('src2trg', src2trg, persistent=False)

    def forward(self, src, trg, teacher_forcing_ratio, *args):
        """Run ``OnlyGRU.forward`` and apply the source-token re-weighting.

        ``src`` is indexed as ``[position, batch]`` and the parent's output as
        ``[step, batch, vocabulary id]``; the result has the parent's shape.
        """
        outputs = super().forward(src, trg, teacher_forcing_ratio, *args)

        trg_ids = src if self.src2trg is None else self.src2trg.to(src.device)[src]

        # One (batch, vocabulary) weight table, broadcast over the step axis,
        # replaces a full-size mask filled in a per-example Python loop.
        scale = torch.full(outputs.shape[1:], self.OTHER_TOKEN_SCALE,
                           device=outputs.device, dtype=torch.float)
        scale.scatter_(1, trg_ids.t().long().to(outputs.device), 1.0)
        return outputs * scale
        

In [24]:
# Vocabulary sizes on the input and output side of the model.
INPUT_DIM = len(vocab_transform[SRC_LANGUAGE])
OUTPUT_DIM = len(vocab_transform[TGT_LANGUAGE])

EMB_SIZE = 100
HID_SIZE = 50

# Source embedding table; the <pad> row is excluded from gradient updates.
src_embed = nn.Embedding(INPUT_DIM, EMB_SIZE, padding_idx=PAD_IDX)
if pretrained_embed:
    # Keep the freshly initialised rows of the special symbols and append the
    # FastText vectors for all remaining tokens, then freeze the table.
    n_special = len(special_symbols)
    src_embed.load_state_dict({'weight': torch.cat([src_embed.weight[:n_special].data, ft_embed.vectors])})
    ft_embed = None  # release the FastText object
    src_embed.requires_grad_(False)

# The prepared embedding used to be thrown away because `src_embed=None` was
# always passed. It is now handed to the model when pretrained vectors are
# requested; without them the model still receives None, exactly as before.
# NOTE(review): assumes OnlyGRU accepts an nn.Embedding module for `src_embed` - confirm.
# NOTE(review): the fifth positional argument (2) is presumably the number of
# GRU layers - confirm against OnlyGRU.
model = Model(INPUT_DIM, OUTPUT_DIM, EMB_SIZE, HID_SIZE, 2, PAD_IDX, EOS_IDX, device,
              src_embed=src_embed if pretrained_embed else None,
              trg_embed=None).to(device)

In [25]:
def init_weights(m):
    """Initialise the parameters owned directly by module ``m``.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2); all
    others (biases) are set to zero.

    Intended for ``model.apply``, which already visits every sub-module. Only
    the module's own parameters are touched (``recurse=False``), so each
    parameter is initialised once instead of once per ancestor module.
    """
    for name, param in m.named_parameters(recurse=False):
        if 'weight' in name:
            nn.init.normal_(param, mean=0, std=0.01)
        else:
            nn.init.constant_(param, 0)


# With pretrained embeddings the initialisation is skipped entirely.
if not pretrained_embed:
    model.apply(init_weights)

In [26]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the number of trainable parameters with thousands separators.
# The message is Russian for "The model contains N parameters".
print(f'Модель содержит {count_parameters(model):,} параметров')

Модель содержит 37,404,584 параметров


In [27]:
# Loss: cross-entropy that skips positions whose target is the padding index.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=2e-3)
# Scheduler with default settings; the training loops step it with the validation loss.
sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer)


In [41]:
# Training phase 1: the model runs in its 'next_word' forward mode.
N_EPOCHS = 12
CLIP = float('inf')  # passed to train() as the clipping value
best_valid_loss = float('inf')
model.forward_mode = 'next_word'

for epoch in tqdm(range(N_EPOCHS)):
    # One pass over the training data, then one over the validation data.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP, device)
    valid_loss = evaluate(model, valid_iterator, criterion, device)

    # The scheduler watches the validation loss.
    sched.step(valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'translate_model.pt')

    # Perplexity = exp(loss); the labels are Russian for "training" / "validation".
    print(f'Перплексия (обучение): {math.exp(train_loss):7.3f}')
    print(f'Перплексия (валидация): {math.exp(valid_loss):7.3f}')

  2%|▏         | 1/50 [10:39<8:42:16, 639.52s/it]

Перплексия (обучение):  84.256
Перплексия (валидация): 691.736


  4%|▍         | 2/50 [21:17<8:30:41, 638.37s/it]

Перплексия (обучение):  61.274
Перплексия (валидация): 618.827


  4%|▍         | 2/50 [25:54<10:21:57, 777.44s/it]


KeyboardInterrupt: 

In [None]:
# Restore the checkpoint written by the training loop (the weights with the
# lowest validation loss seen so far), mapping tensors onto the current device.
model.load_state_dict(torch.load('translate_model.pt', map_location=device))

In [None]:
# Training phase 2: continue from the current weights with the model in its
# 'greedy' forward mode. The best-loss tracker starts over for this phase.
N_EPOCHS = 5
CLIP = float('inf')  # passed to train() as the clipping value
best_valid_loss = float('inf')
model.forward_mode = 'greedy'

for epoch in tqdm(range(N_EPOCHS)):
    # One pass over the training data, then one over the validation data.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP, device)
    valid_loss = evaluate(model, valid_iterator, criterion, device)

    # The scheduler watches the validation loss.
    sched.step(valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'translate_model.pt')

    # Perplexity = exp(loss); the labels are Russian for "training" / "validation".
    print(f'Перплексия (обучение): {math.exp(train_loss):7.3f}')
    print(f'Перплексия (валидация): {math.exp(valid_loss):7.3f}')

In [None]:
# Reload the best checkpoint saved during the second ('greedy') training phase
# before generating titles.
model.load_state_dict(torch.load('translate_model.pt', map_location=device))

## Title generation

In [42]:

    
def data_to_device(data, device):
    """Move a tensor, or every tensor of a sequence, to ``device``.

    Parameters:
        data: a single object with a ``.to(device)`` method, or a sequence
            (e.g. the ``(src, trg)`` batch produced by a DataLoader) of them.
        device: target device, in any form accepted by ``.to``.

    Returns:
        A tuple with the moved items when ``data`` is a sequence, otherwise
        the moved object itself. A real tuple is returned rather than a lazy
        generator, so the result can be indexed, measured and reused.

    Note: this definition shadows the helper of the same name imported from
    ``stepik_nnets.sec2sec.engine``.
    """
    if isinstance(data, Sequence):
        return tuple(item.to(device) for item in data)
    return data.to(device)

def predict_with_model(model, iterator, vocab: 'Vocab', device=None, bos_idx=2):
    """Decode every batch of ``iterator`` with ``model`` and collect token lists.

    The model is called as ``model(src, trg, 0)``, i.e. with a teacher-forcing
    ratio of zero, and the highest-scoring id at every step is taken as the
    prediction.

    Parameters:
        model: the network; must expose ``eos_idx``, ``pad_idx`` and, when
            ``device`` is not given, ``device``.
        iterator: iterable of ``(src, trg)`` batches indexed ``[position, batch]``.
        vocab: object whose ``lookup_tokens(ids)`` maps ids to token strings.
        device: device the batches are moved to. Defaults to ``model.device``.
            An explicitly passed device is now honoured; it used to be
            overwritten unconditionally.
        bos_idx (int): id of the begin-of-sequence token stripped from the
            references. The default keeps the previously hard-coded value.

    Returns:
        (candidates, references): ``candidates[k]`` is the list of predicted
        tokens for example ``k`` (EOS and padding removed); ``references[k]``
        is a one-element list holding its reference tokens (EOS, padding and
        BOS removed), which is the layout ``bleu_score`` expects.

    Note: this definition shadows the helper of the same name imported from
    ``stepik_nnets.sec2sec.engine``.
    """
    model.eval()
    if device is None:
        device = model.device

    def ids_to_tokens(ids, dropped):
        # Keep only ids that are not listed in `dropped`, then map them to strings.
        keep = torch.ones_like(ids, dtype=torch.bool)
        for special in dropped:
            keep &= ids != special
        # .tolist() yields plain Python ints for the vocabulary lookup.
        return vocab.lookup_tokens(ids[keep].tolist())

    candidate_specials = (model.eos_idx, model.pad_idx)
    reference_specials = (model.eos_idx, model.pad_idx, bos_idx)

    candidate_corpus = []
    ref_corpus = []
    with torch.no_grad():
        for batch in iterator:
            src, trg = data_to_device(batch, device)
            pred_trg = model(src, trg, 0).argmax(-1)

            for i in range(src.shape[1]):
                candidate_corpus.append(ids_to_tokens(pred_trg[:, i], candidate_specials))
                ref_corpus.append([ids_to_tokens(trg[:, i], reference_specials)])
    return candidate_corpus, ref_corpus


In [43]:
# Decode the validation set with the model in its 'greedy' forward mode.
model.forward_mode = 'greedy'
condidat_corpus, ref_corpus = predict_with_model(
    model=model,
    iterator=valid_iterator,
    vocab=vocab_transform['title'],
)

In [44]:
# Corpus-level BLEU over 1- to 3-grams; one weight per n-gram order, summing to 1.
# NOTE(review): the IndexError recorded below this cell is not explained by the
# visible code. It would fit an n-gram longer than max_n, e.g. a token that
# contains a space - verify the tokens in both corpora.
bleu_score(condidat_corpus, ref_corpus, max_n=3, weights=[0.34, 0.33, 0.33])

IndexError: index 3 is out of bounds for dimension 0 with size 3

In [None]:
# Print up to ten reference / prediction pairs for a quick qualitative check.
# Slicing instead of range(10) keeps this working when fewer than ten
# examples are available.
for refs, candidate in zip(ref_corpus[:10], condidat_corpus[:10]):
    print(' '.join(refs[0]))      # reference title
    print(' '.join(candidate))    # predicted title
    print('')
    

In [35]:
# Load the abstracts that need a predicted title.
submission_data = pd.read_csv('/kaggle/input/title-generation/test.csv')
abstracts = submission_data['abstract'].values

# The dataset and the model both expect a target for every source, so each
# abstract is paired with the same dummy 20-token title.
# NOTE(review): the dummy length presumably bounds how many tokens are
# generated per title - confirm against the model.
dummy_title = 'a ' * 20
test_iter = list(zip(abstracts, [dummy_title] * len(abstracts)))
test_dataset = Sec2SecDataset(test_iter, text_transform, PAD_IDX, ln_pair=ln_pair)
test_iterator = DataLoader(dataset=test_dataset, batch_size=30,
                           collate_fn=test_dataset.pad_collate_fn)

# Only the predictions are needed; the references are just the dummy titles.
test_preds, _ = predict_with_model(model, test_iterator, vocab_transform['title'])

In [36]:

# Turn every predicted token list into a plain-text title. Unknown-word
# tokens are dropped before joining, so no double, leading or trailing
# spaces are left behind where an '<unk>' used to be.
titles = [' '.join(token for token in t_pred if token != '<unk>')
          for t_pred in test_preds]

# Intermediate file with one (abstract, title) row per test example.
submission_df = pd.DataFrame({'abstract': abstracts, 'title': titles})
submission_df.to_csv('predicted_titles.csv', index=False)

In [39]:
import string
from nltk.util import ngrams
import numpy as np
import pandas as pd
import pickle


def _title_to_ngram_tokens(title):
    """Return the words of ``title`` plus its '_'-joined bigrams and trigrams.

    The title is stripped of ASCII punctuation, lower-cased and split on
    whitespace. A missing title (pandas reads an empty CSV cell back as NaN)
    is treated as an empty string.
    """
    if not isinstance(title, str):
        title = '' if pd.isna(title) else str(title)
    words = title.translate(str.maketrans('', '', string.punctuation)).lower().split()
    bigrams = ['_'.join(gram) for gram in zip(words, words[1:])]
    trigrams = ['_'.join(gram) for gram in zip(words, words[1:], words[2:])]
    return words + bigrams + trigrams


def generate_csv(input_file='predicted_titles.csv',
                 output_file='submission.csv',
                 voc_file='/kaggle/input/title-generation/vocs.pkl'):
    '''
    Generates file in format required for submitting result to Kaggle

    For every row of the input file, the row's vocabulary is looked up by row
    index and one "Id,Predict" line is written per vocabulary entry: 1 when
    the entry (a word, bigram or trigram) occurs in the predicted title,
    0 otherwise. Ids are numbered consecutively across all rows.

    Parameters:
        input_file (str) : path to csv file with your predicted titles.
                           Should have two fields: abstract and title
        output_file (str) : path to output submission file
        voc_file (str) : path to voc.pkl file

    A row with an empty title produces an all-zero block instead of being
    skipped, so the Ids of later rows are not shifted. A row that still
    cannot be processed is skipped with a warning rather than silently.
    '''
    import warnings

    data = pd.read_csv(input_file)
    # SECURITY: pickle.load can execute arbitrary code contained in the file.
    # Only use a vocs.pkl that comes from a trusted source.
    with open(voc_file, 'rb') as voc_handle:
        vocs = pickle.load(voc_handle)

    output_idx = 0
    # The output file is opened once and written sequentially.
    with open(output_file, 'w') as res_file:
        res_file.write('Id,Predict\n')

        for row_idx, row in data.iterrows():
            try:
                VOCAB_stoi = vocs[row_idx]
                flags = [0] * len(VOCAB_stoi)
                tokens = set(_title_to_ngram_tokens(row['title']))
                for word in tokens.intersection(VOCAB_stoi):
                    flags[VOCAB_stoi[word]] = 1
            except Exception as err:
                warnings.warn('generate_csv: skipping row {0}: {1!r}'.format(row_idx, err))
                continue

            for flag in flags:
                res_file.write('{0},{1}\n'.format(output_idx, flag))
                output_idx += 1


# Build submission.csv from predicted_titles.csv using the default paths.
generate_csv()