Задача 1. Распознавания именованных сущностей на основе fasttext
----------------------------------------------------------------

### Prerequisites
[Crawl vectors](https://fasttext.cc/docs/en/crawl-vectors.html)
\
[How to load model](https://fasttext.cc/docs/en/crawl-vectors.html)

### Imports

In [None]:
! pip install lime nerus fasttext # necessary libs in case not installed

In [2]:
# fasttext (!)
import fasttext
import fasttext.util

# necessary utils 
from nerus import load_nerus
from pathlib import Path
from itertools import zip_longest
import numpy as np

# for progress bar
from tqdm.notebook import tqdm

# ml
from sklearn.metrics import classification_report
import torch
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

2024-05-10 18:54:49.046552: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-05-10 18:54:49.046592: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


### Loading NERUS dataset

In [4]:
nerus_db_location = Path('./').resolve() / 'nerus_lenta.conllu.gz' # I assume it's in the root of working directory
assert nerus_db_location.exists() # check if file exists
nerus_db = load_nerus(nerus_db_location) # loading db as in example

### Token structure

In [5]:
next_text = next(nerus_db)

token = next_text.sents[int(np.random.randint(6))].tokens[int(np.random.randint(6))]
token

NerusToken(
    id='3',
    text='социальным',
    pos='ADJ',
    feats={'Case': 'Dat',
     'Degree': 'Pos',
     'Number': 'Plur'},
    head_id='4',
    rel='amod',
    tag='O'
)

### Getting 5000 tokens

In [6]:
TOKEN_LIST_SIZE = 5000

sents = []
tags = []
for _ in tqdm(range(5000)):
    next_test = next(nerus_db)
    
    for sent in next_test.sents:
        sent_toks,sent_tags = zip(*[(tok.text, tok.pos) for tok in sent.tokens])
        sents.append(sent_toks)
        tags.append(sent_tags)
        
print(*sents[:8], sep='\n')

  0%|          | 0/5000 [00:00<?, ?it/s]

('Австрийские', 'правоохранительные', 'органы', 'не', 'представили', 'доказательств', 'нарушения', 'российскими', 'биатлонистами', 'антидопинговых', 'правил', '.')
('Об', 'этом', 'сообщил', 'посол', 'России', 'в', 'Вене', 'Дмитрий', 'Любинский', 'по', 'итогам', 'встречи', 'уполномоченного', 'адвоката', 'дипмиссии', 'с', 'представителями', 'прокуратуры', 'страны', ',', 'передает', 'ТАСС', '.')
('«', 'Действует', 'презумпция', 'невиновности', '.')
('Каких-либо', 'ограничений', 'свободы', 'передвижения', 'для', 'команды', 'нет', '»', ',', '—', 'добавили', 'в', 'посольстве', '.')
('Международный', 'союз', 'биатлонистов', '(', 'IBU', ')', 'также', 'не', 'будет', 'применять', 'санкции', 'к', 'российским', 'биатлонистам', '.')
('Все', 'они', 'продолжат', 'выступление', 'на', 'Кубке', 'мира', '.')
('Полиция', 'нагрянула', 'в', 'отель', 'сборной', 'России', 'в', 'Хохфильцене', 'вечером', '12', 'декабря', '.')
('Как', 'написал', 'биатлонист', 'Александр', 'Логинов', ',', 'их', 'считают', 'виновн

### Utility function

In [7]:
import math 

def list_columns(obj, cols=4, columnwise=True, gap=4):
    sobj = [str(item) for item in obj]
    if cols > len(sobj): cols = len(sobj)
    max_len = max([len(item) for item in sobj])
    if columnwise: cols = int(math.ceil(float(len(sobj)) / float(cols)))
    plist = [sobj[i: i+cols] for i in range(0, len(sobj), cols)]
    if columnwise:
        if not len(plist[-1]) == cols:
            plist[-1].extend(['']*(len(sobj) - len(plist[-1])))
        plist = zip(*plist)
    printer = '\n'.join([
        ''.join([c.ljust(max_len + gap) for c in p])
        for p in plist])
    print(printer)
    
def list_dict(freqs):	
	for k, v in freqs.items():
		print(f'{k:<4} {v}')

### Splitting to train / test and counting unique tags

In [8]:
numpyarr_sents = np.array(list(zip_longest(*sents, fillvalue=""))).T
numpyarr_tags  = np.array(list(zip_longest(*tags, fillvalue="[PAD]"))).T

mask = np.random.rand(len(sents)) < 0.7

# as in examples
X_train = numpyarr_sents[mask]
y_train = numpyarr_tags[mask]
X_test = numpyarr_sents[~mask]
y_test = numpyarr_tags[~mask]

# making train and test
d_train = [X_train, y_train]
d_test  = [X_test, y_test]

# make unique tags
unique_tags = list(sorted(set([''] + [target_tag for setn in y_train for target_tag in setn])))
tags_in_y_train = [target_tag for y in y_train for target_tag in y]

# count all
tags_cnt = dict()
for target_tag in set(tags_in_y_train):
    tags_cnt[target_tag] = tags_in_y_train.count(target_tag)

# shortened arrays
print(unique_tags)
print("\n\n")
list_columns(tags_cnt, gap=6)

['', 'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', '[PAD]']



ADP        PRON       ADJ        DET        
NOUN       NUM        PUNCT      PART       
SYM        AUX        INTJ       X          
[PAD]      VERB       ADV                   
PROPN      SCONJ      CCONJ                 


### Loading fasttext model

In [9]:
# fasttext.util.download_model('ru', if_exists='ignore')
original_model = fasttext.load_model('cc.ru.300.bin')
# saving space
reduced_model = fasttext.util.reduce_model(original_model, 100)



In [10]:
word_to_index, id_to_word = dict(), dict()

fst_arr = list()
for idx, word in enumerate(tqdm(reduced_model.get_words(on_unicode_error='replace'))):
    word_vector = reduced_model.get_word_vector(word)
    # check  if we didnt add word yet
    if word not in word_to_index:
        fst_arr.append(word_vector)
        word_to_index[word], id_to_word[idx] = idx, word

# add [PAD] fields
word_to_index['[PAD]']   = len(word_to_index)
word_to_index['[UNK]'] = len(word_to_index)

for _ in range(2):
	fst_arr.append(np.zeros_like(fst_arr[-1]))

# add ids
id_to_word[len(id_to_word)] = '[UNK]'
id_to_word[len(id_to_word)] = '[PAD]'

# making tags/id and id/tags dicts
tag_to_id = {tag : i for (i, tag) in enumerate(unique_tags)}
id_to_tag = {i : tag for (i, tag) in enumerate(unique_tags)}

#print to check
list_dict(tag_to_id)
print("\n\n")
list_dict(id_to_tag)

  0%|          | 0/2000000 [00:00<?, ?it/s]

     0
ADJ  1
ADP  2
ADV  3
AUX  4
CCONJ 5
DET  6
INTJ 7
NOUN 8
NUM  9
PART 10
PRON 11
PROPN 12
PUNCT 13
SCONJ 14
SYM  15
VERB 16
X    17
[PAD] 18



0    
1    ADJ
2    ADP
3    ADV
4    AUX
5    CCONJ
6    DET
7    INTJ
8    NOUN
9    NUM
10   PART
11   PRON
12   PROPN
13   PUNCT
14   SCONJ
15   SYM
16   VERB
17   X
18   [PAD]


## Probing model performance

## Make dataloader

Make a NERUSStorage class suitable for easy work with dataloader

In [11]:
class NERUSstorage(Dataset):
    def __init__(self, dataset, token_to_index, tag_to_id):
        self.x_data, self.y_data, self.t2i, self.tg2i  = dataset[0], dataset[1], token_to_index, tag_to_id

    def __len__(self):
        return len(self.x_data)
    
    def __str__(self):
        return str(self.x_data)
    
    def __getitem__(self, idx):
        return self.x_data[idx], self.y_data[idx]

# make dataset into array of NERUSstorages
total_len = len(d_train[0][0])

x_train_u, y_train_u = list(), list()
for idx in tqdm(range(len(d_train[0]))):
    x_train_u.append(torch.LongTensor([word_to_index.get(tag, word_to_index['[UNK]']) for tag in d_train[0][idx]]))
    y_train_u.append(torch.LongTensor([tag_to_id.get(tag, word_to_index['X']) for tag in d_train[1][idx]]))
    
# getting the dataset
dataset = NERUSstorage([x_train_u, y_train_u], word_to_index, tag_to_id)
test_dataset = NERUSstorage([x_train_u, y_train_u], word_to_index, tag_to_id)

  0%|          | 0/41196 [00:00<?, ?it/s]

In [12]:
# getting dataloader with batches of size 64
dl_train = DataLoader(dataset,      batch_size=64, shuffle=True)
dl_test  = DataLoader(test_dataset, batch_size=64, shuffle=True)

### Training 

In [13]:
# let's take our already prepared functions and classes
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    model.train()
    model.zero_grad()
    
    prediction = model(batch_of_x.to(model.device)).transpose(1,2)
    loss = loss_function(prediction.to(model.device), batch_of_y.to(model.device))
    loss.backward()
    
    optimizer.step()
    
    return loss.cpu().item()

def train_epoch(train_generator, model, loss_function, optimizer, callback=None):
    epoch_loss = 0
    total = 0

    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        local_loss = train_on_batch( model, batch_of_x, batch_of_y, optimizer, loss_function)
        train_generator.set_postfix({'train batch loss': local_loss})

        if callback is not None:
            callback(model, local_loss)

        epoch_loss += local_loss * len(batch_of_x)
        total += len(batch_of_x)
    
    return epoch_loss/total

def trainer(count_of_epoch, batch_size, model,
            dataset, tag2idx, token2idx,
            loss_function, optimizer, callback):
    
    iterations = tqdm(range(count_of_epoch))
    for it in iterations:
        optima = optimizer

        number_of_batch = len(dataset[0]) // batch_size + (len(dataset[0])%batch_size > 0)
        batch_generator = tqdm(dl_train)
        
        epoch_loss = train_epoch( train_generator = batch_generator, 
                                  model = model, 
                                  loss_function = loss_function, 
                                  optimizer = optima,
                                  callback = callback)

        iterations.set_postfix({'train epoch loss': epoch_loss})
    
class callback():
    def __init__(self, writer, dataset, tag2idx, token2idx, loss_function, delimeter = 100, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.tag2idx = tag2idx
        self.token2idx = token2idx

        self.dataset = dataset

    def forward(self, model, loss):
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)
        
        if self.step % self.delimeter == 1:
            
            batch_generator = dl_test
            
            pred = []
            real = []
            
            test_loss = 0
            model.eval()
            for it, (batch_of_x, batch_of_y) in enumerate(batch_generator):
                batch_of_x = batch_of_x.to(model.device)
                batch_of_y = batch_of_y.to(model.device)
                
                with torch.no_grad():
                    out = model(batch_of_x.to(model.device))
                    test_loss += self.loss_function(out.transpose(1,2), batch_of_y).cpu().item()*len(batch_of_x)

                pred.extend(torch.argmax(out, dim=-1).cpu().numpy().tolist())
                real.extend(batch_of_y.cpu().numpy().tolist())

            test_loss /= len(self.dataset[0])
            self.writer.add_scalar('LOSS/test', test_loss, self.step)
            
            ans=[]
            my_ans=[]
            for (y, my) in zip(real, pred):
                for (i, idx) in enumerate(y):
                    if idx != tag_to_id['[PAD]']:
                        ans.append(id_to_tag[idx])
                        my_ans.append(id_to_tag[my[i]])

            self.writer.add_text('classification_report/test', str(classification_report(ans, my_ans)), self.step)

    def __call__(self, model, loss):
        return self.forward(model, loss)
    
class RNN(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device
        
    def __init__(self,
                 vocab_dim,
                 output_dim,
                 emb_dim = 20,
                 hidden_dim = 20, 
                 num_layers = 3,
                 dropout = 0,
                 bnorm = False,
                 bidirectional = False):
        super(RNN, self).__init__()
        
        self.num_direction = int(bidirectional + 1)
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.bnorm = bnorm

        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)

        self.encoder = torch.nn.LSTM(emb_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        
        self.classifier = torch.nn.Linear(hidden_dim, output_dim)

        self.batchnorm = torch.nn.BatchNorm1d(hidden_dim)
        
    def forward(self, input):
        out = self.embedding(input) 
        out, (h, c) = self.encoder(out)
        if self.bnorm:
            out = self.batchnorm(torch.transpose(out, 1, 2))
            out = torch.transpose(out, 1, 2)
        return self.classifier(out)

### Initializing model

In [14]:
model = RNN(**{
    'bnorm': True,
    'num_layers': 1,
    'emb_dim' : 100,
    'output_dim': len(unique_tags),
    'vocab_dim': len(word_to_index),
    'hidden_dim': 100,
    'bidirectional': False}
)

# put model to cuda cores (or cpu)
_ = model.to(device)
model.embedding.weight.data.copy_(torch.tensor(fst_arr))

# no grad for params
for param in model.embedding.parameters():
    param.requires_grad = False

# put real model to cuda
model.to(device)

RNN(
  (embedding): Embedding(2000002, 100)
  (encoder): LSTM(100, 100, batch_first=True)
  (classifier): Linear(in_features=100, out_features=19, bias=True)
  (batchnorm): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

### Testing the model

Let's write `TestModel` for probing the model results.

In [15]:
def TestModel(model, generator, tag_to_id, id_to_tag):
    predictions, ys = list(), list()
    model.eval()
    
    for _, batch in enumerate(generator):
        x_batch, y_batch = batch[0].to(device), batch[1].cpu().numpy().tolist()
        with torch.no_grad():
            prediction = torch.argmax(model(x_batch), dim=-1)
        out = prediction.cpu().numpy().tolist()
        predictions.extend(out)
        ys.extend(y_batch)

    result=[]
    expected_result=[]
    
    for (y, prediction) in zip(ys, predictions):
        for (id, idx) in enumerate(y):
            if idx != tag_to_id['[PAD]']:
                result.append(id_to_tag[idx])
                expected_result.append(id_to_tag[prediction[id]])
    
    print(classification_report(result, expected_result))

### Probe before training

In [16]:
TestModel(model, dl_train, tag_to_id, id_to_tag)

              precision    recall  f1-score   support

         ADJ       0.00      0.00      0.00     66919
         ADP       0.00      0.00      0.00     84882
         ADV       0.03      0.61      0.05     19495
         AUX       0.00      0.00      0.00      4896
       CCONJ       0.00      0.01      0.00     17486
         DET       0.01      0.00      0.00     11409
        INTJ       0.00      0.00      0.00        29
        NOUN       0.06      0.00      0.00    203611
         NUM       0.00      0.00      0.00     14558
        PART       0.00      0.00      0.00      8957
        PRON       0.02      0.15      0.04     26351
       PROPN       0.00      0.00      0.00     53279
       PUNCT       0.00      0.00      0.00    130097
       SCONJ       0.00      0.00      0.00     12357
         SYM       0.00      0.00      0.00       191
        VERB       0.03      0.00      0.00     85192
           X       0.00      0.00      0.00      7255

    accuracy              

### Training process
Let's train model with `AdamW` and `CEL` loss function. Note how we are ignoring IDs of `'[PAD]'` since those shouldn't be in the sentences.

In [17]:
loss = torch.nn.CrossEntropyLoss(ignore_index=tag_to_id['[PAD]'])
# for all parameters no grad is required
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
# setup writer
writer = SummaryWriter(log_dir = f'./log')


# let's try training
cb_result = callback(writer, d_test, tag_to_id, word_to_index, loss, delimeter = 150)
trainer(count_of_epoch=4, batch_size=64,
        dataset=d_train, model=model,
        tag2idx=tag_to_id, token2idx=word_to_index,
        loss_function=loss, optimizer=opt,
        callback=cb_result)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/644 [00:00<?, ?it/s]

  0%|          | 0/644 [00:00<?, ?it/s]

  0%|          | 0/644 [00:00<?, ?it/s]

  0%|          | 0/644 [00:00<?, ?it/s]

### Probe after training

In [18]:
TestModel(model, dl_test, tag_to_id, id_to_tag)

              precision    recall  f1-score   support

         ADJ       0.93      0.93      0.93     66919
         ADP       1.00      1.00      1.00     84882
         ADV       0.94      0.89      0.92     19495
         AUX       0.93      0.99      0.96      4896
       CCONJ       0.98      0.99      0.98     17486
         DET       0.86      0.85      0.86     11409
        INTJ       0.58      0.24      0.34        29
        NOUN       0.98      0.98      0.98    203611
         NUM       0.94      0.93      0.94     14558
        PART       0.97      0.90      0.93      8957
        PRON       0.94      0.94      0.94     26351
       PROPN       0.92      0.95      0.93     53279
       PUNCT       1.00      1.00      1.00    130097
       SCONJ       0.93      0.98      0.96     12357
         SYM       0.66      0.87      0.75       191
        VERB       0.97      0.96      0.97     85192
           X       0.88      0.82      0.85      7255

    accuracy              

# Loading tensorboard

In [23]:
%load_ext tensorboard
%tensorboard --logdir './log'

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 3585901), started 0:06:20 ago. (Use '!kill 3585901' to kill it.)

# Results

- `Fasttext` achieves incredible results even since first epoch.
- Pre-trained model is better for results.
- `reduce_model` is extremely heavy and requires more than 30gb of RAM in this case.
- `Fasttext` is good if you can run in on cluster.