# Домашнее задание 2-1. Задача 2. Анализ модели LSTM.

Выполнил: Чемров Кирилл Сергеевич, группа Б01-811

Преподаватели: Грабовой Андрей Валериевич, Воронцов Константин Вячеславович

In [None]:
!pip install lime
!pip install nerus

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[K     |████████████████████████████████| 275 kB 3.9 MB/s 
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283857 sha256=f4be28ec8fd44d86000fa0198347bac7393a76bb3fc1bc908a24ad6eb0b9debc
  Stored in directory: /root/.cache/pip/wheels/ca/cb/e5/ac701e12d365a08917bf4c6171c0961bc880a8181359c66aa7
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
Collecting nerus
  Downloading nerus-1.7.0-py3-none-any.whl (15 kB)
Installing collected packages: nerus
Successfully installed nerus-1.7.0


In [None]:
from copy import deepcopy

import matplotlib.pyplot as plt
from matplotlib.image import imread
from mpl_toolkits import mplot3d
from matplotlib import gridspec
from PIL import Image
import io
from urllib.request import urlopen
from lime import lime_image
from skimage.segmentation import mark_boundaries

from tqdm.notebook import tqdm
import numpy as np
import requests
import torch

from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter

from torchvision import datasets, transforms

from nerus import load_nerus

In [None]:
import warnings
# Globally suppress every warning category from this point on.
warnings.filterwarnings("ignore")

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# Полезные функции для обучения и тестирования моделей

In [None]:
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    """Perform one optimisation step on a single batch and return its loss.

    ``model`` must expose a ``device`` attribute; inputs and targets are moved
    there before use. The model output is transposed on dims 1 and 2 before it
    is passed to ``loss_function`` together with the targets.

    Returns:
        The batch loss as a plain Python float.
    """
    model.train()
    model.zero_grad()

    target_device = model.device
    inputs = batch_of_x.to(target_device)
    targets = batch_of_y.to(target_device)

    scores = model(inputs).transpose(1, 2).to(target_device)
    batch_loss = loss_function(scores, targets)

    batch_loss.backward()
    optimizer.step()

    return batch_loss.cpu().item()

In [None]:
def train_epoch(train_generator, model, loss_function, optimizer, callback=None):
    """Train on every batch of ``train_generator`` and return the mean loss.

    ``train_generator`` must be iterable over ``(batch_of_x, batch_of_y)``
    pairs and provide ``set_postfix`` (it is called after every batch with
    the batch loss). When ``callback`` is given it is invoked as
    ``callback(model, batch_loss)`` after every batch.

    Returns:
        The batch losses averaged with the batch sizes as weights.
    """
    weighted_loss_sum = 0
    samples_seen = 0

    for batch_of_x, batch_of_y in train_generator:
        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)
        train_generator.set_postfix({'train batch loss': batch_loss})

        if callback is not None:
            callback(model, batch_loss)

        batch_len = len(batch_of_x)
        weighted_loss_sum += batch_loss * batch_len
        samples_seen += batch_len

    return weighted_loss_sum / samples_seen

In [None]:
def batch_generator(dataset, pos2idx, token2idx,
                    batch_size=64, shuffle=False):
    """Yield padded ``(x, y)`` batches of token and tag indices.

    Args:
        dataset: pair ``(sentences, tags)``; ``sentences[i]`` is a sequence of
            tokens and ``tags[i]`` the sequence of its POS tags.
        pos2idx: mapping tag -> index; must contain ``'<PAD>'``. Tags missing
            from it are encoded as ``pos2idx['X']``.
        token2idx: mapping token -> index; must contain ``'<PAD>'`` and
            ``'<UNK>'``. Tokens missing from it are encoded as ``'<UNK>'``.
        batch_size: maximum number of sentences per batch; the last batch may
            be smaller.
        shuffle: when true, the sentence order is shuffled with
            ``np.random.shuffle`` before batching.

    Yields:
        Two int64 tensors ``(x, y)``, one row per sentence, padded on the
        right with the ``'<PAD>'`` indices up to the longest sentence of the
        batch.
    """
    X, Y = dataset[0], dataset[1]
    PAD_TOKEN = token2idx['<PAD>']
    PAD_POS = pos2idx['<PAD>']
    UNK_TOKEN = token2idx['<UNK>']

    n_samples = len(X)
    # Sentence order; optionally shuffled in place.
    order = np.arange(0, n_samples, step=1, dtype=np.int64)
    if shuffle:
        np.random.shuffle(order)

    # Slicing past the end simply yields a shorter final batch.
    for start in range(0, n_samples, batch_size):
        batch_indexes = order[start:start + batch_size]

        x_ids = [[token2idx.get(token, UNK_TOKEN) for token in X[i]]
                 for i in batch_indexes]
        # Unknown tags fall back to the POS tag 'X'. The fallback is looked up
        # in pos2idx (not token2idx) and only when it is actually needed.
        y_ids = [[pos2idx[tag] if tag in pos2idx else pos2idx['X'] for tag in Y[i]]
                 for i in batch_indexes]

        # Pad every sentence up to the longest one in this batch.
        max_len = max(len(sent) for sent in x_ids)
        x_arr = np.full((len(x_ids), max_len), PAD_TOKEN, dtype=np.int64)
        y_arr = np.full((len(y_ids), max_len), PAD_POS, dtype=np.int64)

        for row, (sent_x, sent_y) in enumerate(zip(x_ids, y_ids)):
            x_arr[row, :len(sent_x)] = sent_x
            y_arr[row, :len(sent_y)] = sent_y

        yield torch.from_numpy(x_arr), torch.from_numpy(y_arr)

In [None]:
def trainer(count_of_epoch, batch_size, model,
            dataset, pos_tag2idx, token2idx,
            loss_function, optimizer, callback):
    """Train ``model`` for ``count_of_epoch`` epochs with tqdm progress bars.

    Args:
        count_of_epoch: number of passes over ``dataset``.
        batch_size: number of sentences per batch.
        model: model passed on to ``train_epoch``.
        dataset: pair ``(sentences, tags)`` consumed by ``batch_generator``.
        pos_tag2idx: mapping POS tag -> index used to encode the targets.
        token2idx: mapping token -> index used to encode the inputs.
        loss_function: loss passed on to ``train_epoch``.
        optimizer: optimizer passed on to ``train_epoch``.
        callback: per-batch hook passed on to ``train_epoch`` (may be None).
    """
    n_samples = len(dataset[0])
    number_of_batch = n_samples // batch_size + (n_samples % batch_size > 0)

    iterations = tqdm(range(count_of_epoch))
    for _ in iterations:
        # Use the mapping passed by the caller rather than a module-level
        # `pos2idx`, so the function works with any tag dictionary.
        generator = tqdm(batch_generator(dataset, pos_tag2idx, token2idx,
                                         batch_size=batch_size, shuffle=True),
                         leave=False, total=number_of_batch)

        epoch_loss = train_epoch(train_generator=generator,
                                 model=model,
                                 loss_function=loss_function,
                                 optimizer=optimizer,
                                 callback=callback)

        iterations.set_postfix({'train epoch loss': epoch_loss})

Функция callback для отслеживания обучения в TensorBoard

In [None]:
class callback():
    """Callable training hook that logs metrics to TensorBoard.

    Every call logs the training loss of the batch. On calls where
    ``step % delimeter == 1`` the model is additionally evaluated on
    ``dataset``: the loss, a ``classification_report`` over all non-padding
    tags and a few example tag sequences are written to ``writer``.
    """

    # Positions (in dataset order) of the sentences logged as
    # 'TEXT/test1', 'TEXT/test2' and 'TEXT/test3'.
    _EXAMPLE_INDEXES = (0, 42, 142)

    def __init__(self, writer, dataset, pos2idx, token2idx, loss_function, delimeter = 100, batch_size=64):
        """Store the evaluation setup.

        Args:
            writer: object providing ``add_scalar`` and ``add_text``.
            dataset: pair ``(sentences, tags)`` used for evaluation.
            pos2idx: mapping POS tag -> index; must contain ``'<PAD>'``.
            token2idx: mapping token -> index.
            loss_function: loss used to compute the evaluation loss.
            delimeter: evaluation period measured in calls.
            batch_size: batch size used during evaluation.
        """
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.pos2idx = pos2idx
        self.token2idx = token2idx
        # Inverse mapping, derived here so that decoding never depends on a
        # module-level `idx2pos` global.
        self.idx2pos = {idx: tag for tag, idx in pos2idx.items()}

        self.dataset = dataset

    def _evaluate(self, model):
        """Return ``(mean_loss, real, pred)`` for one pass over the dataset."""
        real, pred = [], []
        test_loss = 0
        generator = batch_generator(self.dataset, self.pos2idx, self.token2idx,
                                    batch_size=self.batch_size)
        model.eval()
        with torch.no_grad():
            for batch_of_x, batch_of_y in generator:
                batch_of_x = batch_of_x.to(model.device)
                batch_of_y = batch_of_y.to(model.device)

                output = model(batch_of_x)
                batch_loss = self.loss_function(output.transpose(1, 2), batch_of_y)
                test_loss += batch_loss.cpu().item() * len(batch_of_x)

                pred.extend(torch.argmax(output, dim=-1).cpu().numpy().tolist())
                real.extend(batch_of_y.cpu().numpy().tolist())

        return test_loss / len(self.dataset[0]), real, pred

    def _decode(self, real, pred):
        """Drop padded positions and map tag indices back to tag names."""
        pad_idx = self.pos2idx['<PAD>']
        real_tags, pred_tags = [], []
        for sent_real, sent_pred in zip(real, pred):
            kept = [i for i, idx in enumerate(sent_real) if idx != pad_idx]
            real_tags.append([self.idx2pos[sent_real[i]] for i in kept])
            pred_tags.append([self.idx2pos[sent_pred[i]] for i in kept])
        return real_tags, pred_tags

    def forward(self, model, loss):
        """Log the training loss and, periodically, evaluation results."""
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        if self.step % self.delimeter != 1:
            return
        # Nothing to evaluate (and nothing to average over) on an empty set.
        if len(self.dataset[0]) == 0:
            return

        test_loss, real, pred = self._evaluate(model)
        self.writer.add_scalar('LOSS/test', test_loss, self.step)

        real4report, pred4report = self._decode(real, pred)
        flat_real = [tag for sent in real4report for tag in sent]
        flat_pred = [tag for sent in pred4report for tag in sent]

        self.writer.add_text('REPORT/test', str(classification_report(flat_real, flat_pred)), self.step)

        # Prediction examples; positions beyond the dataset size are skipped
        # instead of raising IndexError.
        for number, index in enumerate(self._EXAMPLE_INDEXES, start=1):
            if index >= len(real4report):
                continue
            real_exm = ' '.join(real4report[index])
            pred_exm = ' '.join(pred4report[index])
            self.writer.add_text(f'TEXT/test{number}',
                                 f'Real POS tags: {real_exm}\n\nPredicted: {pred_exm}',
                                 self.step)

    def __call__(self, model, loss):
        return self.forward(model, loss)

# Выборка NERUS

Выборка представляет собой набор размеченных новостных статей. 

Подробнее: https://natasha.github.io/nerus/ 

Будем использовать уже готовую токенизацию. Задача состоит в классификации слов по частям речи (POS TAGs). Вся выборка слишком большая, в целях экономии времени и вычислительных ресурсов возьмем 8000 новостей.

In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Open the NERUS archive stored on the mounted drive.
docs = load_nerus('/content/drive/MyDrive/Colab Notebooks/ML/nerus_lenta.conllu.gz')

In [None]:
# Take the next document from `docs` and print its NER markup.
doc = next(docs)
print(doc.ner)

NERMarkup(text='Вице-премьер по социальным вопросам Татьяна Голикова рассказала, в каких регионах России зафиксирована наиболее высокая смертность от рака, сообщает РИА Новости. По словам Голиковой, чаще всего онкологические заболевания становились причиной смерти в Псковской, Тверской, Тульской и Орловской областях, а также в Севастополе. Вице-премьер напомнила, что главные факторы смертности в России — рак и болезни системы кровообращения. В начале года стало известно, что смертность от онкологических заболеваний среди россиян снизилась впервые за три года. По данным Росстата, в 2017 году от рака умерли 289 тысяч человек. Это на 3,5 процента меньше, чем годом ранее.', spans=[Span(start=36, stop=52, type='PER'), Span(start=82, stop=88, type='LOC'), Span(start=149, stop=160, type='ORG'), Span(start=172, stop=181, type='PER'), Span(start=251, stop=260, type='LOC'), Span(start=262, stop=270, type='LOC'), Span(start=272, stop=280, type='LOC'), Span(start=283, stop=301, type='LOC'), Span(s

In [None]:
# Print the morphological markup of the document's first sentence.
sent = doc.sents[0]
print(sent.morph)

MorphMarkup(tokens=[MorphToken(text='Вице-премьер', pos='NOUN', feats={'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}), MorphToken(text='по', pos='ADP', feats={}), MorphToken(text='социальным', pos='ADJ', feats={'Case': 'Dat', 'Degree': 'Pos', 'Number': 'Plur'}), MorphToken(text='вопросам', pos='NOUN', feats={'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Plur'}), MorphToken(text='Татьяна', pos='PROPN', feats={'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing'}), MorphToken(text='Голикова', pos='PROPN', feats={'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing'}), MorphToken(text='рассказала', pos='VERB', feats={'Aspect': 'Perf', 'Gender': 'Fem', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}), MorphToken(text=',', pos='PUNCT', feats={}), MorphToken(text='в', pos='ADP', feats={}), MorphToken(text='каких', pos='DET', feats={'Case': 'Loc', 'Number': 'Plur'}), MorphToken(text='рег

In [None]:
# Re-open the corpus and collect, sentence by sentence, the token texts and
# their POS tags from the first n_docs documents.
docs = load_nerus('/content/drive/MyDrive/Colab Notebooks/ML/nerus_lenta.conllu.gz')
n_docs = 8000

tokens, pos_tags = [], []
for _ in range(n_docs):
    doc = next(docs)
    for sent in doc.sents:
        pairs = [(word.text, word.pos) for word in sent.tokens]
        tokens.append([text for text, _ in pairs])
        pos_tags.append([pos for _, pos in pairs])

print(f"Number of sentences: {len(tokens)}")


Number of sentences: 95123


Разделим выборку на тестовую (20%) и обучающую (80%)

In [None]:
sents_num = len(tokens)
# Each sentence goes to the training part with probability 0.8.
train_mask = (np.random.rand(sents_num) < 0.8)
# Sentences have different lengths, so the arrays must be created with
# dtype=object: NumPy >= 1.24 raises ValueError for ragged nested lists
# when no dtype is given.
tokens = np.array(tokens, dtype=object)
pos_tags = np.array(pos_tags, dtype=object)
data_train = [tokens[train_mask], pos_tags[train_mask]]
data_test = [tokens[~train_mask], pos_tags[~train_mask]]

print(f"Train size = {len(data_train[0])};   test size: {len(data_test[0])}; ")
print(f"{len(data_train[0]) + len(data_test[0])}=={sents_num}")

Train size = 76120;   test size: 19003; 
95123==95123


Создадим множество из pos тэгов

In [None]:
pos_tags = data_train[1]
# '<PAD>' is the extra tag used to pad sentences up to the maximum length.
pos_tags_set = {'<PAD>'}
for sent_pos_tags in pos_tags:
    pos_tags_set.update(sent_pos_tags)
pos_tags_list = sorted(pos_tags_set)
print(f"{len(pos_tags_list)} pos tags: {pos_tags_list}")

18 pos tags: ['<PAD>', 'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


ADJ: adjective

ADP: adposition

ADV: adverb

AUX: auxiliary

CCONJ: coordinating conjunction

DET: determiner

INTJ: interjection

NOUN: noun

NUM: numeral

PART: particle

PRON: pronoun

PROPN: proper noun

PUNCT: punctuation

SCONJ: subordinating conjunction

SYM: symbol

VERB: verb

X: other

In [None]:
#tokens_set = set(['<PAD>', '<UNK>']) # <PAD> - дополнение до максимальной длины предложения
# for sent_tokens in tokens:
#     for token in  sent_tokens:
#         tokens_set.add(token)
# print(f"{len(tokens_set)} types of tokens")

Создадим словарь из токенов

In [None]:
tokens = data_train[0]

# Count how many times each token occurs in the training sentences.
token2freq = {}
for sent in tokens:
    for token in sent:
        token2freq[token] = token2freq.get(token, 0) + 1
len(token2freq)

116765

Отсортируем словарь по частоте токенов

In [None]:
# Most frequent tokens first; tokens with equal counts keep their original order.
sorted_token2freq = sorted(token2freq.items(), key=lambda pair: pair[1], reverse=True)
print(sorted_token2freq[75000])

('Вижу', 1)


Ограничим размер словаря до 100000 наиболее часто встречающихся слов (без учета PAD и UNK)

Перезапускал ноутбук со значением размера словаря 75000.

In [None]:
# voc_size = 100000
voc_size = 75000
# Keep the first voc_size tokens of the frequency-sorted list.
tokens_set = [token for token, _ in sorted_token2freq[:voc_size]]
tokens_set[-10:]
# the last 10 tokens that were kept

['1424',
 '1726',
 '1676',
 'разгромила',
 '41-ю',
 '49-ю',
 'Джейпи',
 'Сент-Пол',
 'Grindr',
 'управлявшего']

Создадим словари pos тэгов и токенов

In [None]:
# Tag <-> index mappings follow the order of pos_tags_list.
pos2idx = {tag: idx for idx, tag in enumerate(pos_tags_list)}
idx2pos = dict(enumerate(pos_tags_list))
# Token indices start at 2; 0 and 1 are assigned to '<PAD>' and '<UNK>' below.
token2idx = {token: idx for idx, token in enumerate(tokens_set, start=2)}
idx2token = dict(enumerate(tokens_set, start=2))
token2idx['<PAD>'] = 0
token2idx['<UNK>'] = 1
for mapping in (pos2idx, idx2pos):
    print(mapping)
print(list(token2idx.items())[:20])
for special in ('<PAD>', '<UNK>'):
    print(f"({special}, {token2idx[special]})")

{'<PAD>': 0, 'ADJ': 1, 'ADP': 2, 'ADV': 3, 'AUX': 4, 'CCONJ': 5, 'DET': 6, 'INTJ': 7, 'NOUN': 8, 'NUM': 9, 'PART': 10, 'PRON': 11, 'PROPN': 12, 'PUNCT': 13, 'SCONJ': 14, 'SYM': 15, 'VERB': 16, 'X': 17}
{0: '<PAD>', 1: 'ADJ', 2: 'ADP', 3: 'ADV', 4: 'AUX', 5: 'CCONJ', 6: 'DET', 7: 'INTJ', 8: 'NOUN', 9: 'NUM', 10: 'PART', 11: 'PRON', 12: 'PROPN', 13: 'PUNCT', 14: 'SCONJ', 15: 'SYM', 16: 'VERB', 17: 'X'}
[(',', 2), ('.', 3), ('в', 4), ('и', 5), ('«', 6), ('»', 7), ('на', 8), ('что', 9), ('—', 10), ('с', 11), ('не', 12), ('В', 13), ('по', 14), (')', 15), ('(', 16), ('из', 17), ('этом', 18), ('о', 19), ('его', 20), ('за', 21)]
(<PAD>, 0)
(<UNK>, 1)


# Модель рекуррентной нейросети

In [None]:
class Encoder(torch.nn.Module):
    """Token tagger: embedding -> LSTM -> optional batch norm -> linear layer.

    Args:
        vocab_dim: number of rows of the embedding table.
        pos_tag_dim: number of output classes per token.
        emb_dim: embedding size.
        hidden_dim: LSTM hidden size (per direction).
        num_layers: number of stacked LSTM layers.
        dropout: dropout value passed to the LSTM.
        bnorm: when true, batch normalisation is applied to the LSTM output
            before classification.
        bidirectional: when true, the LSTM is bidirectional and the layers
            after it take ``2 * hidden_dim`` features.
    """

    @property
    def device(self):
        """Device of the first model parameter."""
        return next(self.parameters()).device

    def __init__(self,
                 vocab_dim,
                 pos_tag_dim,
                 emb_dim = 20,
                 hidden_dim = 20,
                 num_layers = 3,
                 dropout = 0,
                 bnorm = False,
                 bidirectional = False):
        super(Encoder, self).__init__()

        self.num_direction = int(bidirectional + 1)
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.bnorm = bnorm

        # A bidirectional LSTM concatenates both directions in its output.
        lstm_out_dim = hidden_dim * self.num_direction

        # Vector representation of the tokens.
        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)

        # `bidirectional` is forwarded to the LSTM; without this the flag
        # would be silently ignored.
        self.encoder = torch.nn.LSTM(emb_dim, hidden_dim, num_layers,
                                     dropout=dropout, batch_first=True,
                                     bidirectional=bool(bidirectional))
        # Per-token classification on top of the LSTM output.
        self.classifier = torch.nn.Linear(lstm_out_dim, pos_tag_dim)

        self.batchnorm = torch.nn.BatchNorm1d(lstm_out_dim)

    def forward(self, input):
        """Return per-token class scores for a batch of token indices.

        The LSTM is created with ``batch_first=True``, so ``input`` is
        expected as (batch, sequence) indices and the result has one score
        vector of size ``pos_tag_dim`` per token.
        """
        out = self.embedding(input)
        out, _ = self.encoder(out)
        if self.bnorm:
            # BatchNorm1d normalises dim 1, so the feature axis is moved
            # there and back.
            out = self.batchnorm(torch.transpose(out, 1, 2))
            out = torch.transpose(out, 1, 2)
        return self.classifier(out)

# Исходный вариант модели

In [None]:
# Baseline: a single-layer model with the default sizes, moved to `device`.
encoder = Encoder(
    vocab_dim=len(token2idx),
    pos_tag_dim=len(pos2idx),
    num_layers=1,
).to(device)

# Padded target positions are excluded from the loss.
loss_function = torch.nn.CrossEntropyLoss(ignore_index=pos2idx['<PAD>'])
optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
encoder

Encoder(
  (embedding): Embedding(75002, 20)
  (encoder): LSTM(20, 20, batch_first=True)
  (classifier): Linear(in_features=20, out_features=18, bias=True)
  (norm): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

Обучение

In [None]:
writer = SummaryWriter(log_dir = f'/content/drive/MyDrive/Colab Notebooks/task_1-2/vocsize-{voc_size}/layers-1_hdim-20_dropout-0_bnorm-0')
call = callback(writer, data_test, pos2idx, token2idx, loss_function, delimeter = 100)

try:
    trainer(count_of_epoch = 10,
            batch_size = 64,
            model = encoder,
            dataset = data_train,
            pos_tag2idx = pos2idx,
            token2idx = token2idx,
            loss_function = loss_function,
            optimizer = optimizer,
            callback = call)
finally:
    # Flush pending events and release the event file, even when training
    # is interrupted.
    writer.close()

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

# Увеличим количество слоев

In [None]:
# Experiment: vary the number of LSTM layers.
for n_layers in [2,3,7]: #2,3,7
    encoder = Encoder(vocab_dim=len(token2idx), pos_tag_dim=len(pos2idx), num_layers=n_layers)
    encoder.to(device)

    optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
    loss_function = torch.nn.CrossEntropyLoss(ignore_index=pos2idx['<PAD>'])

    writer = SummaryWriter(log_dir = f'/content/drive/MyDrive/Colab Notebooks/task_1-2/vocsize-{voc_size}/layers-{n_layers}_hdim-20_dropout-0_bnorm-0')
    call = callback(writer, data_test, pos2idx, token2idx, loss_function, delimeter = 100)

    try:
        trainer(count_of_epoch = 10,
                batch_size = 64,
                model = encoder,
                dataset = data_train,
                pos_tag2idx = pos2idx,
                token2idx = token2idx,
                loss_function = loss_function,
                optimizer = optimizer,
                callback = call)
    finally:
        # Each run gets its own writer; close it before the next run starts.
        writer.close()

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

# Добавим Dropout

In [None]:
# Experiment: multi-layer models with dropout 0.3.
for n_layers in [3, 5]: #3, 5
    encoder = Encoder(vocab_dim=len(token2idx), pos_tag_dim=len(pos2idx), num_layers=n_layers, dropout=0.3)
    encoder.to(device)

    optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
    loss_function = torch.nn.CrossEntropyLoss(ignore_index=pos2idx['<PAD>'])

    writer = SummaryWriter(log_dir = f'/content/drive/MyDrive/Colab Notebooks/task_1-2/vocsize-{voc_size}/layers-{n_layers}_hdim-20_dropout-03_bnorm-0')
    call = callback(writer, data_test, pos2idx, token2idx, loss_function, delimeter = 100)

    try:
        trainer(count_of_epoch = 10,
                batch_size = 64,
                model = encoder,
                dataset = data_train,
                pos_tag2idx = pos2idx,
                token2idx = token2idx,
                loss_function = loss_function,
                optimizer = optimizer,
                callback = call)
    finally:
        # Each run gets its own writer; close it before the next run starts.
        writer.close()

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

  0%|          | 0/1189 [00:00<?, ?it/s]

# Увеличим скрытый слой

In [None]:
# Experiment: vary the hidden size for 1- and 3-layer models.
for n_layers in [1, 3]: #1, 3
    for h_dim in [50, 100]: #50, 100
        encoder = Encoder(vocab_dim=len(token2idx), pos_tag_dim=len(pos2idx), num_layers=n_layers, hidden_dim=h_dim)
        encoder.to(device)

        optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
        loss_function = torch.nn.CrossEntropyLoss(ignore_index=pos2idx['<PAD>'])

        writer = SummaryWriter(log_dir = f'/content/drive/MyDrive/Colab Notebooks/task_1-2/vocsize-{voc_size}/layers-{n_layers}_hdim-{h_dim}_dropout-0_bnorm-0')
        call = callback(writer, data_test, pos2idx, token2idx, loss_function, delimeter = 100)

        try:
            trainer(count_of_epoch = 10,
                    batch_size = 64,
                    model = encoder,
                    dataset = data_train,
                    pos_tag2idx = pos2idx,
                    token2idx = token2idx,
                    loss_function = loss_function,
                    optimizer = optimizer,
                    callback = call)
        finally:
            # Each run gets its own writer; close it before the next run starts.
            writer.close()

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

# Добавим BatchNorm

In [None]:
# Experiment: enable batch normalisation on the LSTM output.
for n_layers in [1]: #1,3
    for h_dim in [100]:
        encoder = Encoder(vocab_dim=len(token2idx), pos_tag_dim=len(pos2idx), num_layers=n_layers, hidden_dim=h_dim, bnorm=True)
        encoder.to(device)

        optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
        loss_function = torch.nn.CrossEntropyLoss(ignore_index=pos2idx['<PAD>'])

        writer = SummaryWriter(log_dir = f'/content/drive/MyDrive/Colab Notebooks/task_1-2/vocsize-{voc_size}/layers-{n_layers}_hdim-{h_dim}_dropout-0_bnorm-1')
        call = callback(writer, data_test, pos2idx, token2idx, loss_function, delimeter = 100)

        try:
            trainer(count_of_epoch = 10,
                    batch_size = 64,
                    model = encoder,
                    dataset = data_train,
                    pos_tag2idx = pos2idx,
                    token2idx = token2idx,
                    loss_function = loss_function,
                    optimizer = optimizer,
                    callback = call)
        finally:
            # Each run gets its own writer; close it before the next run starts.
            writer.close()

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

  0%|          | 0/1190 [00:00<?, ?it/s]

# TensorBoard

In [None]:
%load_ext tensorboard
%tensorboard --logdir '/content/drive/MyDrive/Colab Notebooks/task_1-2/'

# Выводы

Для всех моделей функция потерь стремится к нулю как на трейне, так и на тесте. До обучения качество всех моделей по метрикам, приведенным в REPORT, плохое. Даже после одной эпохи обучения точность классификации доходит примерно до 96%. Хуже всего классифицируются междометия и символы, так как их очень мало в выборке, что логично для новостных предложений (междометия скорее встречаются в неформальном тексте). Лучше всего классифицируются предлоги и знаки пунктуации, что тоже логично, так как их много в выборке и они значительно отличаются от других токенов.

При увеличении числа слоев, функция потерь уменьшается медленнее, то есть модель нужно обучать дольше. Это ожидаемо, так как увеличение числа слоев усложняет модель.

Переобучение не наблюдали, тем более в случае большого числа слоев. Поэтому включение dropout бесполезно.

Увеличение размерности скрытого слоя ускоряет обучение, хотя итоговая точность классификации не сильно улучшается. 

Добавление BatchNorm позволяет ускорить процесс обучения, но незначительно.

Сокращение размера словаря практически не вносит изменений в графики функции потерь и качество модели. Это ожидаемо, так как словарь был ограничен несильно: убирались только слова, которые встречаются в обучающей выборке один раз. 


