In [1]:
# !pip install razdel transformers datasets seqeval wandb

In [2]:
import random
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
def set_random_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # Add smth here if you need...

superseed = 1
set_random_seed(superseed)

### Чтение данных

In [4]:
# логирование результатов
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mdanessely[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
data = pd.read_csv("ner_data_train.csv")
data.head(5)

Unnamed: 0,video_info,entities
0,<НАЗВАНИЕ:> Агент 117: Из Африки с любовью — Р...,"{""label"":""локация""\,""offset"":26\,""length"":6\,""..."
1,<НАЗВАНИЕ:> Коленвал Инфинити Ку икс 56= 5.6 V...,"{""label"":""организация""\,""offset"":196\,""length""..."
2,<НАЗВАНИЕ:> ВЫЗОВ ДЕМОНА = Вызвал Серого Челов...,"{""label"":""название проекта""\,""offset"":12\,""len..."
3,<НАЗВАНИЕ:> Довоенная немецкая кирха в Калинин...,"{""label"":""не найдено""\,""offset"":162\,""length"":..."
4,"<НАЗВАНИЕ:> ""Спартаку"" помогли судьи? Локомоти...","{""label"":""команда""\,""offset"":13\,""length"":8\,""..."


In [6]:
# данные спарсены с Толоки, поэтому могут иметь проблемы с символами и их нужно избежать,
# удалить лишние '\' например, преобразовать из str в список dict-ов
import json
df = data.copy()
df['entities'] = df['entities'].apply(lambda l: l.replace('\,', ',')if isinstance(l, str) else l)
df['entities'] = df['entities'].apply(lambda l: l.replace('\\\\', '\\')if isinstance(l, str) else l)
df['entities'] = df['entities'].apply(lambda l: '[' + l + ']'if isinstance(l, str) else l)
df['entities'] = df['entities'].apply(lambda l: json.loads(l)if isinstance(l, str) else l)

df.head(3)

Unnamed: 0,video_info,entities
0,<НАЗВАНИЕ:> Агент 117: Из Африки с любовью — Р...,"[{'label': 'локация', 'offset': 26, 'length': ..."
1,<НАЗВАНИЕ:> Коленвал Инфинити Ку икс 56= 5.6 V...,"[{'label': 'организация', 'offset': 196, 'leng..."
2,<НАЗВАНИЕ:> ВЫЗОВ ДЕМОНА = Вызвал Серого Челов...,"[{'label': 'название проекта', 'offset': 12, '..."


In [7]:
# Теперь из наших данных нам нужно извлечь для каждого слова (токена) его тег (label) из разметки, чтобы потом предать в модель классификации токенов
from razdel import tokenize

def extract_labels(item):

    # воспользуемся удобным токенайзером из библиотеки razdel,
    # она помимо разбиения на слова, сохраняет важные для нас числа - начало и конец слова в токенах

    raw_toks = list(tokenize(item['video_info']))
    words = [tok.text for tok in raw_toks]
    # words = item['video_info']
    # присвоим для начала каждому слову тег 'О' - тег, означающий отсутствие NER-а
    word_labels = ['O'] * len(raw_toks)
    char2word = [None] * len(item['video_info'])
    # так как NER можем состаять из нескольких слов, то нам нужно сохранить эту инфорцию
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    labels = item['entities']
    if isinstance(labels, dict):
        labels = [labels]
    if labels is not None:
        for e in labels:
            if e['label'] != 'не найдено':
                e_words = sorted({idx for idx in char2word[e['offset']:e['offset']+e['length']] if idx is not None})
                if e_words:
                    word_labels[e_words[0]] = 'B-' + e['label']
                    for idx in e_words[1:]:
                        word_labels[idx] = 'I-' + e['label']
                else:
                    continue
            else:
                continue
        return {'tokens': words, 'tags': word_labels}
    else: return {'tokens': words, 'tags': word_labels}

In [8]:
print(extract_labels(df.iloc[0]))

{'tokens': ['<', 'НАЗВАНИЕ', ':', '>', 'Агент', '117', ':', 'Из', 'Африки', 'с', 'любовью', '—', 'Русский', 'тизер', '=', 'трейлер', '(', '2021', ')', '<', 'ОПИСАНИЕ', ':', '>', 'Лучший', 'Telegram', 'канал', 'о', 'кино', '<', 'LINK', '>', 'Сотрудничество', '<', 'LINK', '>', 'Дата', 'выхода', '26', 'августа', '2021', 'Оригинальное', 'название', ':', 'OSS', '117', ':', 'Alerte', 'rouge', 'en', 'Afrique', 'noire', 'Страна', ':', 'Франция', 'Режиссер', ':', 'Николя', 'Бедос', 'Жанр', ':', 'боевик', ',', 'комедия', 'В', 'главных', 'ролях', ':', 'Жан', 'Дюжарден', ',', 'Пьер', 'Нинэ', ',', 'Мелоди', 'Каста', ',', 'Наташа', 'Линдинжер', ',', 'Владимир', 'Иорданов', ',', 'Фату', 'Н', '’', 'Диайе', ',', 'Пол', 'Уайт', 'Мир', 'изменился', '.', 'Он', 'нет', '.', 'Судьба', 'заносит', 'легендарного', 'Агента', '117', 'в', 'Африку', ',', 'где', 'горячее', 'пустыни', 'только', 'женщины', '.', 'Вооруженный', 'неиссякаемой', 'уверенностью', 'в', 'себе', 'и', 'убийственной', 'харизмой', ',', 'он', 'мож

### Разбивка данных

In [9]:
from sklearn.model_selection import train_test_split
ner_data = [extract_labels(item) for i, item in df.iterrows()]
ner_train, ner_test = train_test_split(ner_data, test_size=0.2, random_state=superseed)

In [10]:
import pandas as pd
pd.options.display.max_colwidth = 300
pd.DataFrame(ner_train).sample(3, random_state=superseed)

Unnamed: 0,tokens,tags
3035,"[<, НАЗВАНИЕ, :, >, Убрался, в, дерево, ., Что, случилось, за, кадром, ?, Новогодний, конкурс, <, ОПИСАНИЕ, :, >, Условия, конкурса, просты, :, Для, Вконтакте, сделать, репост, этой, записи, (, <, LINK, >, 71769372_33169, ), Для, YouTube, написать, в, комментариях, #, дедебашит, Аренда, авто, Кр...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-организация, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1714,"[<, НАЗВАНИЕ, :, >, Так, в, России, не, учат, ,, новая, школа, программирования, от, Сбера, <, ОПИСАНИЕ, :, >, Не, так, много, людей, знает, что, такое, Школа, 21, (, School, 21, ), ,, кто, то, слышал, ,, но, по, прежнему, возникает, много, вопросов, :, как, учат, ,, как, попасть, ,, что, получи...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-организация, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-организация, O, B-организация, I-организация, O, O, O, O, B-персона, O, O, O, O, B-локация, O, O, O, O, B-п..."
280,"[<, НАЗВАНИЕ, :, >, КАНАН, АББАСОВ, =, ИНТЕРВЬЮ, с, актёром, сериала, "", ДИАГНОЗ, "", (, часть, 2, ), 😳, <, ОПИСАНИЕ, :, >, Интервью, с, актёром, моего, проекта, сериала, на, Rutube, "", Диагноз, "", Кананом, Аббасовым, как, проходили, съёмки, ?, какие, были, сложности, ?, жалеет, ли, Вадим, об, уч...","[O, O, O, O, B-персона, I-персона, O, O, O, O, O, O, B-название проекта, O, O, O, B-сезон, O, O, O, O, O, O, O, O, O, O, O, O, O, B-бренд, O, B-название проекта, I-название проекта, B-персона, I-персона, O, O, O, O, O, O, O, O, O, O, B-персона, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [11]:
label_list = sorted({label for item in ner_train for label in item['tags']})
if 'O' in label_list:
    label_list.remove('O')
    label_list = ['O'] + label_list
label_list

['O',
 'B-Дата',
 'B-бренд',
 'B-вид спорта',
 'B-видеоигра',
 'B-команда',
 'B-лига',
 'B-локация',
 'B-модель',
 'B-название проекта',
 'B-организация',
 'B-персона',
 'B-сезон',
 'B-серия',
 'I-Дата',
 'I-бренд',
 'I-вид спорта',
 'I-видеоигра',
 'I-команда',
 'I-лига',
 'I-локация',
 'I-модель',
 'I-название проекта',
 'I-организация',
 'I-персона',
 'I-сезон',
 'I-серия']

In [12]:
labels_to_ids = {v:k for k,v in enumerate(label_list)}
ids_to_labels = {k:v for k,v in enumerate(label_list)}

In [13]:
class NERDataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len=512):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels
        tokens = self.data.tokens[index]
        word_labels = self.data.tags[index]

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        tokenized_inputs = self.tokenizer(tokens, truncation=True,
                                     is_split_into_words=True,
                                     padding='max_length',
                                     max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        # pdb.set_trace()
        labels = [labels_to_ids[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        # print(len(sentence), len(labels))

        word_ids = tokenized_inputs.word_ids()
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(labels[word_idx])
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        item = {key: torch.as_tensor(val) for key, val in tokenized_inputs.items()}
        item['labels'] = torch.as_tensor(label_ids)

        return item

  def __len__(self):
        return self.len

### Выбор модели

In [14]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertModel, BertConfig
from datasets import load_metric

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# model_checkpoint = "ai-forever/ruBert-base"
# model_checkpoint = "cointegrated/rubert-tiny2"
model_checkpoint = "ai-forever/ruBert-large"

In [16]:
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    device=device,
    add_prefix_space=True
)

In [17]:
train_set = NERDataset(pd.DataFrame(ner_train), tokenizer=tokenizer)
test_set = NERDataset(pd.DataFrame(ner_test), tokenizer=tokenizer)

print("FULL Dataset: {}".format(len(ner_train) + len(ner_test)))
print("TRAIN Dataset: {}".format(len(ner_train)))
print("TEST Dataset: {}".format(len(ner_test)))

FULL Dataset: 6422
TRAIN Dataset: 5137
TEST Dataset: 1285


In [18]:
train_params = {'batch_size': 32,
                'shuffle': True,
                }

test_params = {'batch_size': 64,
                'shuffle': False,
                }

train_loader = DataLoader(train_set, **train_params)
test_loader = DataLoader(test_set, **test_params)

In [19]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# # проверка выхода
# model.to(device)
# inputs = next(iter(train_loader))
# input_ids = inputs["input_ids"]
# attention_mask = inputs["attention_mask"]
# labels = inputs["labels"]

# input_ids = input_ids.to(device)
# attention_mask = attention_mask.to(device)
# labels = labels.to(device)

# outputs = model(input_ids, attention_mask=attention_mask, labels=labels,
#                return_dict=False)
# initial_loss = outputs[0]
# initial_loss

In [21]:
# outputs = None

### Кастомные модели

In [22]:
class CustomHead(nn.Module):
    """
    A pre-trained BERT model with a custom classifier.
    The classifier is a neural network implemented in this class.
    """
    
    def __init__(self, model_checkpoint, hidden_dim, num_labels):
        super().__init__()
        self.bert = BertConfig.from_pretrained(model_checkpoint)
        self.in_features = self.bert.hidden_size

        self.model = nn.Sequential(
            nn.Linear(in_features=self.in_features, out_features=hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            # nn.BatchNorm1d(num_features=hidden_dim),
            nn.Linear(in_features=hidden_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=num_labels)
        )

        
    def forward(self, bert_output):
        x = bert_output
        return self.model(x)

In [23]:
# custom_classifier = CustomHead(model_checkpoint, 512, 27)
# custom_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# custom_model.classifier = custom_classifier

In [24]:
# custom_model.to(device)
# inputs = next(iter(train_loader))
# input_ids = inputs["input_ids"]
# attention_mask = inputs["attention_mask"]
# labels = inputs["labels"]

# input_ids = input_ids.to(device)
# attention_mask = attention_mask.to(device)
# labels = labels.to(device)

# outputs = custom_model(input_ids, attention_mask=attention_mask, labels=labels,
#                return_dict=True)
# initial_loss = outputs[0]
# initial_loss

### Кастомный bertclass

In [25]:
from transformers import BertForTokenClassification

In [26]:
# in_features = 312
# latent_head_dim = 256
# num_labels = 27

# class BertTokenClassHead(BertForTokenClassification):
#     def __init__(self, config):
#         super().__init__(config)
#         self.classifier = nn.Linear(in_features=in_features, out_features=num_labels)
#         # self.classifier = nn.Sequential(
#             # nn.Linear(in_features=in_features, out_features=latent_head_dim),
#             # nn.ReLU(),
#             # nn.Dropout(p=0.3),
#             # nn.BatchNorm1d(num_features=latent_head_dim),
#             # nn.Linear(in_features=latent_head_dim, out_features=latent_head_dim),
#             # nn.ReLU(),
#             # nn.Linear(in_features=latent_head_dim, out_features=num_labels)
#         # )


In [27]:
# custom_model = BertTokenClassHead.from_pretrained(model_checkpoint, num_labels=len(label_list))

In [28]:
# custom_model.to(device)
# inputs = next(iter(train_loader))
# input_ids = inputs["input_ids"]
# attention_mask = inputs["attention_mask"]
# labels = inputs["labels"]

# input_ids = input_ids.to(device)
# attention_mask = attention_mask.to(device)
# labels = labels.to(device)

# outputs = custom_model(input_ids, attention_mask=attention_mask, labels=labels,
#                return_dict=True)
# initial_loss = outputs[0]
# initial_loss

### метрика

In [29]:
from sklearn.metrics import f1_score
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [30]:
example = ner_train[1]
metric.compute(predictions=[example['tags']], references=[example['tags']])
metric.compute(predictions=[example['tags']], references=[example['tags']])['overall_f1']

1.0

### trainloop

In [31]:
def train_epoch(train_loader, optimizer, model):
    tr_loss, tr_accuracy_seq, tr_accuracy_skl = 0, 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    set_random_seed(superseed)
    for idx, batch in tqdm(enumerate(train_loader), total=len(train_loader)):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels,
                               return_dict=False)
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 200==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 200 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        labels_str = [ids_to_labels[id.item()] for id in labels]
        predictions_str = [ids_to_labels[id.item()] for id in predictions]
        metric_dict = metric.compute(predictions=[predictions_str], references=[labels_str], zero_division=0)
        tmp_tr_accuracy_seq = metric_dict['overall_f1']
        tr_accuracy_seq += tmp_tr_accuracy_seq
        
        metrics = {
            'train/loss': loss.item(),
            'train/f1': tmp_tr_accuracy_seq
        }
        wandb.log(metrics)

        # tmp_tr_accuracy_skl = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='macro')
        # tr_accuracy_skl += tmp_tr_accuracy_skl

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=10
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy_seq = tr_accuracy_seq / nb_tr_steps
    # tr_accuracy_skl = tr_accuracy_skl / nb_tr_steps

    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training seqeval F1 epoch: {tr_accuracy_seq}")
    # print(f"Training sklearn F1 epoch: {tr_accuracy_skl}")
    return epoch_loss, tr_accuracy_seq

In [32]:
def valid_epoch(testing_loader, model):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy_seq, eval_accuracy_skl = 0, 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    set_random_seed(superseed)
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(testing_loader), total=len(testing_loader)):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels,
                                     return_dict=False)
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            # tmp_eval_accuracy = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='micro')
            labels_str = [ids_to_labels[id.item()] for id in labels]
            predictions_str = [ids_to_labels[id.item()] for id in predictions]
            metric_dict = metric.compute(predictions=[predictions_str], references=[labels_str], zero_division=0)
            tmp_eval_accuracy_seq = metric_dict['overall_f1']
            eval_accuracy_seq += tmp_eval_accuracy_seq

            
            # tmp_eval_accuracy_skl = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='macro')
            # eval_accuracy_skl += tmp_eval_accuracy_skl


    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy_seq = eval_accuracy_seq / nb_eval_steps
    # eval_accuracy_skl = eval_accuracy_skl / nb_eval_steps

    metrics = {
        'val/loss': eval_loss,
        'val/f1': eval_accuracy_seq
    }
    wandb.log(metrics)

    print(f"Validation Loss: {eval_loss}")
    print(f"Validation seqeval F1: {eval_accuracy_seq}")
    # print(f"Validation sklearn F1: {eval_accuracy_skl}")

    # return labels, predictions
    return eval_loss, eval_accuracy_seq

In [33]:
def plot_losses(train_losses, test_losses, train_metrics, test_metrics):
    # clear_output()
    fig, axs = plt.subplots(1, 2, figsize=(13, 4))
    axs[0].plot(range(1, len(train_losses) + 1), train_losses, label='train')
    axs[0].plot(range(1, len(test_losses) + 1), test_losses, label='test')
    axs[0].set_ylabel('loss')

    axs[1].plot(range(1, len(train_metrics) + 1), train_metrics, label='train')
    axs[1].plot(range(1, len(test_metrics) + 1), test_metrics, label='test')
    axs[1].set_ylabel('metric')

    for ax in axs:
        ax.set_xlabel('epoch')
        ax.legend()

    plt.show()

In [34]:
def train(
    train_loader, test_loader, num_epoch,
    optimizer, model, scheduler=None
    ):

    train_losses, test_losses = [], []
    train_metrics, test_metrics = [], []

    for i in range(1, num_epoch + 1):
        print(f'Epoch {i}:')
        set_random_seed(superseed)
        train_loss, train_metric = train_epoch(train_loader, optimizer, model)
        train_losses.append(train_loss)
        train_metrics.append(train_metric)

        test_loss, test_metric = valid_epoch(test_loader, model)
        test_losses.append(test_loss)
        test_metrics.append(test_metric)

        if scheduler is not None:
            scheduler.step(test_loss)
#             wandb.log({'lr': scheduler.get_last_lr()})
    plot_losses(train_losses, test_losses, train_metrics, test_metrics)
    return test_metrics[-1]
    # return (train_losses, test_losses, train_metrics, test_metrics)


### train

In [35]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.to(device)


lr = 0.00001
num_epoch = 7
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.9)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=2, gamma=0.1)

# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
#                                                        factor=0.1,
#                                                        patience=2,
#                                                        threshold=0.01,
#                                                        verbose=True
#                                                       )

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
wandb.init(
    # set the wandb project where this run will be logged
    project="dh-perm",
    
    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "optimizer": optimizer,
    "scheduler": scheduler,
    "model": "rubert-large-default",
    "epochs": num_epoch,
    }
)


final_metric = train(train_loader, test_loader, num_epoch, optimizer, model, scheduler)

# train_losses, test_losses, train_metrics, test_metrics = train(test_loader, test_loader, 1, optimizer, model)
print(f'F1 on test: {final_metric}')

wandb.finish()

Problem at: /tmp/ipykernel_12316/3534114478.py 1 <module>


KeyboardInterrupt: 

In [37]:
from tqdm.notebook import tqdm

submission = pd.DataFrame(columns=[['video_info', 'entities_prediction']])
submission['entities_prediction'] = submission['entities_prediction'].astype('object')
def sample_submission(df, tokenizer, model, pipe, submission):
    for i, elem in tqdm(df.iterrows()):
#         print(elem)
        tokens = razdel_info(elem)
#         print(tokens)
        _, _, labels = predict_ner(tokens, tokenizer, model, pipe, verbose=False)
        submission.loc[i, 'video_info'] = df['video_info'].iloc[i]
#         print(elem)
        submission.loc[i, 'entities_prediction'] = [[label] for label in labels]
    return submission

In [None]:
import torch
from transformers import pipeline

pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device='cpu')

def predict_ner(text, tokenizer, model, pipe, verbose=True):
    tokens = tokenizer(text, truncation=True, is_split_into_words=True, return_tensors='pt')
    tokens = {k: v.to(model.device) for k, v in tokens.items()}

    with torch.no_grad():
        pred = model(**tokens)
    # print(pred.logits.shape)
    indices = pred.logits.argmax(dim=-1)[0].cpu().numpy()
    token_text = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
    labels = []
    for t, idx in zip(token_text, indices):
        if '##' not in t:
            labels.append(label_list[idx])
        if verbose:
            print(f'{t:15s} {label_list[idx]:10s}')
    return text, pipe(text), labels

In [39]:
test_submission = df[:10][['video_info']]


In [40]:
sample_submission(test_submission, tokenizer, model, pipe, submission)

NameError: name 'pipe' is not defined

In [37]:
# model.save_pretrained('rubert-large-7e-0.5442')

In [38]:
custom_classifier = CustomHead(model_checkpoint, hidden_dim=512, num_labels=len(label_list))
custom_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
custom_model.classifier = custom_classifier
custom_model.to(device)
# for param in custom_model.bert.named_parameters():
#     param[1].requires_grad=False

lr=0.00001
num_epoch = 10
optimizer = torch.optim.Adam(params=custom_model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.85)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
wandb.init(
    # set the wandb project where this run will be logged
    project="dh-perm",
    
    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "optimizer": optimizer,
    "scheduler": scheduler,
    "model": "rubert-large-custom",
    "epochs": num_epoch,
    "base-freezed": False,
    }
)

final_metric = train(train_loader, test_loader, num_epoch, optimizer, custom_model)
print(f'F1 on test: {final_metric}')

wandb.finish()

Epoch 1:


  0%|          | 0/161 [00:00<?, ?it/s]

Training loss per 200 training steps: 3.2702574729919434


100%|██████████| 161/161 [02:13<00:00,  1.20it/s]


Training loss epoch: 1.360754831237082
Training seqeval F1 epoch: 0.00023672198234454868


  0%|          | 0/21 [00:00<?, ?it/s]

Validation loss per 100 evaluation steps: 0.8791568279266357


100%|██████████| 21/21 [00:16<00:00,  1.25it/s]


Validation Loss: 0.8637285289310274
Validation seqeval F1: 0.0
Epoch 2:


  0%|          | 0/161 [00:00<?, ?it/s]

Training loss per 200 training steps: 0.8783538341522217


 57%|█████▋    | 92/161 [01:17<00:57,  1.19it/s]


KeyboardInterrupt: 

## че-то старое

#### У Bert свой собсвенный токенайзер, который разбивает слова на мелкие токены, поэтому нам нужно корректно сопоставить токены и соответсвующие им неры.

In [None]:
# Посчитаем метрики на отложенном датасете

predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd

In [None]:
cm = pd.DataFrame(
    confusion_matrix(sum(true_labels, []), sum(true_predictions, []), labels=label_list),
    index=label_list,
    columns=label_list
)
cm

In [None]:
model.save_pretrained('ner_bert.bin')
tokenizer.save_pretrained('ner_bert.bin')

### Посмотрим на результаты

In [None]:
# text = ' '.join(ner_train[25]['tokens'])
text = ner_train[25]['tokens']

In [None]:
import torch
from transformers import pipeline

pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device='cpu')

def predict_ner(text, tokenizer, model, pipe, verbose=True):
    tokens = tokenizer(text, truncation=True, is_split_into_words=True, return_tensors='pt')
    tokens = {k: v.to(model.device) for k, v in tokens.items()}

    with torch.no_grad():
        pred = model(**tokens)
    # print(pred.logits.shape)
    indices = pred.logits.argmax(dim=-1)[0].cpu().numpy()
    token_text = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
    labels = []
    for t, idx in zip(token_text, indices):
        if '##' not in t:
            labels.append(label_list[idx])
        if verbose:
            print(f'{t:15s} {label_list[idx]:10s}')
    return text, pipe(text), labels

In [None]:
predict_ner(text, tokenizer, model, pipe)

### Тестового датасета у вас пока нет, по которому будет считаться метрика на лидерборде, но прогоним для примера через нашу отложенную выборку, чтобы понять формат выходных данных.
ВАЖНО: в тестовом датасете у вас будет тест в том же формате, что он был в трейне 'video_info', в финальном сабмишене эту колонку и индексы менять нельзя, нужно будет только заполнить колонку 'entities_prediction'

In [None]:
from tqdm.notebook import tqdm

submission = pd.DataFrame(columns=[['video_info', 'entities_prediction']])
submission['entities_prediction'] = submission['entities_prediction'].astype('object')
def sample_submission(text, tokenizer, model, pipe, submission):
    for i, elem in tqdm(enumerate(ner_test)):
        _, _, labels = predict_ner(elem['tokens'], tokenizer, model, pipe, verbose=False)
        submission.loc[i, 'video_info'] = elem

        submission.loc[i, 'entities_prediction'] = [[label] for label in labels]
    return submission

In [None]:
result = sample_submission(text, tokenizer, model, pipe, submission)

In [None]:
result