In [1]:
# visualization libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# pytorch libraries
import torch # the main pytorch library
import torch.nn as nn # the sub-library containing Softmax, Module and other useful functions
import torch.optim as optim # the sub-library containing the common optimizers (SGD, Adam, etc.)

# huggingface's transformers library
from transformers import AutoTokenizer,AutoModel, AutoModelForTokenClassification
from transformers import RobertaForTokenClassification, RobertaTokenizer,BertTokenizerFast, BertForTokenClassification
from transformers import BertConfig
# huggingface's datasets library
from datasets import load_dataset

# the tqdm library used to show the iteration progress
import tqdm
tqdmn = tqdm.notebook.tqdm

In [2]:
!pip install datasets transformers[sentencepiece] -qq
!pip install seqeval -qq # evaluation metrics for training (not the competition metric)
!pip install --upgrade wandb -qq # experiment tracking

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
from datasets import load_metric

metric = load_metric("seqeval")

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [5]:
model_name = "ai-forever/ruBert-large" #"sberbank-ai/ruBert-base" #"sberbank-ai/ruRoberta-large" # "../input/deeppavlov-rubertbasecased/"#

tokenizer = BertTokenizerFast.from_pretrained(model_name)
#tokenizer = RobertaTokenizer.from_pretrained(model_name)

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

In [6]:
# set the number of epochs 
n_epochs = 1
#With `pytorch` we are able to move the python calculations to the GPU. 
#To do this we define the `device` on which we wish to run the calculations. Depending if `cuda` 
#(the GPU drivers that enable running calculations on the graphic card) is enabled on the machine, we define the device as follows: 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
a = ['O', 'B-MODEL', 'I-MODEL', 'B-COUNTRY', 'I-COUNTRY', 'B-XITEM', 'I-XITEM', 'B-YITEM', 'I-YITEM', 'B-ZITEM', 'I-ZITEM', 'B-XPACK', 'I-XPACK', 'B-YPACK', 'I-YPACK', 'B-ZPACK', 'I-ZPACK', 'B-MATERIAL', 'I-MATERIAL', 'B-COMPLECT', 'I-COMPLECT', 'B-COLOR', 'I-COLOR', 'B-OS', 'I-OS', 'B-PROCESSOR', 'I-PROCESSOR', 'B-VIDEO', 'I-VIDEO', 'B-RAM', 'I-RAM', 'B-KERNELS', 'I-KERNELS', 'B-SSD', 'I-SSD', 'B-DIAGONAL', 'I-DIAGONAL']

In [8]:
# dir 
dir = '/kaggle/input/bert-finetune/'
train_f = f'{dir}train.txt'
dev_f = f'{dir}val.txt'
test_f = f'{dir}test.txt'
real_f = f'{dir}realData.txt'

# TRAINING HYPERPARAMS
BS = 10
GRAD_ACC = 2
LR = 5e-5
WD = 0.01
WARMUP = 0.1
N_EPOCHS = 3

label2id = {
}

id2label={
}

In [9]:
for i in range(len(a)):
    label2id[a[i]]=i
    id2label[i]=a[i]

In [10]:
label2id

{'O': 0,
 'B-MODEL': 1,
 'I-MODEL': 2,
 'B-COUNTRY': 3,
 'I-COUNTRY': 4,
 'B-XITEM': 5,
 'I-XITEM': 6,
 'B-YITEM': 7,
 'I-YITEM': 8,
 'B-ZITEM': 9,
 'I-ZITEM': 10,
 'B-XPACK': 11,
 'I-XPACK': 12,
 'B-YPACK': 13,
 'I-YPACK': 14,
 'B-ZPACK': 15,
 'I-ZPACK': 16,
 'B-MATERIAL': 17,
 'I-MATERIAL': 18,
 'B-COMPLECT': 19,
 'I-COMPLECT': 20,
 'B-COLOR': 21,
 'I-COLOR': 22,
 'B-OS': 23,
 'I-OS': 24,
 'B-PROCESSOR': 25,
 'I-PROCESSOR': 26,
 'B-VIDEO': 27,
 'I-VIDEO': 28,
 'B-RAM': 29,
 'I-RAM': 30,
 'B-KERNELS': 31,
 'I-KERNELS': 32,
 'B-SSD': 33,
 'I-SSD': 34,
 'B-DIAGONAL': 35,
 'I-DIAGONAL': 36}

In [11]:
# read all lines from train
with open(train_f,'r') as train:
    train_words = train.readlines()

# read all lines from dev
with open(dev_f,'r') as dev:
    dev_words = dev.readlines()

# read all lines from test    
with open(test_f,'r') as test:
    test_words = test.readlines()   
    
# read all lines from test    
with open(real_f,'r') as test:
    real_words = test.readlines()   

In [12]:
def get_tokens_ners(words):
    # 
    # read text
    tokens, ner_tags = [],[]
    for word in words:
        if len(word.strip())>0:
            #print(word)
            token, ner = word.split()
            ner = ner.strip('\n')
            tokens.append(token)
            ner_tags.append(ner)
    return tokens, ner_tags

In [13]:
train_labels = set(get_tokens_ners(train_words)[1])
val_labels = set(get_tokens_ners(dev_words)[1])
test_labels = set(get_tokens_ners(test_words)[1])
real_labels = set(get_tokens_ners(real_words)[1])

# No MISC
print(train_labels,'\n', val_labels,'\n', test_labels,'\n', real_labels)

{'I-YITEM', 'B-YPACK', 'B-YITEM', 'I-YPACK', 'I-PROCESSOR', 'B-DIAGONAL', 'I-OS', 'B-MATERIAL', 'B-XPACK', 'B-MODEL', 'B-VIDEO', 'I-COUNTRY', 'B-OS', 'B-RAM', 'I-MATERIAL', 'I-XITEM', 'I-DIAGONAL', 'I-MODEL', 'I-COMPLECT', 'I-RAM', 'B-XITEM', 'B-COUNTRY', 'I-COLOR', 'O', 'I-XPACK', 'B-ZITEM', 'I-ZPACK', 'B-ZPACK', 'B-SSD', 'B-COMPLECT', 'I-VIDEO', 'B-COLOR', 'I-SSD', 'B-PROCESSOR', 'I-ZITEM'} 
 {'I-YITEM', 'B-YPACK', 'B-YITEM', 'I-YPACK', 'I-PROCESSOR', 'B-DIAGONAL', 'I-OS', 'B-MATERIAL', 'B-XPACK', 'B-MODEL', 'B-VIDEO', 'I-COUNTRY', 'B-OS', 'B-RAM', 'I-MATERIAL', 'I-XITEM', 'I-DIAGONAL', 'I-MODEL', 'I-COMPLECT', 'I-RAM', 'B-COUNTRY', 'B-XITEM', 'I-COLOR', 'O', 'I-ZPACK', 'I-XPACK', 'B-ZITEM', 'B-ZPACK', 'B-SSD', 'B-COMPLECT', 'I-VIDEO', 'B-COLOR', 'I-SSD', 'B-PROCESSOR', 'I-ZITEM'} 
 {'I-YITEM', 'B-YPACK', 'B-YITEM', 'I-YPACK', 'I-PROCESSOR', 'B-DIAGONAL', 'I-OS', 'B-MATERIAL', 'B-XPACK', 'B-MODEL', 'B-VIDEO', 'I-COUNTRY', 'B-OS', 'B-RAM', 'I-MATERIAL', 'I-XITEM', 'I-DIAGONAL', 'I-MOD

In [14]:
text = ' '.join(get_tokens_ners(train_words)[0])
text[:500]

'огла ##вление : сете ##вои фильтр te ##ss ##an t ##s - 329 описание : сете ##вои фильтр серы ##и предназначен для подключения электропри ##боров . он оснащен 1 + 3 ##us ##b ; 1 шт . ; 3 шт . розет ##ками и us ##b портами , а также имеет индикатор включения и защитные штор ##ки на розет ##ках . максима ##льны ##и выход ##нои ток составляет 16 а , а максимальная рабочая мощность - 3600 в ##т . размеры фильтра : высота - 7 . 8 см , ширина - 10 . 6 см , глубина - 5 . 2 см . он поддерживает напряжени'

In [15]:
text = ' '.join(get_tokens_ners(dev_words)[0])
text[:500]

'огла ##вление : беспровод ##ная компьютерная мышь " эргономи ##чная ; тонкая ; чер ##ны ##и ; 10 м ; правая рука ; опти ##ческая ; 800 / 1200 / 1600 d ##p ##i ; us ##b ; 12 мм ; 27 мм ; 57 мм ; подсветка ; кита ##и ; 20 см ; игровая мышка ; беспровод ##ное ; 3 шт . ; us ##b заряд ##ны ##и кабель ; 162 г " описание : беспровод ##ная компьютерная мышь с эргономи ##чным , тонким диза ##ином . цвет чер ##ны ##и . радиус де ##ист ##вия 10 м . подходит для право ##и и лево ##и руки . опти ##ческая с р'

In [16]:
text = ' '.join(get_tokens_ners(real_words)[0])
text[:500]

'огла ##вление : мышь def ##ender ve ##no ##m g ##m - 640 ##l описание : мышь провод ##ная def ##ender ve ##no ##m g ##m - 640 ##l оптими ##зирована для использования в играх . высоко ##точ ##ны ##и опти ##чески ##и сенсор характеризуется максимальным разрешением 3200 d ##p ##i . есть возможность изменения разрешения . доступные дополнительные режимы работы – 1200 , 1600 и 2400 d ##p ##i . общее количество кнопок – 8 . мышь провод ##ная def ##ender ve ##no ##m g ##m - 640 ##l подключается с помощ'

In [17]:
text = ' '.join(get_tokens_ners(test_words)[0])
text[:500]

'огла ##вление : домаш ##нии планетар ##ии " космос " с us ##b t ##ype - c описание : домаш ##нии планетар ##ии мини на потолок . us ##b t ##ype - c . нет . доп . опции проект ##ора : ноч ##ник ; детски ##и ; космос . питание : от сети . гарантии ##ны ##и срок : 1 год . страна производства : кита ##и . вес с упаков ##ко ##и ( кг ) : 0 . 6 кг . вес без упаковки ( кг ) : 0 . 2 кг . ширина предмета : 12 см . глубина предмета : 12 см . высота предмета : 10 см . комплекта ##ция : проект ##ор ; кабель '

In [18]:
id2label

{0: 'O',
 1: 'B-MODEL',
 2: 'I-MODEL',
 3: 'B-COUNTRY',
 4: 'I-COUNTRY',
 5: 'B-XITEM',
 6: 'I-XITEM',
 7: 'B-YITEM',
 8: 'I-YITEM',
 9: 'B-ZITEM',
 10: 'I-ZITEM',
 11: 'B-XPACK',
 12: 'I-XPACK',
 13: 'B-YPACK',
 14: 'I-YPACK',
 15: 'B-ZPACK',
 16: 'I-ZPACK',
 17: 'B-MATERIAL',
 18: 'I-MATERIAL',
 19: 'B-COMPLECT',
 20: 'I-COMPLECT',
 21: 'B-COLOR',
 22: 'I-COLOR',
 23: 'B-OS',
 24: 'I-OS',
 25: 'B-PROCESSOR',
 26: 'I-PROCESSOR',
 27: 'B-VIDEO',
 28: 'I-VIDEO',
 29: 'B-RAM',
 30: 'I-RAM',
 31: 'B-KERNELS',
 32: 'I-KERNELS',
 33: 'B-SSD',
 34: 'I-SSD',
 35: 'B-DIAGONAL',
 36: 'I-DIAGONAL'}

In [19]:
label2id

{'O': 0,
 'B-MODEL': 1,
 'I-MODEL': 2,
 'B-COUNTRY': 3,
 'I-COUNTRY': 4,
 'B-XITEM': 5,
 'I-XITEM': 6,
 'B-YITEM': 7,
 'I-YITEM': 8,
 'B-ZITEM': 9,
 'I-ZITEM': 10,
 'B-XPACK': 11,
 'I-XPACK': 12,
 'B-YPACK': 13,
 'I-YPACK': 14,
 'B-ZPACK': 15,
 'I-ZPACK': 16,
 'B-MATERIAL': 17,
 'I-MATERIAL': 18,
 'B-COMPLECT': 19,
 'I-COMPLECT': 20,
 'B-COLOR': 21,
 'I-COLOR': 22,
 'B-OS': 23,
 'I-OS': 24,
 'B-PROCESSOR': 25,
 'I-PROCESSOR': 26,
 'B-VIDEO': 27,
 'I-VIDEO': 28,
 'B-RAM': 29,
 'I-RAM': 30,
 'B-KERNELS': 31,
 'I-KERNELS': 32,
 'B-SSD': 33,
 'I-SSD': 34,
 'B-DIAGONAL': 35,
 'I-DIAGONAL': 36}

In [20]:
import copy

def get_dict_tokens_ners(words):
    

    idx, tokens, ner_tags  = 0, [],[]
    dataset_json = []
    for line in words:

        line = line.strip('\n')
        if len(line.strip()) == 0:

            cur_tokens = copy.deepcopy(tokens)
            cur_ner_tags =copy.deepcopy(ner_tags)

            dict_ner = {'id': idx, 'tokens': cur_tokens,'length': len(cur_tokens), 'ner_tags_str': cur_ner_tags, 'ner_tags': [ label2id[i] for i in cur_ner_tags]}
            dataset_json.append(dict_ner)

            idx += 1
            tokens.clear()
            ner_tags.clear()
        else:
            token, ner = line.split()

            # check if ner tag not in token
            if token not in list(id2label.keys()):
                ner = ner.strip('\n')
                tokens.append(token)
                ner_tags.append(ner)
    data = {}
    data['data'] = dataset_json      
    return data

In [21]:
%%time
import json

names = ['train', 'dev', 'test', 'real']
for idx, words in enumerate([train_words, dev_words, test_words, real_words]):
    print(names[idx])
    data = get_dict_tokens_ners(words)
    with open(f'{names[idx]}_data.json', 'w') as f:
        json.dump(data, f)

train
dev
test
real
CPU times: user 11.6 s, sys: 141 ms, total: 11.7 s
Wall time: 11.7 s


In [22]:
tokens, ner_tags = [],[]
for word in words:
    if len(word.strip())>0:
        #print(word)
        token, ner = word.split()
        ner = ner.strip('\n')
        tokens.append(token)
        ner_tags.append(ner)
        
text = ' '.join(tokens)

In [23]:
data_files = {"train": 'train_data.json', 'val':'dev_data.json', 'test':'test_data.json', 'real':'real_data.json'}

In [24]:
load_json_dataset = load_dataset('json', data_files=data_files,field ='data')
load_json_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating real split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'length', 'id', 'ner_tags', 'ner_tags_str'],
        num_rows: 7452
    })
    val: Dataset({
        features: ['tokens', 'length', 'id', 'ner_tags', 'ner_tags_str'],
        num_rows: 718
    })
    test: Dataset({
        features: ['tokens', 'length', 'id', 'ner_tags', 'ner_tags_str'],
        num_rows: 4441
    })
    real: Dataset({
        features: ['tokens', 'length', 'id', 'ner_tags', 'ner_tags_str'],
        num_rows: 41
    })
})

In [25]:
MAX_LEN = max([max(load_json_dataset['train']['length']), max(load_json_dataset['val']['length']), max(load_json_dataset['test']['length'])])
MAX_LEN

222

In [26]:
load_json_dataset['train']

Dataset({
    features: ['tokens', 'length', 'id', 'ner_tags', 'ner_tags_str'],
    num_rows: 7452
})

In [27]:
#num_labels = dataset['train'].features['ner_tags'].feature.num_classes
num_labels = len(list(label2id.keys()))
num_labels

37

In [28]:
def add_encodings(example):
    """Processing the example
    
    Args:
        example (dict): The dataset example.
    
    Returns:
        dict: The dictionary containing the following updates:
            - input_ids: The list of input ids of the tokens.
            - attention_mask: The attention mask list.
            - ner_tags: The updated ner_tags.
    
    """
    # get the encodings of the tokens. The tokens are already split, that is why we must add is_split_into_words=True

    try:
        encodings = tokenizer(example['tokens'], truncation=True, padding='max_length', max_length = MAX_LEN, is_split_into_words=True)
        
        # extend the ner_tags so that it matches the max_length of the input_ids
        labels = example['ner_tags'] + [0] * (MAX_LEN - len(example['ner_tags']))
        
        # return the encodings and the extended ner_tags
        return { **encodings, 'labels': labels }
    except Exception as ex:
        print(ex)
        #print(example['tokens'])
        return 

In [29]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [30]:
print('j')

j


In [31]:
# modify/format all datasets so that they include the 'input_ids', 'attention_mask' 
# and 'labels' used to train and evaluate the model
#load_json_dataset = load_json_dataset.map(add_encodings)
tokenized_datasets = load_json_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=load_json_dataset["train"].column_names,
)

Map:   0%|          | 0/7452 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/718 [00:00<?, ? examples/s]

Map:   0%|          | 0/4441 [00:00<?, ? examples/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

In [32]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

2024-05-22 08:00:47.743777: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-22 08:00:47.743914: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-22 08:00:47.882238: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [33]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    1,    2,    2,    2,    2,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    1,    2,    2,    2,    2,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    7,    8,    8,    8,    0,
            0,    0,    9,   10,   10,   10,    0,    0,    0,    5,    6,    6,
            6,    0,    0,  

In [34]:
# format the datasets so that we return only 'input_ids', 'attention_mask' and 'labels' 
# making it easier to train and validate the model
#load_json_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 8, 0, 0, 0, 9, 10, 10, 10, 0, 0, 0, 5, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 22, 22, 22, 22, 22, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1

In [35]:
label2id

{'O': 0,
 'B-MODEL': 1,
 'I-MODEL': 2,
 'B-COUNTRY': 3,
 'I-COUNTRY': 4,
 'B-XITEM': 5,
 'I-XITEM': 6,
 'B-YITEM': 7,
 'I-YITEM': 8,
 'B-ZITEM': 9,
 'I-ZITEM': 10,
 'B-XPACK': 11,
 'I-XPACK': 12,
 'B-YPACK': 13,
 'I-YPACK': 14,
 'B-ZPACK': 15,
 'I-ZPACK': 16,
 'B-MATERIAL': 17,
 'I-MATERIAL': 18,
 'B-COMPLECT': 19,
 'I-COMPLECT': 20,
 'B-COLOR': 21,
 'I-COLOR': 22,
 'B-OS': 23,
 'I-OS': 24,
 'B-PROCESSOR': 25,
 'I-PROCESSOR': 26,
 'B-VIDEO': 27,
 'I-VIDEO': 28,
 'B-RAM': 29,
 'I-RAM': 30,
 'B-KERNELS': 31,
 'I-KERNELS': 32,
 'B-SSD': 33,
 'I-SSD': 34,
 'B-DIAGONAL': 35,
 'I-DIAGONAL': 36}

In [36]:
label_names = list(label2id.keys())
label_names

['O',
 'B-MODEL',
 'I-MODEL',
 'B-COUNTRY',
 'I-COUNTRY',
 'B-XITEM',
 'I-XITEM',
 'B-YITEM',
 'I-YITEM',
 'B-ZITEM',
 'I-ZITEM',
 'B-XPACK',
 'I-XPACK',
 'B-YPACK',
 'I-YPACK',
 'B-ZPACK',
 'I-ZPACK',
 'B-MATERIAL',
 'I-MATERIAL',
 'B-COMPLECT',
 'I-COMPLECT',
 'B-COLOR',
 'I-COLOR',
 'B-OS',
 'I-OS',
 'B-PROCESSOR',
 'I-PROCESSOR',
 'B-VIDEO',
 'I-VIDEO',
 'B-RAM',
 'I-RAM',
 'B-KERNELS',
 'I-KERNELS',
 'B-SSD',
 'I-SSD',
 'B-DIAGONAL',
 'I-DIAGONAL']

In [37]:
labels = load_json_dataset["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['O',
 'O',
 'O',
 'B-MODEL',
 'I-MODEL',
 'I-MODEL',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-MODEL',
 'I-MODEL',
 'I-MODEL',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-YITEM',
 'I-YITEM',
 'I-YITEM',
 'I-YITEM',
 'O',
 'O',
 'O',
 'B-ZITEM',
 'I-ZITEM',
 'I-ZITEM',
 'I-ZITEM',
 'O',
 'O',
 'O',
 'B-XITEM',
 'I-XITEM',
 'I-XITEM',
 'I-XITEM',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-MODEL',
 'I-MODEL',
 'I-MODEL',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O'

In [38]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'COUNTRY': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'MODEL': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 3},
 'XITEM': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'YITEM': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ZITEM': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [39]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [40]:
id2label

{0: 'O',
 1: 'B-MODEL',
 2: 'I-MODEL',
 3: 'B-COUNTRY',
 4: 'I-COUNTRY',
 5: 'B-XITEM',
 6: 'I-XITEM',
 7: 'B-YITEM',
 8: 'I-YITEM',
 9: 'B-ZITEM',
 10: 'I-ZITEM',
 11: 'B-XPACK',
 12: 'I-XPACK',
 13: 'B-YPACK',
 14: 'I-YPACK',
 15: 'B-ZPACK',
 16: 'I-ZPACK',
 17: 'B-MATERIAL',
 18: 'I-MATERIAL',
 19: 'B-COMPLECT',
 20: 'I-COMPLECT',
 21: 'B-COLOR',
 22: 'I-COLOR',
 23: 'B-OS',
 24: 'I-OS',
 25: 'B-PROCESSOR',
 26: 'I-PROCESSOR',
 27: 'B-VIDEO',
 28: 'I-VIDEO',
 29: 'B-RAM',
 30: 'I-RAM',
 31: 'B-KERNELS',
 32: 'I-KERNELS',
 33: 'B-SSD',
 34: 'I-SSD',
 35: 'B-DIAGONAL',
 36: 'I-DIAGONAL'}

In [41]:
# initialize the model and provide the 'num_labels' used to create the classification layer
model = BertForTokenClassification.from_pretrained(model_name , num_labels=num_labels)


# assign the 'id2label' and 'label2id' model configs
model.config.id2label = id2label
model.config.label2id = label2id

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
model.config.num_labels

37

In [43]:
len(label2id)

37

In [44]:
model.config

BertConfig {
  "_name_or_path": "ai-forever/ruBert-large",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "O",
    "1": "B-MODEL",
    "2": "I-MODEL",
    "3": "B-COUNTRY",
    "4": "I-COUNTRY",
    "5": "B-XITEM",
    "6": "I-XITEM",
    "7": "B-YITEM",
    "8": "I-YITEM",
    "9": "B-ZITEM",
    "10": "I-ZITEM",
    "11": "B-XPACK",
    "12": "I-XPACK",
    "13": "B-YPACK",
    "14": "I-YPACK",
    "15": "B-ZPACK",
    "16": "I-ZPACK",
    "17": "B-MATERIAL",
    "18": "I-MATERIAL",
    "19": "B-COMPLECT",
    "20": "I-COMPLECT",
    "21": "B-COLOR",
    "22": "I-COLOR",
    "23": "B-OS",
    "24": "I-OS",
    "25": "B-PROCESSOR",
    "26": "I-PROCESSOR",
    "27": "B-VIDEO",
    "28": "I-VIDEO",
    "29": "B-RAM",
    "30": "I-RAM",
    "31": "B-KERNELS",
    "32": "I-KERNELS",
    "33"

In [45]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy = "steps",
    logging_strategy = "steps",
    save_strategy = "epoch",
    eval_steps = 50,
    logging_steps = 50,
    learning_rate=LR,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    num_train_epochs=N_EPOCHS,
    weight_decay=WD,
   # report_to='wandb', 
    gradient_accumulation_steps=GRAD_ACC,
    warmup_ratio=WARMUP,
    fp16 = True,
    #push_to_hub=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [46]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["real"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,1.6109,0.398708,0.03012,0.041667,0.034965,0.89777
100,0.5698,0.281076,0.070588,0.1,0.082759,0.898832
150,0.3108,0.264664,0.304,0.316667,0.310204,0.920263
200,0.2305,0.338145,0.207143,0.241667,0.223077,0.908775
250,0.2161,0.323751,0.232394,0.275,0.251908,0.915339
300,0.1915,0.310949,0.233766,0.3,0.262774,0.908582
350,0.1914,0.328771,0.21134,0.341667,0.261146,0.894681
400,0.1625,0.342407,0.251799,0.291667,0.27027,0.891688
450,0.1519,0.355665,0.285714,0.333333,0.307692,0.909644
500,0.149,0.34158,0.315789,0.3,0.307692,0.895839


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1119, training_loss=0.2264715779349674, metrics={'train_runtime': 2516.8334, 'train_samples_per_second': 8.883, 'train_steps_per_second': 0.445, 'total_flos': 1.31696120040024e+16, 'train_loss': 0.2264715779349674, 'epoch': 3.0})

In [47]:
trainer.compute_loss(model, tokenized_datasets["train"].to_dict())

AttributeError: 'list' object has no attribute 'size'

In [None]:
from transformers import pipeline
model_checkpoint = "/kaggle/working/bert-finetuned-ner/checkpoint-373"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple", 
)
token_classifier("Произведенно в Китае. Диагональ экрана 16\". Размер упаковки 21 см на 32 см на 21 см. Красного цвета")

In [None]:
trainer.compute_loss(model=model, inputs=tokenized_datasets["real"].to_dict())

In [None]:
tokenized_datasets["real"]

In [None]:
tokenized_datasets["real"]

In [None]:
model.eval()

In [None]:
predictions, labels, _ = trainer.predict(tokenized_datasets["real"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
model(load_json_dataset["train"]["tokens"][0])

In [None]:
load_json_dataset["train"]["tokens"][0]