Попробуем вместо обычной RNN применить BERT от hugging face

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install datasets transformers



In [3]:
import pandas as pd
import nltk
nltk.download('punkt')
from datasets import load_dataset, load_metric


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Токенизация


In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/alpha/train_supervised_dataset.csv').fillna('')

In [5]:
def apply_bio_tagging(row):
    """
    По токенам чека и разметке (то есть выделенным товарам и брендам) строим BIO-теги
    """
    tokens = [i.lower() for i in row["tokens"]]
    good = row["good"].split(',')[0].split()
    brand = row["brand"].split(',')[0].split()
    tags = ['O'] * len(tokens)
    for i, token in enumerate(tokens):
        if len(good) > 0 and tokens[i:i + len(good)] == good:
            tags[i] = "B-GOOD"
            for j in range(i + 1, i + len(good)):
                tags[j] = "I-GOOD"
        if len(brand) > 0 and tokens[i:i + len(brand)] == brand:
            tags[i] = "B-BRAND"
            for j in range(i + 1, i + len(brand)):
                tags[j] = "I-BRAND"
    return tags

In [6]:
label_list = ["O", "B-GOOD", "I-GOOD", "B-BRAND", "I-BRAND", "PAD"]

In [7]:
print('процент бустых брендов:',(train_df['brand'] == '').sum()/train_df.shape[0])
print('количетсво:',(train_df['brand'] == '').sum())

процент бустых брендов: 0.3402
количетсво: 8505


немного почистим текст

In [8]:
import re

train_df.name = train_df.name.apply(lambda x: (re.sub('[^A-Za-zА-Яа-я1-9]', ' ', str(x))))
train_df

Unnamed: 0,id,name,good,brand
0,0,Petmax Бантик леопард с красн розой 2шт,бантик,petmax
1,1,87191 Бусы для елки шарики 87191,бусы,
2,2,Футболка Piazza Italia WR 11446881,футболка,piazza italia
3,3,7 YI572 3X ONE ЗАКОЛКА ДЛЯ ВОЛОС ДЛЯ ДЕВОЧКИ,заколка,
4,4,Одежда вес 15,одежда,
...,...,...,...,...
24995,24995,Вода Саирме с г 5 мл,вода,sairme
24996,24996,Моя Семя 175л и ассортим,,моя семья
24997,24997,Рулет бисквитн Яшкино клубничный со слив,рулет,яшкино
24998,24998,46 75794371 Почвогрунт Цветочное счастье Фас...,почвогрунт,фаско


In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_ru = stopwords.words("russian")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
train_df['tokens'] = train_df.name.apply(lambda x: nltk.word_tokenize(x.lower(), language='russian'))
train_df['tags'] = train_df.apply(apply_bio_tagging,axis = 1)
#train_df.drop(columns = ['name'], inplace = True)

In [11]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

ner_train, ner_test = train_test_split(train_df, test_size=0.1, random_state=1)

для задачи возьмем **rubert-tiny2**, чтобы быстрее запустить наш BERT

In [12]:
model_checkpoint = "cointegrated/rubert-tiny2"
batch_size = 16

In [13]:
ner_data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))
})
ner_data

DatasetDict({
    train: Dataset({
        features: ['id', 'name', 'good', 'brand', 'tokens', 'tags', '__index_level_0__'],
        num_rows: 22500
    })
    test: Dataset({
        features: ['id', 'name', 'good', 'brand', 'tokens', 'tags', '__index_level_0__'],
        num_rows: 2500
    })
})

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [19]:
'''
поскольку токенизатор может разбить слово на несколько токенов важно это учитывать,
т.к. по ним мы будем ставить BIO-теги
'''

def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i) #возвращает обычный айдишник в токенайзере
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

Есть идея, что капитализация текста будет хорошей фичей. Попробуем посчитать соотношение заглавных букв к маленьким в train_df.

In [20]:
lower_count = 0
upper_count = 0
for text in train_df.name.to_list():
    lower_count += sum(1 for c in text if c.islower())
    upper_count += sum(1 for c in text if c.isupper())

lower_count, upper_count, upper_count/(lower_count+upper_count)

(516261, 177156, 0.25548263166319835)

на все слова выходит 25 % заглавных букв, что немного и немало, поэтому можно не делать lower()

закончим токенизацию

In [21]:
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

# Bert fine tuning

In [15]:
pip install seqeval



In [17]:
!pip install transformers[torch]
!pip install accelerate==0.20.3

Collecting accelerate>=0.20.3 (from transformers[torch])
  Using cached accelerate-0.21.0-py3-none-any.whl (244 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.20.1
    Uninstalling accelerate-0.20.1:
      Successfully uninstalled accelerate-0.20.1
Successfully installed accelerate-0.21.0
Collecting accelerate==0.20.3
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.21.0
    Uninstalling accelerate-0.21.0:
      Successfully uninstalled accelerate-0.21.0
Successfully installed accelerate-0.20.3


In [57]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
import transformers
import accelerate

In [59]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [60]:
import numpy as np

metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    print(predictions.shape, labels.shape)
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [61]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
    num_train_epochs=6
)

посомотрим на скор не обученной модели

In [62]:
trainer = Trainer(
    model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


trainer.evaluate()

(2500, 40, 6) (2500, 40)




{'eval_loss': 1.6919242143630981,
 'eval_precision': 0.07629970469061191,
 'eval_recall': 0.1629510120269874,
 'eval_f1': 0.10393376678048552,
 'eval_accuracy': 0.32458844362825473,
 'eval_runtime': 3.0881,
 'eval_samples_per_second': 809.572,
 'eval_steps_per_second': 101.358}

Заморозим параметры bert'а и попробуем обучить только голову для NER

In [63]:
for param in model.bert.parameters():
    param.requires_grad = False

In [64]:
import logging
from transformers.trainer import logger as noisy_logger
noisy_logger.setLevel(logging.WARNING)

trainer.train()



Step,Training Loss
500,0.9705
1000,0.6066
1500,0.546
2000,0.5184
2500,0.506
3000,0.4919
3500,0.4829
4000,0.483
4500,0.4777
5000,0.4711


TrainOutput(global_step=8439, training_loss=0.5193947562534021, metrics={'train_runtime': 103.1268, 'train_samples_per_second': 654.534, 'train_steps_per_second': 81.831, 'total_flos': 21161888098128.0, 'train_loss': 0.5193947562534021, 'epoch': 3.0})

In [32]:
trainer.evaluate()

(2500, 40, 6) (2500, 40)


{'eval_loss': 0.44708535075187683,
 'eval_precision': 0.7003907946157186,
 'eval_recall': 0.4731592842475799,
 'eval_f1': 0.5647759103641455,
 'eval_accuracy': 0.8380992982226012,
 'eval_runtime': 4.3727,
 'eval_samples_per_second': 571.73,
 'eval_steps_per_second': 71.581}

Модель явно недообучилась, поэтому разморозим все веса

In [65]:
for param in model.parameters():
    param.requires_grad = True

In [66]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
)

In [67]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [69]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3416,0.284144,0.763411,0.7577,0.760545,0.897619
2,0.2656,0.239987,0.792824,0.803755,0.798252,0.913327
3,0.2284,0.218676,0.802689,0.831769,0.81697,0.921132
4,0.2064,0.204014,0.81804,0.848636,0.833057,0.927133
5,0.1908,0.194852,0.834006,0.848929,0.841401,0.930249


(2500, 40, 6) (2500, 40)


{'BRAND': {'precision': 0.7299247176913425,
  'recall': 0.7569941444372154,
  'f1': 0.7432130309805174,
  'number': 3074},
 'GOOD': {'precision': 0.9055138493398913,
  'recall': 0.9342948717948718,
  'f1': 0.9196792428026818,
  'number': 3744},
 'overall_precision': 0.826123954049071,
 'overall_recall': 0.8543561161630977,
 'overall_f1': 0.8400028841300743,
 'overall_accuracy': 0.9295271200891979}

{'BRAND': {'precision': 0.718216044019693,
  'recall': 0.7161420733468091,
  'f1': 0.7171775592828225,
  'number': 3463},
 'GOOD': {'precision': 0.8884343036978757,
  'recall': 0.9205935105168759,
  'f1': 0.9042280589365791,
  'number': 6133},
 'overall_precision': 0.8285073409461664,
 'overall_recall': 0.846811171321384,
 'overall_f1': 0.8375592661306948,
 'overall_accuracy': 0.9353814687772972} - при lower тексте, без предварительной токенизации


 {'BRAND': {'precision': 0.7243992206105218,
  'recall': 0.7602817541467848,
  'f1': 0.7419068736141907,
  'number': 4401},
 'GOOD': {'precision': 0.8957452431289641,
  'recall': 0.9281215772179627,
  'f1': 0.9116460462614309,
  'number': 7304},
 'overall_precision': 0.8308033150077951,
 'overall_recall': 0.8650149508756941,
 'overall_f1': 0.847564038171773,
 'overall_accuracy': 0.9303750072005991} - 20 эпох без Lowera

 {'BRAND': {'precision': 0.7649894546550166,
  'recall': 0.8259596616785947,
  'f1': 0.794306272485531,
  'number': 3074},
 'GOOD': {'precision': 0.9310796645702306,
  'recall': 0.9489850427350427,
  'f1': 0.9399470899470899,
  'number': 3744},
 'overall_precision': 0.8538192011212333,
 'overall_recall': 0.8935171604576122,
 'overall_f1': 0.8732172292696911,
 'overall_accuracy': 0.9429068013379681} - rubert-tiny2 20 эпох с вычищенными лишними символами

In [None]:
model.save_pretrained('ner_bert.bin')
tokenizer.save_pretrained('ner_bert.bin')

('ner_bert.bin/tokenizer_config.json',
 'ner_bert.bin/special_tokens_map.json',
 'ner_bert.bin/vocab.txt',
 'ner_bert.bin/added_tokens.json',
 'ner_bert.bin/tokenizer.json')

# Попробуем сделать предикт

In [46]:
test_df = pd.read_csv("/content/drive/MyDrive/alpha/test_dataset.csv")

In [47]:
test_df

Unnamed: 0,id,name
0,0,"469-210 ЕРМАК Клей универсальный, 15мл, блистер"
1,1,Торт СЛАДУШКА Зимняя вишня 700г
2,2,"Смеситель ""CALORIE"" 1023 А06 д/кухни"
3,3,Лимон 50гр БАР
4,4,"Коньяк САРАДЖИШВИЛИ 5 лет 0,5л Грузия"
...,...,...
4995,4995,"774352 Рамка 2П., сл. кость"
4996,4996,Энерг. напиток Red Bull 0.25л
4997,4997,36/025 Наконечники (т. никель) шт
4998,4998,Шоколад РиттерСпорт мол.с цел.миндалем 100г


In [48]:
from transformers import pipeline


In [70]:
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device=0)

In [71]:
from tqdm.auto import tqdm

In [72]:
def get_preds(x: str) -> list:
    preds = pipe(x)
    goods = ''
    brands = ''
    for pred in preds:
        if pred['entity_group'] == 'GOOD':
            goods += pred['word'] + ','
        if pred['entity_group'] == "BRAND":
            brands += pred['word'] + ','
    return goods.rstrip(','), brands.rstrip(',')

In [73]:
test_df.name = test_df.name.apply(lambda x: (re.sub('[^A-Za-zА-Яа-я1-9]', ' ', str(x))))

test_df['good'] = test_df['name'].apply(lambda x: get_preds(x)[0].lower())
test_df['brand'] = test_df['name'].apply(lambda x: get_preds(x)[1].lower())



In [75]:
test_df

Unnamed: 0,id,name,good,brand
0,0,469 21 ЕРМАК Клей универсальный 15мл блистер,,
1,1,Торт СЛАДУШКА Зимняя вишня 7 г,торт,
2,2,Смеситель CALORIE 1 23 А 6 д кухни,смеситель,
3,3,Лимон 5 гр БАР,лимон,
4,4,Коньяк САРАДЖИШВИЛИ 5 лет 5л Грузия,коньяк,
...,...,...,...,...
4995,4995,774352 Рамка 2П сл кость,,рамка
4996,4996,Энерг напиток Red Bull 25л,напиток,red bull
4997,4997,36 25 Наконечники т никель шт,наконечники,
4998,4998,Шоколад РиттерСпорт мол с цел миндалем 1 г,шоколад,


In [76]:
test_df[50:100]

Unnamed: 0,id,name,good,brand
50,50,Болт М8х2 DIN 933 класс прочности 8 8 цинк ...,болт,
51,51,Коврик для в к CONFETTI BELLA из 1 шт 5 х57см ...,коврик,
52,52,Пиво Блю Мун розлив,пиво,
53,53,Тархун Лимонад 4л,,
54,54,Пенал 2 х ств 21 х13 мм Престижные авто с ...,пенал,
55,55,56297 Ремень крепления груза 1 2т 27У...,ремень,
56,56,1 245658 838256 ФУТБОЛКА Ж CORS TINA,,
57,57,Шоколад Tai Tau Exclusive в асс те 1 гр,шоколад,"tai,tau"
58,58,Молоко Самое любимое 3 2 95л ультрапа,,"молоко,самое"
59,59,Альфасорб пор 25г,альфасорб,


In [None]:
test_df.to_csv("rubert_tiny-2.csv", index=False)

In [None]:
from google.colab import files
files.download('/content/rubert_tiny-2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!zip -r /content/file.zip /content/ner/ner_bert.bin

  adding: content/ner/ner_bert.bin/ (stored 0%)
  adding: content/ner/ner_bert.bin/pytorch_model.bin (deflated 8%)
  adding: content/ner/ner_bert.bin/special_tokens_map.json (deflated 42%)
  adding: content/ner/ner_bert.bin/vocab.txt (deflated 52%)
  adding: content/ner/ner_bert.bin/config.json (deflated 51%)
  adding: content/ner/ner_bert.bin/tokenizer.json (deflated 70%)
  adding: content/ner/ner_bert.bin/tokenizer_config.json (deflated 44%)


In [None]:
files.download("/content/bert.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Итого, на паблик лидерборде скор пробивается 0.72