# Baseline

в бейзлайне было минимальное количество изменений, в основном небольшое изменение архитектуры модели и подбор гиперпараметров

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.0.5-py3-none-any.whl (722 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/722.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m722.4/722.4 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.0.1-py3-none-any.whl (729 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/729.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m729.2/729.2 kB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.7.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.9.0 pytorch_lightning-2.0.5 torchmetrics-1.0.1


In [3]:
pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16165 sha256=05881df928f144545734df254d981e7913baf3abfd13b11ad81db37cf25ad121
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [4]:
from gensim.models.fasttext import FastText
import pandas as pd
import pytorch_lightning as pl
from seqeval.metrics.sequence_labeling import get_entities
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

torch.set_float32_matmul_precision("high")

# Utils

Полезные функции для работы с BIO-тегами

In [5]:

def apply_bio_tagging(row):
    """
    По токенам чека и разметке (то есть выделенным товарам и брендам) строим BIO-теги
    """
    tokens = row["tokens"]
    good = row["good"].split(',')[0].split()
    brand = row["brand"].split(',')[0].split()
    tags = ['O'] * len(tokens)
    for i, token in enumerate(tokens):
        if len(good) > 0 and tokens[i:i + len(good)] == good:
            tags[i] = "B-GOOD"
            for j in range(i + 1, i + len(good)):
                tags[j] = "I-GOOD"
        if len(brand) > 0 and tokens[i:i + len(brand)] == brand:
            tags[i] = "B-BRAND"
            for j in range(i + 1, i + len(brand)):
                tags[j] = "I-BRAND"
    return tags

Прямое и обратное преобразование тегов в индексы

In [6]:
index_to_tag = ["O", "B-GOOD", "I-GOOD", "B-BRAND", "I-BRAND", "PAD"]
tag_to_index = {tag: index for index, tag in enumerate(index_to_tag)}
tag_to_index

{'O': 0, 'B-GOOD': 1, 'I-GOOD': 2, 'B-BRAND': 3, 'I-BRAND': 4, 'PAD': 5}

# Datamodule

Подготовим данные для модели. Для этого определим наследника `torch.nn.utils.Dataset` - `ReceiptsDataset`

In [7]:
import numpy as np

In [8]:
class ReceiptsDataset(Dataset):
    def __init__(self, df, wv_model):
        super().__init__()
        self.is_predict = "tags" not in df.columns
        self.data = df[["tokens", "good", "brand", "tags"]] if not self.is_predict else df[["tokens", "id"]]
        self.data = self.data.values
        self.wv_model = wv_model

    def __getitem__(self, index):
        identifier = 0 if not self.is_predict else self.data[index][1]
        tokens = self.data[index][0]
        embeddings = []
        for token in tokens:
          try:
              embeddings.append(self.wv_model.wv[token])
          except:
              embeddings.append(np.zeros(shape = (300), dtype = float)) #если w2v не знает слова, то он заменяет его на нулевой вектор
        embeddings = np.array(embeddings, dtype = float)
        goods = self.data[index][1].split(',') if not self.is_predict else list()
        brands = self.data[index][2].split(',') if not self.is_predict else list()
        tags = self.data[index][3] if not self.is_predict else ["O"] * len(tokens)
        target = [tag_to_index[tag] for tag in tags]
        return identifier, tokens, embeddings, goods, brands, target

    def __len__(self):
        return len(self.data)

Для объединения примеров в батчи нужна специальная `collate_fn`, в которой происходит паддинг

In [9]:
def collate_fn(batch):
    ids, tokens_sequence, embeddings_sequence, goods, brands, targets = list(zip(*batch))
    ''' embeddings_sequence - батч типа tuple с размером 512

    '''
    embeddings_sequence = pad_sequence([torch.FloatTensor(sequence) for sequence in embeddings_sequence],
                                       batch_first=True)
    targets = pad_sequence([torch.LongTensor(target) for target in targets], batch_first=True,
                           padding_value=tag_to_index["PAD"])
    return ids, tokens_sequence, embeddings_sequence, goods, brands, targets

Используем LightningDataModule для задания пайплайна

1. prepare_data
    1. Токенизируем текст
    2. Выделяем BIO-теги в размеченной части
2. setup
    1. Разделяем размеченную выборку на обучающую и валидационную
    2. Создаем `ReceiptsDataset` под каждую выборку

In [10]:
import re
from gensim.models import Word2Vec

In [62]:
class ReceiptsDataModule(pl.LightningDataModule):
    def __init__(self,
                 train_dataset_path,
                 test_dataset_path,
                 fasttext_path,
                 val_split_size,
                 batch_size,
                 num_workers):
        super().__init__()
        self.train_dataset_path = train_dataset_path
        self.test_dataset_path = test_dataset_path
        self.fasttext_path = fasttext_path
        self.val_split_size = val_split_size
        self.batch_size = batch_size
        self.num_workers = num_workers

    def prepare_data(self):
        self.gensim = Word2Vec.load(self.fasttext_path)
        self.train_df = pd.read_csv(self.train_dataset_path).fillna("")
        self.test_df = pd.read_csv(self.test_dataset_path)

        shape = self.train_df[self.train_df['brand'] == ''].shape[0]
        self.train_df.drop(index = self.train_df[self.train_df['brand'] == ''].index[:int(shape*0.6)],
                           inplace = True) #удаляем 60% брендов с длиной равной 0
        self.train_df["tokens"] = self.train_df["name"].str.lower().str.split()
        self.test_df["tokens"] = self.test_df["name"].str.lower().str.split()

        self.train_df["tags"] = self.train_df.apply(apply_bio_tagging, axis=1)

    def setup(self, stage: str):
        self.train_df, self.val_df = train_test_split(self.train_df, test_size=self.val_split_size,
                                                      shuffle = True)
        self.train_dataset = ReceiptsDataset(self.train_df, self.gensim)
        self.val_dataset = ReceiptsDataset(self.val_df, self.gensim)
        self.predict_dataset = ReceiptsDataset(self.test_df, self.gensim)

    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          collate_fn=collate_fn, shuffle = True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          collate_fn=collate_fn, shuffle = True)

    def predict_dataloader(self):
        return torch.utils.data.DataLoader(self.predict_dataset,
                                           batch_size=self.batch_size,
                                           num_workers=self.num_workers,
                                           collate_fn=collate_fn)

In [25]:
TRAIN_DATASET_PATH = "/content/drive/MyDrive/alpha/train_supervised_dataset.csv"
TEST_DATASET_PATH = "/content/drive/MyDrive/alpha/test_dataset.csv"
WORD2VEC_PATH = "/content/drive/MyDrive/alpha/word2vec.model"
VAL_SPLIT_SIZE = 0.1
BATCH_SIZE = 512
NUM_WORKERS = 5

In [26]:
dm = ReceiptsDataModule(
    TRAIN_DATASET_PATH,
    TEST_DATASET_PATH,
    WORD2VEC_PATH,
    VAL_SPLIT_SIZE,
    BATCH_SIZE,
    NUM_WORKERS
)

# Model

Сначала определим метрику `F1` для задачи NER

In [27]:
class F1Score:
    def __init__(self):
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def update(self, pred, target):
        pred = frozenset(x for x in pred)
        target = frozenset(x for x in target)
        self.tp += len(pred & target)
        self.fp += len(pred - target)
        self.fn += len(target - pred)

    def reset(self):
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def get(self):
        if self.tp == 0:
            return 0.0
        precision = self.tp / (self.tp + self.fp)
        recall = self.tp / (self.tp + self.fn)
        return 2 / (1 / precision + 1 / recall)

Зададим саму модель, ее шаги на обучении, валидации и инференсе, а также способ обучения

In [28]:
class ReceiptsModule(pl.LightningModule):
    def __init__(self,
                 rnn_input_size,
                 rnn_hidden_size,
                 rnn_num_layers,
                 rnn_dropout,
                 mlp_hidden_size,
                 learning_rate):
        super().__init__()
        self.learning_rate = learning_rate
        self.rnn = nn.RNN(input_size=rnn_input_size, # вместо RNN была использована biRNN
                           hidden_size=rnn_hidden_size,
                           num_layers=rnn_num_layers,
                           batch_first=True,
                           dropout=rnn_dropout,
                          bidirectional=True
                          )
        self.mlp = nn.Sequential(
            nn.Linear(2*rnn_hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, len(index_to_tag))
        )
        self.criterion = nn.CrossEntropyLoss(ignore_index=tag_to_index["PAD"], reduction="mean")
        self.f1_good_train = F1Score()
        self.f1_brand_train = F1Score()
        self.f1_good_val = F1Score()
        self.f1_brand_val = F1Score()

    def forward(self, sequences):
        sequences, hidden = self.rnn(sequences)
        logits = self.mlp(sequences)
        return logits

    def training_step(self, batch, _):
        ids, tokens_sequence, embeddings_sequence, goods, brands, targets = batch
        logits = self(embeddings_sequence)
        loss = self.criterion(logits.transpose(1, 2), targets)
        tags_indices_sequence = torch.argmax(logits, dim=-1).detach().cpu().numpy().tolist()
        for i, tags_indices in enumerate(tags_indices_sequence):
            tags = [index_to_tag[index] for index in tags_indices[:len(tokens_sequence[i])]]
            entities = get_entities(tags)
            goods_pred = [' '.join(tokens_sequence[i][start:finish + 1]) for t, start, finish in entities if t == "GOOD"]
            brands_pred = [' '.join(tokens_sequence[i][start:finish + 1]) for t, start, finish in entities if t == "BRAND"]
            self.f1_good_train.update(goods_pred, goods[i])
            self.f1_brand_train.update(brands_pred, brands[i])
        self.log("loss/train", loss, on_epoch=True, batch_size=len(tags_indices_sequence))
        return loss

    def on_train_epoch_end(self):
        self.log("metric/f1_good_train", self.f1_good_train.get())
        self.log("metric/f1_brand_train", self.f1_brand_train.get())
        self.f1_good_train.reset()
        self.f1_brand_train.reset()



    def validation_step(self, batch, _):
        ids, tokens_sequence, embeddings_sequence, goods, brands, targets = batch
        logits = self(embeddings_sequence)
        loss = self.criterion(logits.transpose(1, 2), targets)
        tags_indices_sequence = torch.argmax(logits, dim=-1).detach().cpu().numpy().tolist()
        for i, tags_indices in enumerate(tags_indices_sequence):
            tags = [index_to_tag[index] for index in tags_indices[:len(tokens_sequence[i])]]
            entities = get_entities(tags)
            goods_pred = [' '.join(tokens_sequence[i][start:finish + 1]) for t, start, finish in entities if t == "GOOD"]
            brands_pred = [' '.join(tokens_sequence[i][start:finish + 1]) for t, start, finish in entities if t == "BRAND"]
            self.f1_good_val.update(goods_pred, goods[i])
            self.f1_brand_val.update(brands_pred, brands[i])
        self.log("loss/val", loss, batch_size=len(tags_indices_sequence),prog_bar=True)

    def on_validation_epoch_end(self):
        self.log("metric/f1_good_val", self.f1_good_val.get())
        self.log("metric/f1_brand_val", self.f1_brand_val.get())
        self.log("general metric", (self.f1_good_val.get() + self.f1_brand_val.get()*2)/3)
        self.f1_good_val.reset()
        self.f1_brand_val.reset()

    def predict_step(self, batch, _):
        ids, tokens_sequence, embeddings_sequence, _, _, _ = batch
        logits = self(embeddings_sequence)
        tags_indices_sequence = torch.argmax(logits, dim=-1).detach().cpu().numpy().tolist()
        result = list()
        for i, tags_indices in enumerate(tags_indices_sequence):
            tags = [index_to_tag[index] for index in tags_indices[:len(tokens_sequence[i])]]
            entities = get_entities(tags)
            goods_pred = ','.join([' '.join(tokens_sequence[i][start:finish + 1]) for t, start, finish in entities if t == "GOOD"])
            brands_pred = ','.join([' '.join(tokens_sequence[i][start:finish + 1]) for t, start, finish in entities if t == "BRAND"])
            result.append([ids[i], goods_pred, brands_pred])
        return result

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), self.learning_rate)

In [41]:
RNN_INPUT_SIZE = 300
RNN_HIDDEN_SIZE = 300
RNN_NUM_LAYERS = 4
RNN_DROPOUT = 0.1
MLP_HIDDEN_SIZE = 500
LEARNING_RATE = 1e-3
model = ReceiptsModule(
    RNN_INPUT_SIZE,
    RNN_HIDDEN_SIZE,
    RNN_NUM_LAYERS,
    RNN_DROPOUT,
    MLP_HIDDEN_SIZE,
    LEARNING_RATE
)

In [42]:
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    logger=pl.loggers.TensorBoardLogger("tb_logs", name="ner_rnn_baseline"),
    max_epochs=15,
    log_every_n_steps=1
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Обучение модели

In [43]:
trainer.fit(model, datamodule=dm)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | rnn       | RNN              | 1.4 M 
1 | mlp       | Sequential       | 804 K 
2 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.992     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


In [44]:
trainer.validate(model, datamodule=dm)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

[{'loss/val': 0.3239626884460449,
  'metric/f1_good_val': 0.7700950503349304,
  'metric/f1_brand_val': 0.12117039412260056,
  'general metric': 0.3374786078929901}]

1 сабмит│      general metric       │    0.20343613624572754    │
│         loss/val          │    0.5330078601837158     │
│    metric/f1_brand_val    │            0.0            │
│    metric/f1_good_val     │    0.6103084087371826     │

 3 сабмит :
 [{'loss/val': 0.5129594206809998,
  'metric/f1_good_val': 0.6602585315704346,
  'metric/f1_brand_val': 0.11011116951704025,
  'general metric': 0.2934936285018921}]

  4 сабмит:
  [{'loss/val': 0.4051026403903961,
  'metric/f1_good_val': 0.7053451538085938,
  'metric/f1_brand_val': 0.17465028166770935,
  'general metric': 0.35154858231544495}] поменял lr на 1e-3 и убрал 0.6 %
  пустых брендов из трэйна, делал 15 эпох

  5 сабмит:
  [{'loss/val': 0.3605898916721344,
  'metric/f1_good_val': 0.732370913028717,
  'metric/f1_brand_val': 0.301075279712677,
  'general metric': 0.4448404908180237}]
  заменил rnn на bidirectional rnn, скор подрос на 0.2 на паблике

странно, что при использовании полного датасета, не вырезая часть товаров без брендов, модель выдает скор на валидации хуже. Предполагаю, что она просто видит недостаточно товаров с брендами, поэтому скор f1_brand растёт хуже.

Не сказать, что фильтрация текста дала значительный прирост. Скорее всего нужно было нормализовывать слова, как предлагали авторы, потому что в чеках очень много сокращений.

Получение итоговых сущностей для тестового датасета

In [None]:
pred = trainer.predict(model, datamodule=dm)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [None]:
submission = pd.DataFrame(sum(pred, list()), columns=["id", "good", "brand"])
submission

Unnamed: 0,id,good,brand
0,0,клей,ермак
1,1,торт,сладушка
2,2,смеситель,
3,3,,
4,4,коньяк,сараджишвили
...,...,...,...
4995,4995,рамка,
4996,4996,напиток,red bull
4997,4997,наконечники,
4998,4998,шоколад,риттерспорт


In [None]:
submission.brand.value_counts()

                   2667
футболк              11
святой источник      10
нескафе              10
винт                 10
                   ... 
cm                    1
darbo                 1
витамин               1
дымдым                1
риттерспорт           1
Name: brand, Length: 1861, dtype: int64

In [None]:
submission.to_csv("submission_5.csv", index=False)

А теперь посмотрим какой скор будет если обучать модель на всех датасетах

In [None]:
from gensim.models import Word2Vec

In [64]:
TRAIN_DATASET_PATH = "/content/drive/MyDrive/alpha/train_supervised_dataset.csv"
TEST_DATASET_PATH = "/content/drive/MyDrive/alpha/test_dataset.csv"
WV_BIG_PATH = "/content/drive/MyDrive/alpha/big_word2vec.model"
VAL_SPLIT_SIZE = 0.1
BATCH_SIZE = 512
NUM_WORKERS = 5
big_wv_module = ReceiptsDataModule(
    TRAIN_DATASET_PATH,
    TEST_DATASET_PATH,
    WV_BIG_PATH,
    VAL_SPLIT_SIZE,
    BATCH_SIZE,
    NUM_WORKERS
)

In [65]:
RNN_INPUT_SIZE = 300
RNN_HIDDEN_SIZE = 300
RNN_NUM_LAYERS = 4
RNN_DROPOUT = 0.1
MLP_HIDDEN_SIZE = 500
LEARNING_RATE = 1e-3
big_wv_model = ReceiptsModule(
    RNN_INPUT_SIZE,
    RNN_HIDDEN_SIZE,
    RNN_NUM_LAYERS,
    RNN_DROPOUT,
    MLP_HIDDEN_SIZE,
    LEARNING_RATE
)

In [66]:
big_wv_trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    logger=pl.loggers.TensorBoardLogger("tb_logs", name="ner_rnn_baseline"),
    max_epochs=15,
    log_every_n_steps=1
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [67]:
big_wv_trainer.fit(big_wv_model, datamodule=big_wv_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | rnn       | RNN              | 2.0 M 
1 | mlp       | Sequential       | 804 K 
2 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
2.8 M     Trainable params
0         Non-trainable params
2.8 M     Total params
11.157    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [68]:
big_wv_trainer.validate(big_wv_model, datamodule=big_wv_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

[{'loss/val': 0.16101084649562836,
  'metric/f1_good_val': 0.8555946946144104,
  'metric/f1_brand_val': 0.5417814254760742,
  'general metric': 0.6463858485221863}]

[{'loss/val': 0.13999679684638977,
  'metric/f1_good_val': 0.8679407238960266,
  'metric/f1_brand_val': 0.4592178761959076,
  'general metric': 0.5954588055610657}] - совершенно другой уровень

In [70]:
pred = big_wv_trainer.predict(big_wv_model, datamodule=big_wv_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [71]:
submission = pd.DataFrame(sum(pred, list()), columns=["id", "good", "brand"])
submission

Unnamed: 0,id,good,brand
0,0,клей,ермак
1,1,торт,сладушка
2,2,смеситель,
3,3,,
4,4,коньяк,
...,...,...,...
4995,4995,рамка,
4996,4996,напиток,red bull
4997,4997,наконечники,
4998,4998,шоколад,риттерспорт


In [None]:
submission.to_csv("gensim_baseline.csv", index=False)