In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader

import os
import cv2
from PIL import Image
from tqdm.notebook import tqdm
from tqdm.contrib.telegram import tqdm as tgdm_tg
from prettytable import PrettyTable
import random
import numpy as np
import gc
import scipy.io as sio
import pandas as pd
import time

from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef

from torchvision import datasets, transforms as T

from transformers import AutoTokenizer, AutoModel, BertModel

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

In [3]:
# Sets the TOKENIZERS_PARALLELISM environment variable for this process
# (consulted by the Hugging Face tokenizers library) before any tokenizer is used.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [4]:
def clear_cache():
    """Release unused host and CUDA memory.

    Runs the Python garbage collector, asks PyTorch to empty its CUDA
    caching allocator, then collects garbage once more.
    """
    for release in (gc.collect, torch.cuda.empty_cache, gc.collect):
        release()

def set_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), then configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers all visible GPUs rather than only the current one; silently
    # ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it is switched off as well.
    torch.backends.cudnn.benchmark = False

def print_gpu_utilization():
    """Return a one-line summary of the memory currently used on GPU 0.

    Despite the name, nothing is printed: the formatted string is returned
    to the caller. NVML is initialised on every call.
    """
    nvmlInit()
    gpu0 = nvmlDeviceGetHandleByIndex(0)
    used_mb = nvmlDeviceGetMemoryInfo(gpu0).used // 1024**2
    return f"GPU memory occupied: {used_mb} MB."

In [5]:
# Free whatever memory an earlier run in this kernel may still be holding.
clear_cache()

In [6]:
def count_parameters(model):
    """Print a detailed per-parameter size report for a model.

    Args:
        model: ``torch.nn.Module`` whose ``named_parameters()`` are listed.

    Frozen parameters (``requires_grad`` is False) are printed right away as
    ``name numel`` and are excluded from both the table and the total. The
    table of trainable parameters and their summed element count are printed
    at the end. Nothing is returned.
    """
    summary = PrettyTable(["Modules", "Parameters"])
    trainable_total = 0
    for param_name, tensor in model.named_parameters():
        if tensor.requires_grad:
            size = tensor.numel()
            summary.add_row([param_name, size])
            trainable_total += size
        else:
            print(param_name, tensor.numel())
    print(summary)
    print(f"Total Trainable Params: {trainable_total}")

In [7]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Local checkpoint directory; the tokenizer is loaded from the same path
# that the encoder weights are later read from.
model_name = "outputs/checkpoint-7496-epoch-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
# Load the two source tables from Feather files.
# NOTE(review): `contracts` is not referenced anywhere else in the visible
# part of the notebook — confirm it is still needed.
contracts = pd.read_feather("data/contracts.feather")
goods = pd.read_feather("data/goods.feather")

In [9]:
# Keep the original product name in a *_source column, then overwrite the
# working column with a lower-cased, whitespace-stripped version.
# NOTE: this mutates `goods` in place, so re-running the cell copies the
# already-normalised names into the *_source column.
goods["Название_СТЕ_source"] = goods["Название СТЕ"]
goods["Название СТЕ"] = goods["Название СТЕ"].str.lower().str.strip()

In [10]:
def standardization_characteristics(characteristics: str) -> str:
    """Condense a serialized list of characteristics into a short sorted string.

    The input is the text of a list of dicts (JSON, or a Python literal).
    It is lower-cased before parsing, so the keys are matched as ``"name"``,
    ``"value"`` and ``"unit"``.

    Only entries whose value has at most three whitespace-separated words
    contribute, using the first rule that applies:

    * the entry has a unit      -> ``"<value> <unit>"``
    * the value is "да" / "нет" -> the characteristic name, if it has at most
      three words (otherwise the entry is dropped)
    * anything else             -> the value itself

    Args:
        characteristics: Serialized characteristics. Anything that is not a
            string (e.g. None / NaN) yields an empty result.

    Returns:
        The stripped, de-duplicated pieces sorted and joined with ``", "``,
        or ``""`` when the input cannot be parsed into a list.
    """
    # Local imports keep this cell runnable on its own.
    import ast
    import json

    if not isinstance(characteristics, str):
        return ""
    text = characteristics.lower()

    # The data is parsed, never executed: JSON first (this also understands
    # true/false/null, which made eval() fail), then a Python literal.
    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return ""
    if not isinstance(parsed, (list, tuple)):
        return ""

    pieces = []
    for entry in parsed:
        # A single malformed entry is skipped instead of blanking the row.
        if not isinstance(entry, dict):
            continue
        value = entry.get("value")
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            value = str(value)
        if not isinstance(value, str):
            continue
        if len(value.split()) > 3:
            continue

        unit = entry.get("unit")
        if unit is not None:
            pieces.append(f"{value} {unit}")
        elif value in ("да", "нет"):
            # Yes/no values carry no information on their own; the short
            # characteristic name is used instead.
            name = entry.get("name")
            if isinstance(name, str) and len(name.split()) <= 3:
                pieces.append(name)
        else:
            pieces.append(value)

    return ", ".join(sorted({piece.strip() for piece in pieces}))

In [11]:
# Preserve the raw characteristics text, then replace the working column with
# the condensed string produced by standardization_characteristics.
# NOTE: in-place overwrite — re-running this cell feeds the already condensed
# strings back into the parser.
goods["Характеристики_source"] = goods["Характеристики"]
goods["Характеристики"] = goods["Характеристики"].apply(standardization_characteristics)

In [12]:
# One row per unique (name, KPGZ code, characteristics) triple; the *_source columns ride along.
selected = goods[["Название СТЕ", "Код КПГЗ", "Характеристики", "Название_СТЕ_source", "Характеристики_source"]].drop_duplicates(subset=["Название СТЕ", "Код КПГЗ", "Характеристики"])
# Number of unique rows per KPGZ code.
count_df = goods[["Название СТЕ", "Код КПГЗ", "Характеристики"]].drop_duplicates().groupby("Код КПГЗ").agg("count").reset_index()
# Codes with at least this many rows become target classes.
# NOTE(review): a threshold of 1 keeps every code that has a non-null name, so this filter is effectively a no-op.
classes = count_df.loc[count_df["Название СТЕ"] >= 1, "Код КПГЗ"].to_list()

In [13]:
# Number of KPGZ codes kept as target classes.
len(classes)

5307

In [14]:
class BertCLS(nn.Module):
    """Encoder followed by a single linear classification layer.

    Args:
        model: Encoder module. It is called as ``model(**batch)`` and its
            result must expose ``pooler_output``.
        n_classes: Number of logits produced by the head.
    """

    def __init__(self, model, n_classes):
        super().__init__()
        self.model = model
        # Read the feature width from the encoder's config when it has one,
        # so the head also fits checkpoints that are not 768 wide. 768 (the
        # previously hard-coded value) remains the fallback.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, batch):
        """Return class logits for a mapping of encoder keyword inputs."""
        return self.fc(self.model(**batch).pooler_output)

In [16]:

# NOTE(review): repeats the model_name assignment made where the tokenizer is loaded.
model_name = "outputs/checkpoint-7496-epoch-2"
# NOTE(review): the classification head lives in BertCLS.fc, so num_labels and
# ignore_mismatched_sizes look unnecessary for a bare BertModel — confirm before removing.
# The loading log below reports the pooler weights as newly initialised;
# BertCLS.forward reads pooler_output, so the pooler is trained from scratch here.
model = BertModel.from_pretrained(
            model_name, 
            ignore_mismatched_sizes=True, 
            num_labels=len(classes)
        )
bert_cls = BertCLS(model, n_classes=len(classes))

Some weights of the model checkpoint at outputs/checkpoint-7496-epoch-2 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at outputs/checkpoint-7496-epoch-2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

In [17]:
# Keep only rows whose KPGZ code is one of the target classes.
selected2 = selected[selected["Код КПГЗ"].isin(classes)].reset_index(drop=True)
# Missing characteristics become empty strings.
selected2["Характеристики"] = selected2["Характеристики"].replace(np.nan, "")
# Model input text: the normalised product name only. The characteristics
# suffix is disabled by the trailing comment on the next line.
selected2["text"] = selected2["Название СТЕ"].str.strip().str.lower() # + " [SEP] " + selected2["Характеристики"].str.strip().str.lower()

In [18]:
# Rows per class; the output below shows a long tail of classes with a single row.
selected2["Код КПГЗ"].value_counts()

01.13.13.01.01          9262
01.14.01.01             6906
01.11.03.01.01.99       4646
01.09.08.01.99          3886
01.11.02.02.99          3829
                        ... 
01.01.10.03.03             1
01.02.10.02.01.01          1
01.02.10.09.02.25.01       1
01.26.02.04.02.01          1
01.02.10.37.01.01.04       1
Name: Код КПГЗ, Length: 5307, dtype: int64

In [19]:
# Preview of the prepared frame.
selected2

Unnamed: 0,Название СТЕ,Код КПГЗ,Характеристики,Название_СТЕ_source,Характеристики_source,text
0,мяч футбольный mikasa regateador5-g,01.08.01.13.01,"5 усл. ед, mikasa, regateador5-r, белый, любит...",мяч футбольный MIKASA REGATEADOR5-G,"[{""Name"":""Модель"",""Id"":283795036,""Value"":""REGA...",мяч футбольный mikasa regateador5-g
1,мяч волейбольный gala pro-line 10 fivb,01.08.01.14.03.01,"5, gala, pro-line 10 fivb, белый, профессионал...",мяч волейбольный Gala Pro-Line 10 FIVB,"[{""Name"":""Марка"",""Id"":284249992,""Value"":""Gala""...",мяч волейбольный gala pro-line 10 fivb
2,мяч волейбольный mikasa mva380k-obl,01.08.01.14.03.01,"5, mikasa, mva380k, оранжевый, синт, тренровочный",мяч волейбольный Mikasa MVA380K-OBL,"[{""Name"":""Марка"",""Id"":284249802,""Value"":""MIKAS...",мяч волейбольный mikasa mva380k-obl
3,мяч волейбольный wilson super soft play,01.08.01.14.03.01,"5, super soft play, wilson, белый, любительски...",мяч волейбольный Wilson Super Soft Play,"[{""Name"":""Марка"",""Id"":284246959,""Value"":""WILSO...",мяч волейбольный wilson super soft play
4,gutrend комплект расходных материалов для fun ...,01.20.10.99,"120 г, 170 мм, 220 мм, 50 мм, 6 шт, hepa фильт...",Gutrend комплект расходных материалов для FUN ...,"[{""Name"":""Тип"",""Id"":284280400,""Value"":""Расходн...",gutrend комплект расходных материалов для fun ...
...,...,...,...,...,...,...
354367,шина 23x10.50-12 107a8 starco as loader tl,01.09.08.10.05.02,"10 кг, 12 дюйм, 580 мм, 8 pr, starco, бескамер...",Шина 23x10.50-12 107A8 Starco AS LOADER TL,"[{""Name"":""Бескамерные"",""Id"":369460372,""Value"":...",шина 23x10.50-12 107a8 starco as loader tl
354368,кнопка включения рабочего освещения для минипо...,01.09.08.03,"12 в, 650, 90 г, avant, включение рабочего осв...",Кнопка включения рабочего освещения для минипо...,"[{""Name"":""Артикул запчасти"",""Id"":369463362,""Va...",кнопка включения рабочего освещения для минипо...
354369,"обои флизелиновые под покраску nc antivandal, ...",01.11.03.11.08,"106 см, 25 м, 350 г, 4 шт, 4010-16, antivandal...","Обои флизелиновые под покраску NC Antivandal, ...","[{""Name"":""Описание"",""Id"":369399594,""Value"":""со...","обои флизелиновые под покраску nc antivandal, ..."
354370,мусорное ведро kimberly-clark aquarius белое п...,01.20.03.03.06,"2 шт, 29.00000 см, 4,76 кг, 43.00000 см, 57.00...",Мусорное ведро Kimberly-Clark Aquarius белое п...,"[{""Name"":""Вид емкости"",""Id"":369380812,""Value"":...",мусорное ведро kimberly-clark aquarius белое п...


In [20]:
# Map each KPGZ code to an integer class id; `le` is needed later to map
# predicted ids back to codes (le.inverse_transform).
le = preprocessing.LabelEncoder()
selected2["target"] = le.fit_transform(selected2["Код КПГЗ"])

In [21]:
# Hold out 1% of the rows for evaluation. `.values` yields an object array of
# [text, target] pairs, which is what ClassificationDataset indexes into.
# NOTE(review): the split is not stratified and many classes have a single
# row (see value_counts above), so some classes can end up only in the test set.
X_train, X_test = train_test_split(selected2[["text", "target"]].values,
                                   test_size=0.01,
                                   random_state=42,
                                   shuffle=True)

In [22]:
# (number of test rows, number of train rows)
len(X_test), len(X_train)

(3544, 350828)

In [23]:
# X_train, X_test = X_train[:10000], X_test[:10000]

In [24]:
# Average text length in characters (not tokens).
selected2["text"].str.len().mean()

48.40319776957548

In [25]:
class ClassificationDataset(Dataset):
    """Map-style dataset over an indexable collection of ``(text, target)`` rows."""

    def __init__(self, data):
        """Store ``data``; every ``data[i]`` must unpack into exactly two items."""
        super().__init__()
        self.data = data

    def __len__(self):
        """Number of rows in the underlying collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` as a ``(text, target)`` tuple."""
        row = self.data[idx]
        text, target = row
        return (text, target)

def collate_fn(batch):
    """Turn a list of ``(text, target)`` samples into model inputs and labels.

    The texts are tokenized together with the module-level ``tokenizer``,
    padded to the longest sequence in the batch and truncated to 200 tokens.

    Returns:
        A pair ``(encoded, labels)``: the tokenizer output with PyTorch
        tensors, and the targets as a ``torch.long`` tensor.
    """
    texts = [text for text, _ in batch]
    targets = [target for _, target in batch]

    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=200,
        padding=True,
        return_tensors='pt',
    )
    labels = torch.tensor(targets).long()
    return encoded, labels

In [26]:
def get_loader(dataset, shuffle, batch_size):
    """Build a ``DataLoader`` for ``dataset`` that batches with ``collate_fn``.

    Args:
        dataset: Dataset to iterate over.
        shuffle: Whether the samples are reshuffled every epoch.
        batch_size: Number of samples per batch.

    Loading happens in the main process (``num_workers=0``) without pinned
    memory.
    """
    options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
        num_workers=0,
        pin_memory=False,
    )
    return DataLoader(dataset, **options)

In [27]:
# Same batch size for training and evaluation.
batch_size = 128
# Training batches are reshuffled every epoch.
train_dataset = ClassificationDataset(X_train)
train_loader = get_loader(train_dataset, shuffle=True, batch_size=batch_size)

# Evaluation keeps the original order so runs are comparable.
test_dataset = ClassificationDataset(X_test)
test_loader = get_loader(test_dataset, shuffle=False, batch_size=batch_size)

In [28]:
# NOTE(review): the seed is set here, after BertCLS (and its randomly
# initialised head) was built in an earlier cell, so the initial weights are
# not covered by it.
set_seed(42)

bert_cls = bert_cls.to(device)

num_epochs = 5
gradient_accumulation_steps = 1

# One scheduler step per batch over all epochs.
# NOTE: train() only steps the scheduler on accumulation boundaries, so this
# count matches the schedule only while gradient_accumulation_steps == 1.
total_steps = (len(train_loader) * num_epochs)
optimizer = optim.AdamW(bert_cls.parameters(), lr=2e-4)
# One-cycle schedule: the first 10% of the steps ramp up from max_lr / 25 to
# max_lr, the remainder anneals back down.
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=2e-4,
                                          total_steps=total_steps,
                                          div_factor=25,
                                          pct_start=0.1)
loss_func = torch.nn.CrossEntropyLoss()

In [29]:
def train(model, loss_func, device, train_loader, optimizer, epoch, gradient_accumulation_steps, scheduler):
    """Run one training epoch and return the model.

    Args:
        model: Module called as ``model(data)``; its output is passed to
            ``loss_func`` together with the labels.
        loss_func: Callable ``loss_func(logits, labels)`` returning a scalar.
        device: Device the inputs and labels are moved to.
        train_loader: Sized iterable of ``(data, labels)`` batches.
        optimizer: Optimizer stepped once per accumulation group.
        epoch: Current epoch number. Kept for call compatibility; unused.
        gradient_accumulation_steps: Number of consecutive batches whose
            gradients are summed before a single optimizer step.
        scheduler: Learning-rate scheduler stepped together with the optimizer.

    Progress goes to the Telegram tqdm bar when both ``TQDM_TELEGRAM_TOKEN``
    and ``TQDM_TELEGRAM_CHAT_ID`` are set in the environment, otherwise to
    the regular notebook bar.
    """
    model.train()
    # Index of the final batch, so a trailing partial group is still applied.
    last_batch_idx = len(train_loader) - 1

    # Credentials are read from the environment and never stored in the
    # notebook. The token that used to be hard-coded here must be revoked.
    token = os.environ.get("TQDM_TELEGRAM_TOKEN")
    chat_id = os.environ.get("TQDM_TELEGRAM_CHAT_ID")
    if token and chat_id:
        pbar = tgdm_tg(train_loader, token=token, chat_id=chat_id)
    else:
        pbar = tqdm(train_loader)

    # Start from clean gradients in case an earlier run was interrupted.
    optimizer.zero_grad()

    for batch_idx, (data, labels) in enumerate(pbar):
        data, labels = data.to(device), labels.to(device)

        # Wall-clock time of the forward call only. CUDA kernels are launched
        # asynchronously, so on a GPU this can under-report.
        started = time.time()
        logits = model(data)
        forward_seconds = time.time() - started

        # Scale so the accumulated gradient matches one large batch.
        loss = loss_func(logits, labels) / gradient_accumulation_steps

        # Host RAM from the "Total:" line of `free`; the pipe is closed
        # explicitly, and a missing `free` no longer aborts training.
        try:
            with os.popen('free -t -m') as pipe:
                _, used_m, free_m = map(int, pipe.readlines()[-1].split()[1:])
        except (IndexError, ValueError):
            used_m = free_m = "n/a"
        pbar.set_description(f"{print_gpu_utilization()} Used RAM: {used_m} Free RAM: {free_m} Loss Train: {float(loss) * gradient_accumulation_steps} Time: {forward_seconds}")

        loss.backward()

        # Step after every full group of `gradient_accumulation_steps`
        # batches. The previous `batch_idx % n == 0` test stepped after the
        # very first batch.
        if ((batch_idx + 1) % gradient_accumulation_steps == 0) or (batch_idx == last_batch_idx):
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)  # keeps updates bounded
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        # Drop references early so the tensors can be freed before the next batch.
        del data, labels, logits, loss
    return model

In [30]:
def test(model, loader, device):
    """Predict a class for every sample produced by ``loader``.

    Args:
        model: Module called as ``model(data)`` and returning per-class
            scores with the class dimension last.
        loader: Iterable of ``(data, labels)`` batches.
        device: Device the inputs are moved to.

    Returns:
        ``(y_true, pred)``: two lists with one integer per sample, holding
        the reference labels and the arg-max predictions.

    The model is left in evaluation mode. Progress goes to the Telegram tqdm
    bar when both ``TQDM_TELEGRAM_TOKEN`` and ``TQDM_TELEGRAM_CHAT_ID`` are
    set in the environment, otherwise to the regular notebook bar.
    """
    y_true = []
    pred = []
    model.eval()

    # Credentials are read from the environment and never stored in the
    # notebook. The token that used to be hard-coded here must be revoked.
    token = os.environ.get("TQDM_TELEGRAM_TOKEN")
    chat_id = os.environ.get("TQDM_TELEGRAM_CHAT_ID")
    if token and chat_id:
        pbar = tgdm_tg(loader, token=token, chat_id=chat_id)
    else:
        pbar = tqdm(loader)

    with torch.no_grad():
        for data, labels in pbar:
            logits = model(data.to(device))
            pred.extend(logits.argmax(-1).detach().cpu().numpy())
            y_true.extend(labels.detach().cpu().numpy())

    return y_true, pred

In [31]:
def metrics(y_true, pred):
    """Print Matthews correlation, accuracy and weighted F1 for predictions.

    Args:
        y_true: Reference labels.
        pred: Predicted labels, aligned with ``y_true``.
    """
    # Each score is computed right before it is printed, one after another.
    scorers = (
        ("matthews_corrcoef:", lambda: matthews_corrcoef(y_true, pred)),
        ("accuracy_score:", lambda: accuracy_score(y_true, pred)),
        ("f1_score:", lambda: f1_score(y_true, pred, average='weighted')),
    )
    for label, compute in scorers:
        print(label, compute())

In [32]:
# Make every parameter trainable: the encoder is fine-tuned together with the head.
for x in bert_cls.parameters(): x.requires_grad = True

In [33]:
# Parameter table for the full model (the captured output below is truncated).
count_parameters(bert_cls)

+----------------------------------------------------------+------------+
|                         Modules                          | Parameters |
+----------------------------------------------------------+------------+
|         model.embeddings.word_embeddings.weight          |  42303744  |
|       model.embeddings.position_embeddings.weight        |   393216   |
|      model.embeddings.token_type_embeddings.weight       |    1536    |
|            model.embeddings.LayerNorm.weight             |    768     |
|             model.embeddings.LayerNorm.bias              |    768     |
|    model.encoder.layer.0.attention.self.query.weight     |   589824   |
|     model.encoder.layer.0.attention.self.query.bias      |    768     |
|     model.encoder.layer.0.attention.self.key.weight      |   589824   |
|      model.encoder.layer.0.attention.self.key.bias       |    768     |
|    model.encoder.layer.0.attention.self.value.weight     |   589824   |
|     model.encoder.layer.0.attention.

In [34]:
# Create the checkpoint directory from Python so that re-running the cell
# does not fail (or print an error) when it already exists.
os.makedirs("models-all-classes", exist_ok=True)
for epoch in range(1, num_epochs + 1):
    bert_cls.train()

    clear_cache()
    train(bert_cls, loss_func, device, train_loader, optimizer,
          epoch, gradient_accumulation_steps, scheduler)
    # One checkpoint per epoch, named after the model class and epoch number.
    PATH = f"./models-all-classes/{bert_cls.__class__.__name__}_epoch_{epoch}.pth"
    torch.save(bert_cls.state_dict(), PATH)

    # Evaluate on the held-out split after every epoch.
    clear_cache()
    y_true, pred = test(bert_cls, test_loader, device)
    metrics(y_true, pred)

  self.message_id


  0%|          | 0/2741 [00:00<?, ?it/s]

  message_id = self.message_id


  0%|          | 0/28 [00:00<?, ?it/s]

matthews_corrcoef: 0.6122059714836954
accuracy_score: 0.613431151241535
f1_score: 0.5791454387279265


  self.message_id


  0%|          | 0/2741 [00:00<?, ?it/s]

  message_id = self.message_id


  0%|          | 0/28 [00:00<?, ?it/s]

matthews_corrcoef: 0.660968119576843
accuracy_score: 0.6619638826185101
f1_score: 0.6429023598383253


  self.message_id


  0%|          | 0/2741 [00:00<?, ?it/s]

  message_id = self.message_id


  0%|          | 0/28 [00:00<?, ?it/s]

matthews_corrcoef: 0.7059325758329084
accuracy_score: 0.7068284424379232
f1_score: 0.6915778077337997


  0%|          | 0/2741 [00:00<?, ?it/s]

  self.message_id


  0%|          | 0/28 [00:00<?, ?it/s]

  message_id = self.message_id


matthews_corrcoef: 0.7291363385238504
accuracy_score: 0.7299661399548533
f1_score: 0.7180479724868759


  self.message_id


  0%|          | 0/2741 [00:00<?, ?it/s]

  message_id = self.message_id


  0%|          | 0/28 [00:00<?, ?it/s]

matthews_corrcoef: 0.7390330677241878
accuracy_score: 0.739841986455982
f1_score: 0.7286397113753839


In [None]:
# NOTE(review): checkpoints are written to 'models-all-classes', not 'models' —
# confirm which directory this listing is meant to show.
!ls models

In [65]:
# Re-evaluate every saved epoch checkpoint on the held-out split.
for epoch in range(1, num_epochs + 1):
    PATH = f"./models-all-classes/{bert_cls.__class__.__name__}_epoch_{epoch}.pth"
    # map_location lets a checkpoint saved from the GPU load on whatever
    # device this session uses (including CPU-only machines).
    bert_cls.load_state_dict(torch.load(PATH, map_location=device))

    clear_cache()
    y_true, pred = test(bert_cls, test_loader, device)
    metrics(y_true, pred)

  0%|          | 0/18 [00:00<?, ?it/s]

matthews_corrcoef: 0.771881612975255
accuracy_score: 0.7725733634311512
f1_score: 0.7508478393125875


  0%|          | 0/18 [00:00<?, ?it/s]

matthews_corrcoef: 0.8335723984101997
accuracy_score: 0.8340857787810384
f1_score: 0.8216926411370618


  0%|          | 0/18 [00:00<?, ?it/s]

matthews_corrcoef: 0.8703658720837022
accuracy_score: 0.8707674943566591
f1_score: 0.8638951893594008


  0%|          | 0/18 [00:00<?, ?it/s]

matthews_corrcoef: 0.8921547536166197
accuracy_score: 0.8924943566591422
f1_score: 0.8882467823768128


  0%|          | 0/18 [00:00<?, ?it/s]

matthews_corrcoef: 0.8966834027124371
accuracy_score: 0.8970090293453724
f1_score: 0.8936225594372551


In [100]:
# Persist the prepared frame so preprocessing need not be repeated.
selected2.to_feather("preprocess.feather")

In [99]:
# NOTE(review): the captured output below shows a "text" column containing
# "[SEP] <characteristics>", which the visible code that builds "text" does
# not produce, and the execution counters here are out of order — the output
# comes from a different kernel state. Re-run top to bottom before relying on it.
selected2

Unnamed: 0,Название СТЕ,Код КПГЗ,Характеристики,Название_СТЕ_source,Характеристики_source,text
0,мяч футбольный mikasa regateador5-g,01.08.01.13.01,"5 усл. ед, mikasa, regateador5-r, белый, любит...",мяч футбольный MIKASA REGATEADOR5-G,"[{""Name"":""Модель"",""Id"":283795036,""Value"":""REGA...",мяч футбольный mikasa regateador5-g [SEP] 5 ус...
1,мяч волейбольный gala pro-line 10 fivb,01.08.01.14.03.01,"5, gala, pro-line 10 fivb, белый, профессионал...",мяч волейбольный Gala Pro-Line 10 FIVB,"[{""Name"":""Марка"",""Id"":284249992,""Value"":""Gala""...",мяч волейбольный gala pro-line 10 fivb [SEP] 5...
2,мяч волейбольный mikasa mva380k-obl,01.08.01.14.03.01,"5, mikasa, mva380k, оранжевый, синт, тренровочный",мяч волейбольный Mikasa MVA380K-OBL,"[{""Name"":""Марка"",""Id"":284249802,""Value"":""MIKAS...","мяч волейбольный mikasa mva380k-obl [SEP] 5, m..."
3,мяч волейбольный wilson super soft play,01.08.01.14.03.01,"5, super soft play, wilson, белый, любительски...",мяч волейбольный Wilson Super Soft Play,"[{""Name"":""Марка"",""Id"":284246959,""Value"":""WILSO...",мяч волейбольный wilson super soft play [SEP] ...
4,gutrend комплект расходных материалов для fun ...,01.20.10.99,"120 г, 170 мм, 220 мм, 50 мм, 6 шт, hepa фильт...",Gutrend комплект расходных материалов для FUN ...,"[{""Name"":""Тип"",""Id"":284280400,""Value"":""Расходн...",gutrend комплект расходных материалов для fun ...
...,...,...,...,...,...,...
354367,шина 23x10.50-12 107a8 starco as loader tl,01.09.08.10.05.02,"10 кг, 12 дюйм, 580 мм, 8 pr, starco, бескамер...",Шина 23x10.50-12 107A8 Starco AS LOADER TL,"[{""Name"":""Бескамерные"",""Id"":369460372,""Value"":...",шина 23x10.50-12 107a8 starco as loader tl [SE...
354368,кнопка включения рабочего освещения для минипо...,01.09.08.03,"12 в, 650, 90 г, avant, включение рабочего осв...",Кнопка включения рабочего освещения для минипо...,"[{""Name"":""Артикул запчасти"",""Id"":369463362,""Va...",кнопка включения рабочего освещения для минипо...
354369,"обои флизелиновые под покраску nc antivandal, ...",01.11.03.11.08,"106 см, 25 м, 350 г, 4 шт, 4010-16, antivandal...","Обои флизелиновые под покраску NC Antivandal, ...","[{""Name"":""Описание"",""Id"":369399594,""Value"":""со...","обои флизелиновые под покраску nc antivandal, ..."
354370,мусорное ведро kimberly-clark aquarius белое п...,01.20.03.03.06,"2 шт, 29.00000 см, 4,76 кг, 43.00000 см, 57.00...",Мусорное ведро Kimberly-Clark Aquarius белое п...,"[{""Name"":""Вид емкости"",""Id"":369380812,""Value"":...",мусорное ведро kimberly-clark aquarius белое п...
