In [1]:
# Show the kernel's working directory; the relative file paths used in later cells resolve against it.
!pwd

/home/jovyan/minerals/gerasimov/exp_fold_training/temp


In [2]:
# Report the GPUs visible to this kernel and their current memory usage.
!nvidia-smi

Sat Oct 22 23:23:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100 Graphics D...  On   | 00000000:B7:00.0 Off |                    0 |
| N/A   36C    P0    66W / 400W |      0MiB / 81252MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# !gdown --fuzzy --folder https://drive.google.com/drive/folders/1XwjWRA6fcqfiTuxHhr88KoSjKISyI4sp?usp=sharing

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader

import os
import cv2
from PIL import Image
from tqdm.notebook import tqdm
from tqdm.contrib.telegram import tqdm as tgdm_tg
from prettytable import PrettyTable
import random
import numpy as np
import gc
import scipy.io as sio
import pandas as pd
import time

from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef

from torchvision import datasets, transforms as T

from transformers import AutoTokenizer, AutoModel

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

In [5]:
# NOTE: this rebinds `tqdm`. The main import cell took it from `tqdm.notebook`;
# after this cell the name refers to the plain `tqdm.tqdm` class.
from tqdm import tqdm

# Registers `progress_apply` on pandas objects (used by the preprocessing cells).
tqdm.pandas()

In [6]:
# !ls /home/jovyan/nltk_data/tokenizers/punkt/PY3

In [7]:
#!mv /home/jovyan/nltk_data/tokenizers/punkt/PY3/russian.pickle /home/jovyan/nltk_data/tokenizers/punkt/PY3/ru.pickle

In [8]:
import pymorphy2
import nltk
from string import punctuation

# nltk.download('punkt', download_dir="/home/jovyan/nltk_data")

# Rebind the imported punctuation string as a set, so the `in` check in
# `clear_text` matches whole tokens rather than substrings of the string.
punctuation = set(punctuation)
# Morphological analyzer that `clear_text` uses to obtain each token's normal form.
morph = pymorphy2.MorphAnalyzer()

In [9]:
def clear_text(text):
    """Tokenize `text`, drop punctuation tokens and lemmatize what remains.

    Tokenization is delegated to `nltk.word_tokenize` (language "ru"); each
    surviving token is replaced by the normal form of its first pymorphy2
    parse. The lemmas are returned joined by single spaces.
    """
    lemmas = []
    for token in nltk.word_tokenize(text, language="ru"):
        if token in punctuation:
            # Punctuation tokens are dropped entirely.
            continue
        first_parse = morph.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return " ".join(lemmas)

In [10]:
# Opt in to parallelism inside the Hugging Face `tokenizers` library.
# NOTE(review): presumably safe here because the DataLoader uses num_workers=0 (no forked workers) — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [11]:
def clear_cache():
    """Release memory between training stages.

    Runs the Python garbage collector, asks PyTorch to empty its CUDA cache,
    then runs the collector a second time.
    """
    for release in (gc.collect, torch.cuda.empty_cache, gc.collect):
        release()

def set_seed(seed):
    """Seed every random number generator the notebook relies on.

    Args:
        seed: integer seed applied to Python's `random`, NumPy and PyTorch
            (CPU and all CUDA devices).

    Also switches cuDNN to deterministic algorithms and disables its
    auto-tuner, which may otherwise select different kernels between runs.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not only the current one; ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def print_gpu_utilization():
    """Return a one-line description of the memory in use on GPU 0.

    Despite the name nothing is printed: the caller receives the string
    "GPU memory occupied: <N> MB.", where N is the NVML `used` figure
    floor-divided by 1024**2.
    """
    nvmlInit()
    used_bytes = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0)).used
    used_mb = used_bytes // 1024**2
    return f"GPU memory occupied: {used_mb} MB."

In [12]:
def count_parameters(model):
    """Print a detailed breakdown of the model's parameters.

    model: torch.nn.Module

    Trainable parameters are collected into a table (name, element count)
    and summed; frozen parameters are printed immediately as "name count"
    and excluded from the total. Nothing is returned.
    """
    summary = PrettyTable(["Modules", "Parameters"])
    trainable_total = 0
    for param_name, tensor in model.named_parameters():
        size = tensor.numel()
        if tensor.requires_grad:
            summary.add_row([param_name, size])
            trainable_total += size
        else:
            print(param_name, size)
    print(summary)
    print(f"Total Trainable Params: {trainable_total}")

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Load the two source tables from Feather files in the working directory.
# NOTE(review): `contracts` is not referenced by any other cell visible in this notebook.
contracts = pd.read_feather("contracts.feather")
goods = pd.read_feather("goods.feather")

In [None]:
# Keep the raw product name in a "_source" column, then overwrite the original
# column with its lower-cased, stripped and lemmatized form (see `clear_text`).
# NOTE: not idempotent — running this cell twice stores the already-cleaned text as the "source".
goods["Название_СТЕ_source"] = goods["Название СТЕ"]
goods["Название СТЕ"] = goods["Название СТЕ"].str.lower().str.strip().progress_apply(clear_text)

In [None]:
# Display the product-name column for a visual check.
goods["Название СТЕ"]

In [None]:
# Persist the frame to disk; presumably a checkpoint so the slow lemmatization need not be repeated.
goods.to_feather("goods_2.feather")

In [None]:
def standardization_characteristics(characteristics: str):
    """Condense a serialized list of product characteristics into a short string.

    The input is lower-cased and parsed as a list of dicts that may carry the
    keys "name", "value" and "unit". For each entry whose value is a string
    of at most three words:

    * if a unit is present, "<value> <unit>" is kept;
    * otherwise, if the value is "да"/"нет", the characteristic *name* is kept
      (only when the name itself has at most three words);
    * otherwise the value is kept.

    Kept fragments are cleaned with `clear_text`, de-duplicated, sorted and
    joined with ", ".

    Args:
        characteristics: JSON text (or a Python-literal list of dicts).

    Returns:
        The condensed string, or "" when the input is missing or cannot be
        parsed or processed (best effort — this runs inside `progress_apply`
        over the whole table).
    """
    import ast
    import json

    try:
        lowered = characteristics.lower()
        try:
            # Parse as data instead of eval(): no code execution, and the JSON
            # literals true/false/null no longer raise NameError and blank the row.
            entries = json.loads(lowered)
        except ValueError:
            # Not JSON (e.g. single-quoted Python repr) — still parse as a pure literal.
            entries = ast.literal_eval(lowered)

        data = []
        for characteristic in entries:
            if not isinstance(characteristic, dict):
                continue
            value = characteristic.get("value")
            # Long free-text values are descriptions, not compact attributes.
            if not isinstance(value, str) or len(value.split()) > 3:
                continue
            unit = characteristic.get("unit")
            if unit is not None:
                data.append(f"{value} {unit}")
            elif value in ("да", "нет"):
                # A yes/no flag carries no information by itself; keep the flag's name.
                name = characteristic.get("name")
                if isinstance(name, str) and len(name.split()) <= 3:
                    data.append(name)
            else:
                data.append(value)

        data = [clear_text(i.strip()) for i in data]
        return ", ".join(sorted(set(data)))
    except Exception:
        # Deliberately best-effort, but unlike a bare `except:` this lets
        # KeyboardInterrupt/SystemExit stop a long-running apply.
        return ""

In [None]:
# Keep the raw characteristics in a "_source" column, then overwrite the
# original column with the condensed form from `standardization_characteristics`.
# NOTE: not idempotent — a second run would feed the condensed text back into the parser.
goods["Характеристики_source"] = goods["Характеристики"]
goods["Характеристики"] = goods["Характеристики"].progress_apply(standardization_characteristics)

In [None]:
# Persist the frame after the characteristics have been condensed.
goods.to_feather("goods_3.feather")

In [16]:
# NOTE(review): the execution counter jumps from 14 to 16 and the preprocessing
# cells above show no execution count, yet the frame displayed further down has
# the "_source" columns — presumably `goods` was reloaded from a preprocessed
# file in a cell that is no longer in the notebook. Confirm before Run All.

# One row per unique (name, code, characteristics) triple.
selected = goods.drop_duplicates(subset=["Название СТЕ", "Код КПГЗ", "Характеристики"])
# Per code: how many unique triples have a non-null name / characteristics.
count_df = goods[["Название СТЕ", "Код КПГЗ", "Характеристики"]].drop_duplicates().groupby("Код КПГЗ").agg("count").reset_index()
# Codes kept as target classes. With a threshold of 1 only codes whose names
# are all null are dropped; raise the threshold to exclude rare classes.
classes = count_df.loc[count_df["Название СТЕ"] >= 1, "Код КПГЗ"].to_list()

In [17]:
# Number of target classes; the same value sizes the classifier's output layer.
len(classes)

5307

In [18]:
class BertCLS(nn.Module):
    """Encoder with a single linear classification head on its pooled output.

    Args:
        model: encoder module; calling it with the batch unpacked as keyword
            arguments must return an object exposing `pooler_output`.
        n_classes: number of output logits.

    The head's input width is read from `model.config.hidden_size` when the
    encoder exposes it, and falls back to 768 otherwise (the value that was
    previously hard-coded), so existing checkpoints keep loading.
    NOTE(review): assumes `pooler_output` has `hidden_size` features, as it
    does for BERT-style encoders — confirm for other architectures.
    """

    def __init__(self, model, n_classes):
        super().__init__()
        self.model = model
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, batch):
        """Return class logits for a tokenized batch (a mapping of encoder keyword arguments)."""
        return self.fc(self.model(**batch).pooler_output)

In [19]:
# Wrap the pretrained encoder with a classification head that has one logit per class.
bert_cls = BertCLS(model, len(classes))

In [20]:
# Keep only rows whose code is one of the target classes and renumber the index from 0.
selected2 = selected[selected["Код КПГЗ"].isin(classes)].reset_index(drop=True)
# Replace missing characteristics with an empty string.
selected2["Характеристики"] = selected2["Характеристики"].replace(np.nan, "")

In [21]:
# Class distribution: number of rows per code, most frequent first.
selected2["Код КПГЗ"].value_counts()

01.13.13.01.01          9260
01.14.01.01             6906
01.11.03.01.01.99       4646
01.09.08.01.99          3885
01.11.02.02.99          3828
                        ... 
01.01.10.03.03             1
01.02.10.02.01.01          1
01.02.10.09.02.25.01       1
01.26.02.04.02.01          1
01.02.10.37.01.01.04       1
Name: Код КПГЗ, Length: 5307, dtype: int64

In [23]:
# Preview the filtered frame.
selected2

Unnamed: 0,ID СТЕ,Название СТЕ,Категория,Код КПГЗ,Характеристики,Название_СТЕ_source,Характеристики_source
0,1153097,мяч футбольный mikasa regateador5-g,Мячи футбольные,01.08.01.13.01,"5 усл. ед, mikasa, regateador5-r, белый, любит...",мяч футбольный MIKASA REGATEADOR5-G,"[{""Name"":""Модель"",""Id"":283795036,""Value"":""REGA..."
1,1153130,мяч волейбольный gala pro-line 10 fivb,Мячи волейбольные,01.08.01.14.03.01,"5, gala, pro-line 10 fivb, белый, профессионал...",мяч волейбольный Gala Pro-Line 10 FIVB,"[{""Name"":""Марка"",""Id"":284249992,""Value"":""Gala""..."
2,1153163,мяч волейбольный mikasa mva380k-obl,Мячи волейбольные,01.08.01.14.03.01,"5, mikasa, mva380k, оранжевый, синт, тренровочный",мяч волейбольный Mikasa MVA380K-OBL,"[{""Name"":""Марка"",""Id"":284249802,""Value"":""MIKAS..."
3,1153174,мяч волейбольный wilson super soft play,Мячи волейбольные,01.08.01.14.03.01,"5, super soft play, wilson, белый, любительски...",мяч волейбольный Wilson Super Soft Play,"[{""Name"":""Марка"",""Id"":284246959,""Value"":""WILSO..."
4,1159300,gutrend комплект расходный материал для fun 11...,"Расходные материалы, комплектующие для прочего...",01.20.10.99,"120 г, 170 мм, 220 мм, 50 мм, 6 шт, hepa фильт...",Gutrend комплект расходных материалов для FUN ...,"[{""Name"":""Тип"",""Id"":284280400,""Value"":""Расходн..."
...,...,...,...,...,...,...,...
354225,35228990,шина 23x10.50-12 107a8 starco as loader tl,"Шины для грузовых автомобилей и спецтехники, п...",01.09.08.10.05.02,"10 кг, 12 дюйм, 580 мм, 8 pr, starco, бескамер...",Шина 23x10.50-12 107A8 Starco AS LOADER TL,"[{""Name"":""Бескамерные"",""Id"":369460372,""Value"":..."
354226,35229086,кнопка включение рабочий освещение для минипог...,Запчасти к погрузчикам,01.09.08.03,"12 в, 650, 90 г, avant, включение рабочий осве...",Кнопка включения рабочего освещения для минипо...,"[{""Name"":""Артикул запчасти"",""Id"":369463362,""Va..."
354227,35231655,обои флизелиновый под покраска nc antivandal а...,Обои,01.11.03.11.08,"106 смотреть, 25 м, 350 г, 4 шт, 4010-16, anti...","Обои флизелиновые под покраску NC Antivandal, ...","[{""Name"":""Описание"",""Id"":369399594,""Value"":""со..."
354228,35231695,мусорный ведро kimberly-clark aquarius белый п...,Контейнеры и другие емкости для мусора пластма...,01.20.03.03.06,"2 шт, 29.00000 смотреть, 4,76 кг, 43.00000 смо...",Мусорное ведро Kimberly-Clark Aquarius белое п...,"[{""Name"":""Вид емкости"",""Id"":369380812,""Value"":..."


In [24]:
# Encode each code string as an integer class index, stored in a new `target` column.
# `le` keeps the mapping between indices and codes.
le = preprocessing.LabelEncoder()
selected2["target"] = le.fit_transform(selected2["Код КПГЗ"])

In [25]:
# Shuffle and hold out 1% of the rows for evaluation (fixed random_state).
# Each row of the resulting arrays is a [text, target] pair; only the cleaned
# product name is used as model input — the characteristics column is not passed on.
X_train, X_test = train_test_split(selected2[["Название СТЕ", "target"]].values,
                                   test_size=0.01,
                                   random_state=42,
                                   shuffle=True)

In [26]:
# Sizes of the held-out and training splits.
len(X_test), len(X_train)

(3543, 350687)

In [27]:
# X_train, X_test = X_train[:10000], X_test[:10000]

In [31]:
# Mean length, in characters, of the cleaned product names.
selected2["Название СТЕ"].str.len().mean()

47.34803940942326

In [32]:
class ClassificationDataset(Dataset):
    """Map-style dataset over an indexable collection of (text, target) pairs."""

    def __init__(self, data):
        """Store `data` as-is; every item must unpack into exactly two values."""
        super().__init__()
        self.data = data

    def __len__(self):
        """Number of samples in the underlying collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at `idx` as a `(text, target)` tuple."""
        (text, target) = self.data[idx]
        return (text, target)

def collate_fn(batch):
    """Turn a list of (text, target) samples into model-ready tensors.

    The texts are tokenized together by the notebook-level `tokenizer`
    (padded to the longest sequence in the batch, truncated at 200 tokens,
    returned as PyTorch tensors); the targets become one int64 tensor.

    Returns:
        (tokenizer output, targets tensor)
    """
    texts = [text for text, _ in batch]
    targets = [target for _, target in batch]

    encoded = tokenizer(texts, padding=True,
                        max_length=200, truncation=True,
                        return_tensors='pt')
    return encoded, torch.tensor(targets).long()

In [33]:
def get_loader(dataset, shuffle, batch_size):
    """Build a DataLoader over `dataset` that batches with `collate_fn`.

    Loading happens in the main process (`num_workers=0`) without pinned
    memory; `shuffle` and `batch_size` are passed through unchanged.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=False,
        num_workers=0,
        collate_fn=collate_fn,
    )
    return DataLoader(dataset, **loader_options)

In [34]:
# The same batch size is used for training and evaluation.
batch_size = 200
# Training batches are reshuffled every epoch.
train_dataset = ClassificationDataset(X_train)
train_loader = get_loader(train_dataset, shuffle=True, batch_size=batch_size)

# Evaluation keeps the original order of the held-out rows.
test_dataset = ClassificationDataset(X_test)
test_loader = get_loader(test_dataset, shuffle=False, batch_size=batch_size)

In [35]:
# NOTE: the classification head was created in an earlier cell, i.e. before
# this seed is set, so its initial weights are not controlled by it.
set_seed(42)

bert_cls = bert_cls.to(device)

num_epochs = 5
gradient_accumulation_steps = 1

# One scheduler step per batch. NOTE(review): this only matches the number of
# optimizer steps while gradient_accumulation_steps == 1 — revisit if it is raised.
total_steps = (len(train_loader) * num_epochs)
optimizer = optim.AdamW(bert_cls.parameters(), lr=2e-4)
# One-cycle schedule: the first 10% of the steps ramp up from max_lr / 25 to max_lr, the rest anneal back down.
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=2e-4,
                                          total_steps=total_steps,
                                          div_factor=25,
                                          pct_start=0.1)
loss_func = torch.nn.CrossEntropyLoss()

In [36]:
def train(model, loss_func, device, train_loader, optimizer, epoch, gradient_accumulation_steps, scheduler):
    """Run one training epoch with optional gradient accumulation.

    Args:
        model: module called as `model(batch)` and returning class logits.
        loss_func: callable `(logits, labels) -> scalar loss`.
        device: device the batches are moved to.
        train_loader: iterable of `(batch, labels)`; must support `len()`.
        optimizer: optimizer stepped once per accumulation window.
        epoch: current epoch number (accepted for the caller's convenience; unused here).
        gradient_accumulation_steps: number of batches whose gradients are
            summed before each optimizer/scheduler step.
        scheduler: learning-rate scheduler stepped together with the optimizer.

    Returns:
        The same `model` object, updated in place.

    Progress is mirrored to Telegram only when both TQDM_TELEGRAM_TOKEN and
    TQDM_TELEGRAM_CHAT_ID are set in the environment; otherwise a plain tqdm
    bar is used.
    """
    model.train()
    last_batch_idx = len(train_loader) - 1  # lets us flush a trailing, incomplete accumulation window

    # SECURITY: credentials are read from the environment, never from source.
    # A bot token used to be hard-coded here — treat it as leaked and revoke it.
    tg_token = os.environ.get("TQDM_TELEGRAM_TOKEN")
    tg_chat_id = os.environ.get("TQDM_TELEGRAM_CHAT_ID")
    if tg_token and tg_chat_id:
        pbar = tgdm_tg(train_loader, token=tg_token, chat_id=tg_chat_id)
    else:
        pbar = tqdm(train_loader)

    for batch_idx, (data, labels) in enumerate(pbar):
        data, labels = data.to(device), labels.to(device)

        # Wall-clock time of the forward call only.
        # NOTE(review): there is no CUDA synchronization, so on GPU this likely under-reports.
        t = time.time()
        logits = model(data)
        t2 = time.time() - t

        # Scale so the accumulated gradient equals the mean over the window.
        loss = loss_func(logits, labels) / gradient_accumulation_steps

        # Last line of `free -t -m` is the "Total:" row: total / used / free in MiB.
        with os.popen('free -t -m') as free_output:
            _, used_m, free_m = map(int, free_output.readlines()[-1].split()[1:])
        pbar.set_description(f"{print_gpu_utilization()} Used RAM: {used_m} Free RAM: {free_m} Loss Train: {float(loss) * gradient_accumulation_steps} Time: {t2}")

        loss.backward()

        # Step once a full window has been accumulated (batch_idx is 0-based,
        # hence the +1) or when the epoch ends on a partial window.
        if ((batch_idx + 1) % gradient_accumulation_steps == 0) or (batch_idx == last_batch_idx):
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)  # cap the gradient norm at 1 for stability
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        del data, labels, logits, loss
    return model

In [37]:
def test(model, loader, device):
    """Run the model over `loader` without gradients and collect predictions.

    Args:
        model: module called as `model(batch)` and returning class logits.
        loader: iterable of `(batch, labels)`.
        device: device the batches are moved to.

    Returns:
        `(y_true, pred)`: two lists of NumPy integer scalars, one entry per
        sample, holding the true labels and the arg-max predictions.

    The model is left in eval mode. Progress is mirrored to Telegram only when
    both TQDM_TELEGRAM_TOKEN and TQDM_TELEGRAM_CHAT_ID are set in the
    environment; otherwise a plain tqdm bar is used.
    """
    y_true = []
    pred = []
    model.eval()

    # SECURITY: credentials are read from the environment, never from source.
    # A bot token used to be hard-coded here — treat it as leaked and revoke it.
    tg_token = os.environ.get("TQDM_TELEGRAM_TOKEN")
    tg_chat_id = os.environ.get("TQDM_TELEGRAM_CHAT_ID")
    if tg_token and tg_chat_id:
        pbar = tgdm_tg(loader, token=tg_token, chat_id=tg_chat_id)
    else:
        pbar = tqdm(loader)

    with torch.no_grad():
        for data, labels in pbar:
            data = data.to(device)
            logits = model(data)
            pred.extend(logits.argmax(-1).detach().cpu().numpy())
            y_true.extend(labels.detach().cpu().numpy())

    return y_true, pred

In [38]:
def metrics(y_true, pred):
    """Print Matthews correlation, accuracy and weighted F1 for the predictions.

    Each score is computed and printed in turn as "<label>: <value>";
    nothing is returned.
    """
    scorers = (
        ("matthews_corrcoef:", matthews_corrcoef),
        ("accuracy_score:", accuracy_score),
        ("f1_score:", lambda truth, guess: f1_score(truth, guess, average='weighted')),
    )
    for label, scorer in scorers:
        print(label, scorer(y_true, pred))

In [39]:
# Mark every parameter (encoder and head) as trainable, i.e. full fine-tuning.
for x in bert_cls.parameters(): x.requires_grad = True

In [40]:
# Print the per-parameter table and the total number of trainable parameters.
count_parameters(bert_cls)

+----------------------------------------------------------+------------+
|                         Modules                          | Parameters |
+----------------------------------------------------------+------------+
|         model.embeddings.word_embeddings.weight          |  42303744  |
|       model.embeddings.position_embeddings.weight        |   393216   |
|      model.embeddings.token_type_embeddings.weight       |    1536    |
|            model.embeddings.LayerNorm.weight             |    768     |
|             model.embeddings.LayerNorm.bias              |    768     |
|    model.encoder.layer.0.attention.self.query.weight     |   589824   |
|     model.encoder.layer.0.attention.self.query.bias      |    768     |
|     model.encoder.layer.0.attention.self.key.weight      |   589824   |
|      model.encoder.layer.0.attention.self.key.bias       |    768     |
|    model.encoder.layer.0.attention.self.value.weight     |   589824   |
|     model.encoder.layer.0.attention.

In [41]:
# Create the checkpoint directory; exist_ok makes the cell safe to re-run
# (a shell `mkdir` reports an error when the directory already exists).
os.makedirs("models-all-classes-lem", exist_ok=True)

for epoch in range(1, num_epochs + 1):
    # Train for one epoch; `train` puts the model into training mode itself.
    clear_cache()
    train(bert_cls, loss_func, device, train_loader, optimizer,
          epoch, gradient_accumulation_steps, scheduler)

    # Save a checkpoint per epoch so any epoch can be restored later.
    PATH = f"./models-all-classes-lem/{bert_cls.__class__.__name__}_epoch_{epoch}.pth"
    torch.save(bert_cls.state_dict(), PATH)

    # Evaluate on the held-out split and print the metrics for this epoch.
    clear_cache()
    y_true, pred = test(bert_cls, test_loader, device)
    metrics(y_true, pred)

mkdir: cannot create directory ‘models-all-classes-lem’: File exists


  0%|          | 0/1754 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

matthews_corrcoef: 0.6014122430103136
accuracy_score: 0.6025966694891335
f1_score: 0.564212601147746


  0%|          | 0/1754 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

matthews_corrcoef: 0.675013354653951
accuracy_score: 0.6759808072255151
f1_score: 0.6514760004775421


  0%|          | 0/1754 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

matthews_corrcoef: 0.709304161379239
accuracy_score: 0.7101326559412927
f1_score: 0.6924522672086093


  0%|          | 0/1754 [00:00<?, ?it/s]

  self.message_id


  0%|          | 0/18 [00:00<?, ?it/s]

  message_id = self.message_id


matthews_corrcoef: 0.7356081092906664
accuracy_score: 0.7363815975162292
f1_score: 0.7237818990049587


  self.message_id


  0%|          | 0/1754 [00:00<?, ?it/s]

  message_id = self.message_id


  0%|          | 0/18 [00:00<?, ?it/s]

matthews_corrcoef: 0.7378638659375621
accuracy_score: 0.738639570985041
f1_score: 0.7249976577283143


In [None]:
# NOTE(review): the training loop writes checkpoints to ./models-all-classes-lem — confirm `models` is the directory meant here.
!ls models

In [None]:
# for epoch in range(1, num_epochs + 1):
#     PATH = f"./models-all-classes/{bert_cls.__class__.__name__}_epoch_{epoch}.pth"
#     bert_cls.load_state_dict(torch.load(PATH))
    
#     clear_cache()
#     y_true, pred = test(bert_cls, test_loader, device)
#     metrics(y_true, pred)

In [42]:
# Persist the filtered frame, including the `target` column, for later reuse.
selected2.to_feather("preprocess_all_columns-lem.feather")

In [None]:
# Display the final frame.
selected2