In [1]:
%%capture
!pip install transformers

In [2]:
%%capture
!pip install gdown
!gdown --fuzzy https://drive.google.com/file/d/14LOIs_rHMtNHkb8uOaEfSoEjydFFOpgr/view?usp=sharing
!unzip -u 'data (1).zip'

In [3]:
!gdown --fuzzy https://drive.google.com/file/d/1GUFQrFoxknqfT9ux2a0AN5fcJrl1y_6y/view?usp=sharing
!unzip checkpoint-560360-epoch-10.zip

Downloading...
From: https://drive.google.com/uc?id=1GUFQrFoxknqfT9ux2a0AN5fcJrl1y_6y
To: /kaggle/working/checkpoint-560360-epoch-10.zip
100%|█████████████████████████████████████████| 325M/325M [00:01<00:00, 307MB/s]
Archive:  checkpoint-560360-epoch-10.zip
  inflating: checkpoint-560360-epoch-10/config.json  
  inflating: checkpoint-560360-epoch-10/pytorch_model.bin  
  inflating: checkpoint-560360-epoch-10/tokenizer_config.json  
  inflating: checkpoint-560360-epoch-10/special_tokens_map.json  
  inflating: checkpoint-560360-epoch-10/vocab.txt  
  inflating: checkpoint-560360-epoch-10/training_args.bin  
  inflating: checkpoint-560360-epoch-10/optimizer.pt  
  inflating: checkpoint-560360-epoch-10/scheduler.pt  
  inflating: checkpoint-560360-epoch-10/model_args.json  


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader

import os
import cv2
from PIL import Image
from tqdm.notebook import tqdm
from tqdm.contrib.telegram import tqdm as tgdm_tg
from prettytable import PrettyTable
import random
import numpy as np
import gc
import scipy.io as sio
import pandas as pd
import time

from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef

from torchvision import datasets, transforms as T

from transformers import AutoTokenizer, AutoModel

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

import umap

In [5]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [6]:
def clear_cache():
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    torch.backends.cudnn.deterministic=True

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    return f"GPU memory occupied: {info.used//1024**2} MB."

In [7]:
def count_parameters(model):
    """
    model: torch.nn.Module
    
    Используется для подробного вывода
    параметров модели
    """
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            print(name, parameter.numel())
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("checkpoint-560360-epoch-10")
model = AutoModel.from_pretrained("checkpoint-560360-epoch-10")

Some weights of the model checkpoint at checkpoint-560360-epoch-10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at checkpoint-560360-epoch-10 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.de

In [9]:
dfK = pd.read_csv('./data/Контракты 44ФЗ.csv', sep=';').drop_duplicates()
dfD = pd.read_csv('./data/Справочник пром производства.csv', sep=';').drop_duplicates()
dfP = pd.read_csv('./data/Ценовые предложения поставщиков.csv', sep=';').drop_duplicates()

In [10]:
dfK.is_contract = 1
dfP.is_contract = -1
dfD.is_contract = 0

In [11]:
dfK.shape, dfP.shape, dfD.shape

((916112, 9), (222713, 9), (105460, 9))

In [12]:
full_df = pd.concat([dfK, dfP, dfD], ignore_index=True)

In [13]:
selected = full_df[["product_name", "okpd2_code", "product_characteristics"]].drop_duplicates()
count_df = full_df[["product_name", "okpd2_code", "product_characteristics"]].drop_duplicates().groupby("okpd2_code").agg("count").reset_index()
classes = count_df.query("product_name >= 50")["okpd2_code"].to_list()

In [14]:
class BertCLS(nn.Module):
    def __init__(self, model, n_classes):
        super(BertCLS, self).__init__()
        self.model = model
        self.fc = nn.Linear(312, n_classes)
    
    def forward(self, batch):
        return self.fc(self.model(**batch).pooler_output)

In [15]:
bert_cls = BertCLS(model, len(classes))

In [16]:
selected2 = selected[selected["okpd2_code"].isin(classes)].reset_index(drop=True)
selected2["product_characteristics"] = selected2["product_characteristics"].replace(np.nan, "")
selected2["text"] = (selected2["product_name"].str.strip() + " [SEP] " + selected2["product_characteristics"].str.strip()).str.strip().str.lower()

In [17]:
selected2["okpd2_code"].value_counts()

29.32.30.390    142367
45.20.11.519     58569
58.11.11.000     38223
58.11.19.000     29122
32.50.50.190     19832
                 ...  
33.11.11.000        50
20.30.12.150        50
29.20.23.190        50
27.51.23.130        50
30.99.10.190        50
Name: okpd2_code, Length: 1463, dtype: int64

In [18]:
selected2

Unnamed: 0,product_name,okpd2_code,product_characteristics,text
0,Драм-юнит Cet CET8997,28.23.25.000,,драм-юнит cet cet8997 [sep]
1,Лук репчатый,01.13.43.110,Дополнительные показатели (характеристики): Ук...,лук репчатый [sep] дополнительные показатели (...
2,Карандаш механический (Страна происхождения : ...,32.99.12.130,,карандаш механический (страна происхождения : ...
3,Имплантаты для остеосинтеза варианты исполнени...,32.50.50.190,,имплантаты для остеосинтеза варианты исполнени...
4,Говядина замороженная для детского питания,10.11.31.130,,говядина замороженная для детского питания [sep]
...,...,...,...,...
964704,1ПТС-2.01.05.000 Ось,29.20.30.110,Описание: 1ПТС-2.01.05.000 Ось || Описание: 1П...,1птс-2.01.05.000 ось [sep] описание: 1птс-2.01...
964705,2ПТС-6.45.01.00 Ось,29.20.30.110,Вид подвески: Рессорная || Ошиновка: Односкатн...,2птс-6.45.01.00 ось [sep] вид подвески: рессор...
964706,2ПТС-8.27.01.000 Ось колёсная,29.20.30.110,Описание: 2ПТС-8.27.01.000 Ось колёсная || Опи...,2птс-8.27.01.000 ось колёсная [sep] описание: ...
964707,2ПТС-10.27.10.000 Ось,29.20.30.110,Ось: Ось || Ось в сборе с АБС: Ось в сборе || ...,2птс-10.27.10.000 ось [sep] ось: ось || ось в ...


In [19]:
le = preprocessing.LabelEncoder()
selected2["target"] = le.fit_transform(selected2["okpd2_code"])

In [20]:
X_train, X_test = train_test_split(selected2[["text", "target"]].values,
                                   test_size=0.05,
                                   random_state=42,
                                   stratify=selected2["target"],
                                   shuffle=True)

In [21]:
# X_train, X_test = X_train[:10000], X_test[:10000]

In [22]:
selected2["text"].str.len().mean()

232.15544480252595

In [23]:
class ClassificationDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data

    def __getitem__(self, idx):
        text, target = self.data[idx]
        return text, target

    def __len__(self):
        return len(self.data)

def collate_fn(batch):
    model_input = []
    model_target = []
    for text, target in batch:
        model_input.append(text)
        model_target.append(target)

    tok = tokenizer(model_input, padding=True,
                    max_length=300, truncation=True,
                    return_tensors='pt')
    return tok, torch.tensor(model_target).long()

In [24]:
def get_loader(dataset, shuffle, batch_size):
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=False,
        num_workers=0,
        collate_fn=collate_fn
    )
    return loader

In [25]:
batch_size = 256
train_dataset = ClassificationDataset(X_train)
train_loader = get_loader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = ClassificationDataset(X_test)
test_loader = get_loader(test_dataset, shuffle=False, batch_size=batch_size)

In [26]:
set_seed(42)

bert_cls = bert_cls.to(device)

num_epochs = 5
gradient_accumulation_steps = 1

total_steps = (len(train_loader) * num_epochs)
optimizer = optim.AdamW(bert_cls.parameters(), lr=2e-4)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=2e-4,
                                          total_steps=total_steps,
                                          div_factor=25,
                                          pct_start=0.1)
loss_func = torch.nn.CrossEntropyLoss()

In [27]:
def train(model, loss_func, device, train_loader, optimizer, epoch, gradient_accumulation_steps, scheduler):
    model.train()
    train_loader_length = len(train_loader) - 1 # хотим знать когда эпоха закончится
    pbar = tgdm_tg(train_loader, token="5258964872:AAGPTJDWI2QBOqe_5jqlNqKr-fZf_xwhcEs", chat_id="661328720")
    
    for batch_idx, (data, labels) in enumerate(pbar):
        data, labels = data.to(device), labels.to(device)
        
        t = time.time()
        embeddings = model(data)
        t2 = time.time() - t
        
        loss = loss_func(embeddings, labels) / gradient_accumulation_steps
        
        tot_m, used_m, free_m = map(int, os.popen('free -t -m').readlines()[-1].split()[1:])
        pbar.set_description(f"{print_gpu_utilization()} Used RAM: {used_m} Free RAM: {free_m} Loss Train: {float(loss) * gradient_accumulation_steps} Time: {t2}")
        
        loss.backward()
        
        if (batch_idx % gradient_accumulation_steps == 0) or (batch_idx == train_loader_length):
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1) # ставим модель в рамки, помогает
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
        
        del data, labels, embeddings, loss
    return model

In [28]:
def test(model, loader, device):
    y_true = []
    pred = []
    model.eval()
    with torch.no_grad():
        pbar = tgdm_tg(loader, token="5258964872:AAGPTJDWI2QBOqe_5jqlNqKr-fZf_xwhcEs", chat_id="661328720")
    
        for batch_idx, (data, labels) in enumerate(pbar):
            data = data.to(device)
            embeddings = model(data)
            pred.extend(embeddings.argmax(-1).detach().cpu().numpy())
            y_true.extend(labels.detach().cpu().numpy())

    return y_true, pred

In [29]:
def metrics(y_true, pred):
    print("matthews_corrcoef:", matthews_corrcoef(y_true, pred))
    print("accuracy_score:", accuracy_score(y_true, pred))
    print("f1_score:", f1_score(y_true, pred, average='weighted'))

In [30]:
for x in bert_cls.parameters(): x.requires_grad = True

In [31]:
count_parameters(bert_cls)

+---------------------------------------------------------+------------+
|                         Modules                         | Parameters |
+---------------------------------------------------------+------------+
|         model.embeddings.word_embeddings.weight         |  26154336  |
|       model.embeddings.position_embeddings.weight       |   638976   |
|      model.embeddings.token_type_embeddings.weight      |    624     |
|            model.embeddings.LayerNorm.weight            |    312     |
|             model.embeddings.LayerNorm.bias             |    312     |
|    model.encoder.layer.0.attention.self.query.weight    |   97344    |
|     model.encoder.layer.0.attention.self.query.bias     |    312     |
|     model.encoder.layer.0.attention.self.key.weight     |   97344    |
|      model.encoder.layer.0.attention.self.key.bias      |    312     |
|    model.encoder.layer.0.attention.self.value.weight    |   97344    |
|     model.encoder.layer.0.attention.self.value.bi

In [32]:
!mkdir models
for epoch in range(1, num_epochs + 1):
    bert_cls.train()
    
    clear_cache()
    train(bert_cls, loss_func, device, train_loader, optimizer,
          epoch, gradient_accumulation_steps, scheduler)
    PATH = f"./models/{bert_cls.__class__.__name__}_epoch_{epoch}.pth"
    torch.save(bert_cls.state_dict(), PATH)
    
    clear_cache()
    y_true, pred = test(bert_cls, test_loader, device)
    metrics(y_true, pred)

  0%|          | 0/3580 [00:00<?, ?it/s]

  self.message_id


  0%|          | 0/189 [00:00<?, ?it/s]

  message_id = self.message_id


matthews_corrcoef: 0.7388855110591537
accuracy_score: 0.7476988141636952
f1_score: 0.7156119025289907


  self.message_id


  0%|          | 0/3580 [00:00<?, ?it/s]

  message_id = self.message_id


  0%|          | 0/189 [00:00<?, ?it/s]

matthews_corrcoef: 0.7746143652025232
accuracy_score: 0.7822166017082677
f1_score: 0.761486496212772


  self.message_id


  0%|          | 0/3580 [00:00<?, ?it/s]

  message_id = self.message_id


  0%|          | 0/189 [00:00<?, ?it/s]

matthews_corrcoef: 0.7844099782915173
accuracy_score: 0.7916908533045858
f1_score: 0.7754321311047353


  self.message_id


  0%|          | 0/3580 [00:00<?, ?it/s]

  message_id = self.message_id


  0%|          | 0/189 [00:00<?, ?it/s]

matthews_corrcoef: 0.7889267813798816
accuracy_score: 0.7960237167260967
f1_score: 0.7810590110794327


  0%|          | 0/3580 [00:00<?, ?it/s]

  self.message_id


  0%|          | 0/189 [00:00<?, ?it/s]

  message_id = self.message_id


matthews_corrcoef: 0.7884984420277
accuracy_score: 0.7956090886474833
f1_score: 0.7817529401332467


In [33]:
!ls models

BertCLS_epoch_1.pth  BertCLS_epoch_3.pth  BertCLS_epoch_5.pth
BertCLS_epoch_2.pth  BertCLS_epoch_4.pth
