## Л

In [1]:
!pwd

/home/jovyan/minerals/gerasimov/exp_fold_training/temp


In [2]:
!nvidia-smi

Sat Oct 22 01:40:07 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100 Graphics D...  On   | 00000000:B7:00.0 Off |                    0 |
| N/A   39C    P0    68W / 400W |      0MiB / 81252MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# !gdown --fuzzy --folder https://drive.google.com/drive/folders/1XwjWRA6fcqfiTuxHhr88KoSjKISyI4sp?usp=sharing

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader

import os
import cv2
from PIL import Image
from tqdm.notebook import tqdm
from tqdm.contrib.telegram import tqdm as tgdm_tg
from prettytable import PrettyTable
import random
import numpy as np
import gc
import scipy.io as sio
import pandas as pd
import time

from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef

from torchvision import datasets, transforms as T

from transformers import AutoTokenizer, AutoModel

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

In [5]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [6]:
def clear_cache():
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    torch.backends.cudnn.deterministic=True

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    return f"GPU memory occupied: {info.used//1024**2} MB."

In [7]:
def count_parameters(model):
    """
    model: torch.nn.Module
    
    Используется для подробного вывода
    параметров модели
    """
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            print(name, parameter.numel())
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
contracts = pd.read_feather("contracts.feather")
goods = pd.read_feather("goods.feather")

In [10]:
goods["Название СТЕ"] = goods["Название СТЕ"].str.lower().str.strip()

In [11]:
def standardization_characteristics(characteristics: str):
    try:
        characteristics = eval(characteristics.lower())
        data = []
        for characteristic in characteristics:
            if "value" in characteristic:
                if len(characteristic["value"].split()) <= 3:
                    if ("value" in characteristic) and ("unit" in characteristic):
                        data.append(f"{characteristic['value']} {characteristic['unit']}")
                    elif characteristic["value"] in ["да", "нет"]:
                        if len(characteristic["name"].split()) <= 3:
                            data.append(characteristic['name'])
                    else:
                        data.append(characteristic["value"])
        data = [i.strip() for i in data]
        data = sorted(set(data))
        return ", ".join(data)
    except:
        return ""

In [12]:
goods["Характеристики"] = goods["Характеристики"].apply(standardization_characteristics)

In [13]:
selected = goods[["Название СТЕ", "Код КПГЗ", "Характеристики"]].drop_duplicates()
count_df = goods[["Название СТЕ", "Код КПГЗ", "Характеристики"]].drop_duplicates().groupby("Код КПГЗ").agg("count").reset_index()
classes = count_df.loc[count_df["Название СТЕ"] >= 30, "Код КПГЗ"].to_list()

In [14]:
len(classes)

1420

In [15]:
class BertCLS(nn.Module):
    def __init__(self, model, n_classes):
        super(BertCLS, self).__init__()
        self.model = model
        self.fc = nn.Linear(768, n_classes)
    
    def forward(self, batch):
        return self.fc(self.model(**batch).pooler_output)

In [16]:
bert_cls = BertCLS(model, len(classes))

In [17]:
selected2 = selected[selected["Код КПГЗ"].isin(classes)].reset_index(drop=True)
selected2["Характеристики"] = selected2["Характеристики"].replace(np.nan, "")
selected2["text"] = selected2["Название СТЕ"].str.strip().str.lower() + " [SEP] " + selected2["Характеристики"].str.strip().str.lower()

In [18]:
selected2["Код КПГЗ"].value_counts()

01.13.13.01.01       9262
01.14.01.01          6906
01.11.03.01.01.99    4646
01.09.08.01.99       3886
01.11.02.02.99       3829
                     ... 
01.02.11.03.01         30
01.13.10.02            30
02.06.01.01.03         30
01.13.01.01.05         30
01.02.09.03.07         30
Name: Код КПГЗ, Length: 1420, dtype: int64

In [19]:
selected2

Unnamed: 0,Название СТЕ,Код КПГЗ,Характеристики,text
0,мяч футбольный mikasa regateador5-g,01.08.01.13.01,"5 усл. ед, mikasa, regateador5-r, белый, любит...",мяч футбольный mikasa regateador5-g [SEP] 5 ус...
1,мяч волейбольный gala pro-line 10 fivb,01.08.01.14.03.01,"5, gala, pro-line 10 fivb, белый, профессионал...",мяч волейбольный gala pro-line 10 fivb [SEP] 5...
2,мяч волейбольный mikasa mva380k-obl,01.08.01.14.03.01,"5, mikasa, mva380k, оранжевый, синт, тренровочный","мяч волейбольный mikasa mva380k-obl [SEP] 5, m..."
3,мяч волейбольный wilson super soft play,01.08.01.14.03.01,"5, super soft play, wilson, белый, любительски...",мяч волейбольный wilson super soft play [SEP] ...
4,gutrend комплект расходных материалов для fun ...,01.20.10.99,"120 г, 170 мм, 220 мм, 50 мм, 6 шт, hepa фильт...",gutrend комплект расходных материалов для fun ...
...,...,...,...,...
328610,шина 23x10.50-12 107a8 starco as loader tl,01.09.08.10.05.02,"10 кг, 12 дюйм, 580 мм, 8 pr, starco, бескамер...",шина 23x10.50-12 107a8 starco as loader tl [SE...
328611,кнопка включения рабочего освещения для минипо...,01.09.08.03,"12 в, 650, 90 г, avant, включение рабочего осв...",кнопка включения рабочего освещения для минипо...
328612,"обои флизелиновые под покраску nc antivandal, ...",01.11.03.11.08,"106 см, 25 м, 350 г, 4 шт, 4010-16, antivandal...","обои флизелиновые под покраску nc antivandal, ..."
328613,мусорное ведро kimberly-clark aquarius белое п...,01.20.03.03.06,"2 шт, 29.00000 см, 4,76 кг, 43.00000 см, 57.00...",мусорное ведро kimberly-clark aquarius белое п...


In [20]:
le = preprocessing.LabelEncoder()
selected2["target"] = le.fit_transform(selected2["Код КПГЗ"])

In [21]:
X_train, X_test = train_test_split(selected2[["text", "target"]].values,
                                   test_size=0.05,
                                   random_state=42,
                                   stratify=selected2["target"],
                                   shuffle=True)

In [22]:
# X_train, X_test = X_train[:10000], X_test[:10000]

In [23]:
selected2["text"].str.len().mean()

155.86858786117494

In [24]:
class ClassificationDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data

    def __getitem__(self, idx):
        text, target = self.data[idx]
        return text, target

    def __len__(self):
        return len(self.data)

def collate_fn(batch):
    model_input = []
    model_target = []
    for text, target in batch:
        model_input.append(text)
        model_target.append(target)

    tok = tokenizer(model_input, padding=True,
                    max_length=200, truncation=True,
                    return_tensors='pt')
    return tok, torch.tensor(model_target).long()

In [25]:
def get_loader(dataset, shuffle, batch_size):
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=False,
        num_workers=0,
        collate_fn=collate_fn
    )
    return loader

In [26]:
batch_size = 200
train_dataset = ClassificationDataset(X_train)
train_loader = get_loader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = ClassificationDataset(X_test)
test_loader = get_loader(test_dataset, shuffle=False, batch_size=batch_size)

In [27]:
set_seed(42)

bert_cls = bert_cls.to(device)

num_epochs = 5
gradient_accumulation_steps = 1

total_steps = (len(train_loader) * num_epochs)
optimizer = optim.AdamW(bert_cls.parameters(), lr=2e-4)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=2e-4,
                                          total_steps=total_steps,
                                          div_factor=25,
                                          pct_start=0.1)
loss_func = torch.nn.CrossEntropyLoss()

In [28]:
def train(model, loss_func, device, train_loader, optimizer, epoch, gradient_accumulation_steps, scheduler):
    model.train()
    train_loader_length = len(train_loader) - 1 # хотим знать когда эпоха закончится
    pbar = tgdm_tg(train_loader, token="5258964872:AAGPTJDWI2QBOqe_5jqlNqKr-fZf_xwhcEs", chat_id="661328720")
    
    for batch_idx, (data, labels) in enumerate(pbar):
        data, labels = data.to(device), labels.to(device)
        
        t = time.time()
        embeddings = model(data)
        t2 = time.time() - t
        
        loss = loss_func(embeddings, labels) / gradient_accumulation_steps
        
        tot_m, used_m, free_m = map(int, os.popen('free -t -m').readlines()[-1].split()[1:])
        pbar.set_description(f"{print_gpu_utilization()} Used RAM: {used_m} Free RAM: {free_m} Loss Train: {float(loss) * gradient_accumulation_steps} Time: {t2}")
        
        loss.backward()
        
        if (batch_idx % gradient_accumulation_steps == 0) or (batch_idx == train_loader_length):
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1) # ставим модель в рамки, помогает
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
        
        del data, labels, embeddings, loss
    return model

In [29]:
def test(model, loader, device):
    y_true = []
    pred = []
    model.eval()
    with torch.no_grad():
        pbar = tgdm_tg(loader, token="5258964872:AAGPTJDWI2QBOqe_5jqlNqKr-fZf_xwhcEs", chat_id="661328720")
    
        for batch_idx, (data, labels) in enumerate(pbar):
            data = data.to(device)
            embeddings = model(data)
            pred.extend(embeddings.argmax(-1).detach().cpu().numpy())
            y_true.extend(labels.detach().cpu().numpy())

    return y_true, pred

In [30]:
def metrics(y_true, pred):
    print("matthews_corrcoef:", matthews_corrcoef(y_true, pred))
    print("accuracy_score:", accuracy_score(y_true, pred))
    print("f1_score:", f1_score(y_true, pred, average='weighted'))

In [31]:
for x in bert_cls.parameters(): x.requires_grad = True

In [32]:
count_parameters(bert_cls)

+----------------------------------------------------------+------------+
|                         Modules                          | Parameters |
+----------------------------------------------------------+------------+
|         model.embeddings.word_embeddings.weight          |  42303744  |
|       model.embeddings.position_embeddings.weight        |   393216   |
|      model.embeddings.token_type_embeddings.weight       |    1536    |
|            model.embeddings.LayerNorm.weight             |    768     |
|             model.embeddings.LayerNorm.bias              |    768     |
|    model.encoder.layer.0.attention.self.query.weight     |   589824   |
|     model.encoder.layer.0.attention.self.query.bias      |    768     |
|     model.encoder.layer.0.attention.self.key.weight      |   589824   |
|      model.encoder.layer.0.attention.self.key.bias       |    768     |
|    model.encoder.layer.0.attention.self.value.weight     |   589824   |
|     model.encoder.layer.0.attention.

In [None]:
!mkdir models
for epoch in range(1, num_epochs + 1):
    bert_cls.train()
    
    clear_cache()
    train(bert_cls, loss_func, device, train_loader, optimizer,
          epoch, gradient_accumulation_steps, scheduler)
    PATH = f"./models/{bert_cls.__class__.__name__}_epoch_{epoch}.pth"
    torch.save(bert_cls.state_dict(), PATH)
    
    clear_cache()
    y_true, pred = test(bert_cls, test_loader, device)
    metrics(y_true, pred)

mkdir: cannot create directory ‘models’: File exists


  0%|          | 0/1561 [00:00<?, ?it/s]

  self.message_id


  0%|          | 0/83 [00:00<?, ?it/s]

  message_id = self.message_id


matthews_corrcoef: 0.8244674203353249
accuracy_score: 0.8250867263100238
f1_score: 0.8100397013298607


  self.message_id


  0%|          | 0/1561 [00:00<?, ?it/s]

  message_id = self.message_id


In [None]:
!ls models

In [36]:
!mkdir models
for epoch in range(1, num_epochs + 1):
    PATH = f"./models/{bert_cls.__class__.__name__}_epoch_{epoch}.pth"
    bert_cls.load_state_dict(torch.load(PATH))
    
    clear_cache()
    y_true, pred = test(bert_cls, test_loader, device)
    metrics(y_true, pred)

mkdir: cannot create directory ‘models’: File exists


  0%|          | 0/83 [00:00<?, ?it/s]

matthews_corrcoef: 0.8244674203353249
accuracy_score: 0.8250867263100238
f1_score: 0.8100397013298607


  0%|          | 0/83 [00:00<?, ?it/s]

matthews_corrcoef: 0.8728934869441873
accuracy_score: 0.8733491570811271
f1_score: 0.8675170804020862


  0%|          | 0/83 [00:00<?, ?it/s]

matthews_corrcoef: 0.8988493908080971
accuracy_score: 0.8992148986671535
f1_score: 0.8957247643818462


  0%|          | 0/83 [00:00<?, ?it/s]

matthews_corrcoef: 0.9139343657081539
accuracy_score: 0.9142474590712677
f1_score: 0.9119584445917452


  0%|          | 0/83 [00:00<?, ?it/s]

matthews_corrcoef: 0.9183322911798375
accuracy_score: 0.9186294199987828
f1_score: 0.9163812810262328
