In [1]:
%load_ext autoreload
%autoreload 2

In [8]:
import logging
import os
import random
import numpy as np
import gc
import scipy.io as sio
import pandas as pd
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from simpletransformers.language_modeling import (
    LanguageModelingModel,
    LanguageModelingArgs,
)

In [3]:
!nvidia-smi

Sat Oct  1 12:53:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.172.01   Driver Version: 450.172.01   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-PCIE-40GB      On   | 00000000:CA:00.0 Off |                    0 |
| N/A   28C    P0    40W / 250W |      0MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [10]:
def clear_cache():
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    torch.backends.cudnn.deterministic=True

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    return f"GPU memory occupied: {info.used//1024**2} MB."

In [11]:
def count_parameters(model):
    """
    model: torch.nn.Module
    
    Используется для подробного вывода
    параметров модели
    """
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            print(name, parameter.numel())
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
device

device(type='cuda')

In [14]:
dfK = pd.read_csv('./data/Контракты 44ФЗ.csv', sep=';').drop_duplicates()
dfD = pd.read_csv('./data/Справочник пром производства.csv', sep=';').drop_duplicates()
dfP = pd.read_csv('./data/Ценовые предложения поставщиков.csv', sep=';').drop_duplicates()

In [15]:
dfK.is_contract = 1
dfP.is_contract = -1
dfD.is_contract = 0

In [16]:
dfK.shape, dfP.shape, dfD.shape

((916112, 9), (222713, 9), (105460, 9))

In [17]:
full_df = pd.concat([dfK, dfP, dfD], ignore_index=True)

In [18]:
selected = full_df[["product_name", "okpd2_code", "product_characteristics"]].drop_duplicates()
count_df = full_df[["product_name", "okpd2_code", "product_characteristics"]].drop_duplicates().groupby("okpd2_code").agg("count").reset_index()
classes = count_df.query("product_name >= 100")["okpd2_code"].to_list()

In [19]:
class BertCLS(nn.Module):
    def __init__(self, model, n_classes):
        super(BertCLS, self).__init__()
        self.model = model
        self.fc = nn.Linear(312, n_classes)
    
    def forward(self, batch):
        return self.fc(self.model(**batch).pooler_output)

In [20]:
bert_cls = BertCLS(model, len(classes))

In [21]:
selected2 = selected[selected["okpd2_code"].isin(classes)].reset_index(drop=True)
selected2["product_characteristics"] = selected2["product_characteristics"].replace(np.nan, "")
selected2["text"] = (selected2["product_name"].str.strip() + " [SEP] " + selected2["product_characteristics"].str.strip()).str.strip()

In [22]:
selected2["okpd2_code"].value_counts()

29.32.30.390    142367
45.20.11.519     58569
58.11.11.000     38223
58.11.19.000     29122
32.50.50.190     19832
                 ...  
32.50.50.181       101
10.20.25.111       100
29.32.30.211       100
33.13.12.000       100
28.11.41.000       100
Name: okpd2_code, Length: 1005, dtype: int64

In [23]:
le = preprocessing.LabelEncoder()
selected2["target"] = le.fit_transform(selected2["okpd2_code"])

In [24]:
X_train, X_test = train_test_split(selected2[["text", "target"]].values,
                                   test_size=0.1,
                                   random_state=42,
                                   stratify=selected2["target"],
                                   shuffle=True)

In [33]:
X_train.shape

(839439, 2)

In [34]:
X_test.shape

(93271, 2)

In [40]:
pd.Series(X_train[:,0]).to_csv("data/train.txt", index=False)

In [41]:
pd.Series(X_test[:,0]).to_csv("data/test.txt", index=False)

In [None]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

model_args = LanguageModelingArgs()
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.num_train_epochs = 10
model_args.dataset_type = "simple"

train_file = "data/train.txt"
test_file = "data/test.txt"

model = LanguageModelingModel(
    "bert", "cointegrated/rubert-tiny2", args=model_args
)

# Train the model
model.train_model(train_file, eval_file=test_file)

# Evaluate the model
result = model.eval_model(test_file)

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:simpletransformers.language_modeling.language_modeling_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/839440 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3135 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2395 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2397 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (5475 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3362 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

  0%|          | 0/448285 [00:00<?, ?it/s]

INFO:simpletransformers.language_modeling.language_modeling_utils: Saving features into cached file cache_dir/bert_cached_lm_126_train.txt
INFO:simpletransformers.language_modeling.language_modeling_model: Training started


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/56036 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [None]:
!nvidia-smi

In [45]:
result

{'eval_loss': 1.0903824446277521, 'perplexity': tensor(2.9754)}