In [17]:
%load_ext autoreload
%autoreload 2

In [147]:
import ast
import logging
import os
import random
import numpy as np
import gc
import scipy.io as sio
import pandas as pd
import time
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from simpletransformers.language_modeling import (
    LanguageModelingModel,
    LanguageModelingArgs,
)

In [34]:
tqdm.pandas()

In [20]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [168]:
def clear_cache():
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    torch.backends.cudnn.deterministic=True

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    return f"GPU memory occupied: {info.used//1024**2} MB."

In [169]:
clear_cache()

In [170]:
def count_parameters(model):
    """
    model: torch.nn.Module
    
    Используется для подробного вывода
    параметров модели
    """
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            print(name, parameter.numel())
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")

In [175]:
df = pd.read_feather("data/goods.feather")

In [176]:
def standardization_characteristics(characteristics: str):
    if characteristics is None:
        return ""
    characteristics = ast.literal_eval(characteristics.lower())
    data = []
    for characteristic in characteristics:
        if "value" in characteristic:
            if len(characteristic["value"].split()) <= 3:
                if ("value" in characteristic) and ("unit" in characteristic):
                    data.append(f"{characteristic['value']} {characteristic['unit']}")
                elif characteristic["value"] in ["да", "нет"]:
                    if len(characteristic["name"].split()) <= 3:
                        data.append(characteristic['name'])
                else:
                    data.append(characteristic["value"])
    return ", ".join(list({i.strip() for i in data}))

In [131]:
df = df.drop(df.index[[126302, 170141]])

In [132]:
df['Характеристики'] = df['Характеристики'].progress_apply(lambda s: standardization_characteristics(s))

100%|██████████| 356573/356573 [00:37<00:00, 9535.18it/s] 


In [177]:
df["text"] = df["Название СТЕ"].str.lower().str.strip() 

In [178]:
df["cat_count"] = df.groupby('Категория')['Категория'].transform('count')

In [180]:
df = df[df["cat_count"] > 2]

In [181]:
df["Категория"].value_counts()

Расходные материалы и комплектующие для лазерных принтеров и МФУ                                  9266
Учебники печатные общеобразовательного назначения                                                 6912
Одежда специальная для защиты от общих производственных загрязнений и механических воздействий    3510
Предметы внутреннего интерьера                                                                    2456
Фурнитура для сантехнического оборудования                                                        2046
                                                                                                  ... 
Наборы для катетеризации центральных вен по "сельдингеру"                                            3
Натрий тетраборнокислый 10-водный (реактив)                                                          3
Техническое обслуживание и содержание объектов наружного освещения                                   3
Битум строительный                                                       

In [182]:
le = preprocessing.LabelEncoder()
df["Target"] = le.fit_transform(df["Категория"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [183]:
X_train, X_test = train_test_split(df[["text", "Target"]].values,
                                   test_size=0.1,
                                   random_state=42,
                                   stratify=df["Target"],
                                   shuffle=True)

In [184]:
X_train.shape

(318585, 2)

In [185]:
X_test.shape

(35399, 2)

In [186]:
pd.Series(X_train[:,0]).to_csv("data/train.txt", index=False)

In [187]:
pd.Series(X_test[:,0]).to_csv("data/test.txt", index=False)

In [None]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

model_args = LanguageModelingArgs()
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.num_train_epochs = 3
model_args.max_seq_length = 200
model_args.dataset_type = "simple"
model_args.batch_size = 32

train_file = "data/train.txt"
test_file = "data/test.txt"

model = LanguageModelingModel(
    "bert", "cointegrated/LaBSE-en-ru", args=model_args
)

# Train the model
model.train_model(train_file, eval_file=test_file)

# Evaluate the model
result = model.eval_model(test_file)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:simpletransformers.language_modeling.language_modeling_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/318589 [00:00<?, ?it/s]

  0%|          | 0/29980 [00:00<?, ?it/s]

INFO:simpletransformers.language_modeling.language_modeling_utils: Saving features into cached file cache_dir/bert_cached_lm_198_train.txt
INFO:simpletransformers.language_modeling.language_modeling_model: Training started


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/3748 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/3748 [00:00<?, ?it/s]

In [190]:
result

{'eval_loss': 2.3518733615212017, 'perplexity': tensor(10.5052)}

#### Test MLM

In [191]:
MODEL_NAME = "outputs"

In [199]:
model = LanguageModelingModel(
    "bert", MODEL_NAME
)

In [200]:
from transformers import pipeline
unmasker = pipeline('fill-mask', model=MODEL_NAME, framework="pt")
unmasker("Хочу купить волебольный [MASK]")

[{'score': 0.3463304340839386,
  'token': 22933,
  'token_str': 'мяч',
  'sequence': 'Хочу купить волебольный мяч'},
 {'score': 0.15694421529769897,
  'token': 12778,
  'token_str': 'набор',
  'sequence': 'Хочу купить волебольный набор'},
 {'score': 0.020530350506305695,
  'token': 15021,
  'token_str': 'комплект',
  'sequence': 'Хочу купить волебольный комплект'},
 {'score': 0.01725109852850437,
  'token': 7764,
  'token_str': 'знак',
  'sequence': 'Хочу купить волебольный знак'},
 {'score': 0.015098798088729382,
  'token': 42450,
  'token_str': 'теннис',
  'sequence': 'Хочу купить волебольный теннис'}]