In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader

import os
import cv2
from PIL import Image
from tqdm.notebook import tqdm
from tqdm.contrib.telegram import tqdm as tgdm_tg
from prettytable import PrettyTable
import random
import numpy as np
import gc
import scipy.io as sio
import pandas as pd
import time

from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef

from torchvision import datasets, transforms as T

from transformers import AutoTokenizer, AutoModel

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

In [2]:
!nvidia-smi

Sun Oct  2 05:35:07 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100 Graphics D...  On   | 00000000:0F:00.0 Off |                    0 |
| N/A   28C    P0    72W / 400W |   5917MiB / 81252MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  A100 Graphics D...  On   | 00000000:87:00.0 Off |                    0 |
| N/A   32C    P0    73W / 400W |   6628MiB / 81252MiB |      0%      Default |
|       

In [3]:
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
dfK = pd.read_csv('./data/Контракты 44ФЗ.csv', sep=';').drop_duplicates()
dfD = pd.read_csv('./data/Справочник пром производства.csv', sep=';').drop_duplicates()
dfP = pd.read_csv('./data/Ценовые предложения поставщиков.csv', sep=';').drop_duplicates()

In [5]:
dfK.is_contract = 1
dfP.is_contract = -1
dfD.is_contract = 0

In [6]:
full_df = pd.concat([dfK, dfP, dfD], ignore_index=True)

In [7]:
selected = full_df[["product_name", "okpd2_code", "product_characteristics"]].drop_duplicates()
count_df = full_df[["product_name", "okpd2_code", "product_characteristics"]].drop_duplicates().groupby("okpd2_code").agg("count").reset_index()
classes = count_df.query("product_name >= 50")["okpd2_code"].to_list()
del selected, count_df

In [8]:
full_df["okpd2_value"] = full_df["okpd2_code"].apply(lambda x: x.split(".")[0])

In [9]:
temp = full_df.drop_duplicates(subset=["product_name", "okpd2_code", "product_characteristics", "okpd2_value"])
temp = temp.reset_index(drop=True)

In [10]:
temp["product_characteristics"] = temp["product_characteristics"].replace(np.nan, "")
temp["text"] = (temp["product_name"].str.strip() + " [SEP] " + temp["product_characteristics"].str.strip()).str.strip().str.strip().str.lower()
temp["country_code"] = temp["country_code"].replace(np.nan, -100)

In [11]:
class BertCLS(nn.Module):
    def __init__(self, model, n_classes):
        super(BertCLS, self).__init__()
        self.model = model
        self.fc = nn.Linear(312, n_classes)

    def forward(self, batch):
        return self.fc(self.model(**batch).pooler_output)

In [12]:
bert_cls = BertCLS(model, len(classes))

In [13]:
len(classes)

1463

In [14]:
bert_cls.load_state_dict(torch.load("./BertCLS_epoch_2_1500_lower.pth", map_location=torch.device("cpu")))

<All keys matched successfully>

In [15]:
class ClassificationDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data

    def __getitem__(self, idx):
        text = self.data[idx]
        return text

    def __len__(self):
        return len(self.data)

def collate_fn(batch):
    model_input = []
    for text in batch:
        model_input.append(text)

    tok = tokenizer(model_input, padding=True,
                    max_length=300, truncation=True,
                    return_tensors='pt')
    return tok

In [16]:
def get_loader(dataset, shuffle, batch_size):
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=False,
        num_workers=0,
        collate_fn=collate_fn
    )
    return loader

In [17]:
batch_size = 400
train_dataset = ClassificationDataset(temp["text"].values)
train_loader = get_loader(train_dataset, shuffle=False, batch_size=batch_size)

In [18]:
bert_cls = bert_cls.to(device)

In [19]:
def test(model, loader, device):
    pred = []
    model.eval()
    with torch.no_grad():
        pbar = tgdm_tg(loader, token="5258964872:AAGPTJDWI2QBOqe_5jqlNqKr-fZf_xwhcEs", chat_id="661328720")
        for batch_idx, data in enumerate(pbar):
            data = data.to(device)
            embeddings = model.model(**data).pooler_output.detach().cpu().numpy().astype(np.float16)
            pred.extend(embeddings)
    return pred

In [20]:
logits = test(bert_cls, train_loader, device)

  0%|          | 0/2521 [00:00<?, ?it/s]

In [21]:
logits[0].shape

(312,)

In [22]:
len(logits)

1008321

In [23]:
df = pd.DataFrame(logits)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008321 entries, 0 to 1008320
Columns: 312 entries, 0 to 311
dtypes: float16(312)
memory usage: 600.0 MB


In [25]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,302,303,304,305,306,307,308,309,310,311
0,0.863770,1.000000,-1.000000,-0.999512,-0.999512,-0.949707,0.998047,-0.999512,-0.853516,1.000000,...,-0.339844,-0.996582,-0.999512,0.999512,-0.987305,1.000000,0.802734,0.997070,0.997070,0.947754
1,0.052368,-1.000000,0.434814,-0.998047,0.983887,0.955566,0.981934,0.747559,-0.188477,0.644043,...,0.998535,-0.859863,-0.969727,0.996094,-0.619629,0.961426,-0.988770,0.632812,0.984375,-0.997070
2,0.999512,0.999512,-0.999023,-0.981445,-0.144897,-0.363281,-0.887695,-0.557617,0.886230,0.998535,...,0.995605,-1.000000,-0.997070,0.999023,-0.994629,-0.998535,0.976562,0.999512,-0.989258,-0.865723
3,0.266357,0.999023,0.980469,-0.959961,-0.708496,-0.972656,0.999512,-0.970215,-0.596191,-0.123657,...,-0.999512,-0.434570,-0.095520,0.992676,0.726074,1.000000,-0.989258,-0.473633,0.999512,-0.868652
4,0.948242,-1.000000,-0.439697,0.971191,-0.792969,-0.812500,-0.991699,0.999512,1.000000,0.872070,...,0.662598,0.998047,-0.999023,0.996582,0.996582,0.995117,-0.998535,-0.997559,-0.899902,0.996094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008316,-0.990234,0.969727,-0.952637,0.762695,-1.000000,-0.969727,-0.906738,-0.973145,0.945312,0.486084,...,-0.947754,-0.934570,0.986816,-0.914062,0.999023,-0.999512,0.943848,0.999512,-0.955566,1.000000
1008317,-0.991699,0.870605,-0.982910,0.547363,-1.000000,-0.972656,-0.789551,-0.933594,0.883301,0.668457,...,-0.207031,-0.779785,0.983887,-0.901367,0.997559,-0.999512,0.793457,0.999023,-0.954102,1.000000
1008318,-0.993652,0.935059,-0.970703,0.507324,-1.000000,-0.936523,-0.557617,-0.922363,0.880371,0.833496,...,-0.979004,-0.590820,0.966797,-0.908691,0.998047,-0.998047,0.833008,1.000000,-0.937988,1.000000
1008319,-0.969238,0.950195,-0.937012,0.498047,-1.000000,-0.979004,-0.649902,-0.986328,0.773926,0.842773,...,-0.806641,-0.842773,0.975098,-0.953613,0.999023,-0.999023,0.951660,0.999023,-0.929688,1.000000


In [26]:
temp

Unnamed: 0,product_name,price,product_vat_rate,product_msr,product_characteristics,okpd2_code,okpd2_name,inn,country_code,okpd2_value,text
0,Драм-юнит Cet CET8997,7605.00,0%,Штука,,28.23.25.000,Части и принадлежности прочих офисных машин,6a2325da490021b46bb00834ab4a560a,156,28,драм-юнит cet cet8997 [sep]
1,Лук репчатый,48.75,Без НДС,Килограмм,Дополнительные показатели (характеристики): Ук...,01.13.43.110,Лук репчатый,c8f328b188e78a1673db8cc2db1b6828,643,01,лук репчатый [sep] дополнительные показатели (...
2,Карандаш механический (Страна происхождения : ...,8.00,20%,Штука,,32.99.12.130,Карандаши механические,9d785973706feeff180ee96a247135e6,156,32,карандаш механический (страна происхождения : ...
3,Имплантаты для остеосинтеза варианты исполнени...,5200.00,Без НДС,Штука,,32.50.50.190,"Изделия медицинские, в том числе хирургические...",5a35794bbd7c29420b25479364c26c7b,616,32,имплантаты для остеосинтеза варианты исполнени...
4,Говядина замороженная для детского питания,490.59,Без НДС,Килограмм,,10.11.31.130,Говядина и телятина замороженные для детского ...,91c23daf46d288344828eac627074bf3,643,10,говядина замороженная для детского питания [sep]
...,...,...,...,...,...,...,...,...,...,...,...
1008316,1ПТС-2.01.05.000 Ось,,Без НДС,Штука,Описание: 1ПТС-2.01.05.000 Ось || Описание: 1П...,29.20.30.110,Комплектующие (запасные части) прицепов и полу...,f1bdf5ed1d7ad7ede4e3809bd35644b0,-100,29,1птс-2.01.05.000 ось [sep] описание: 1птс-2.01...
1008317,2ПТС-6.45.01.00 Ось,,Без НДС,Штука,Вид подвески: Рессорная || Ошиновка: Односкатн...,29.20.30.110,Комплектующие (запасные части) прицепов и полу...,f1bdf5ed1d7ad7ede4e3809bd35644b0,-100,29,2птс-6.45.01.00 ось [sep] вид подвески: рессор...
1008318,2ПТС-8.27.01.000 Ось колёсная,,Без НДС,Штука,Описание: 2ПТС-8.27.01.000 Ось колёсная || Опи...,29.20.30.110,Комплектующие (запасные части) прицепов и полу...,f1bdf5ed1d7ad7ede4e3809bd35644b0,-100,29,2птс-8.27.01.000 ось колёсная [sep] описание: ...
1008319,2ПТС-10.27.10.000 Ось,,Без НДС,Штука,Ось: Ось || Ось в сборе с АБС: Ось в сборе || ...,29.20.30.110,Комплектующие (запасные части) прицепов и полу...,f1bdf5ed1d7ad7ede4e3809bd35644b0,-100,29,2птс-10.27.10.000 ось [sep] ось: ось || ось в ...


In [27]:
df.columns = [str(i) for i in df.columns]

In [28]:
temp["country_code"] = temp["country_code"].astype(str)

In [29]:
final_df = pd.concat([df, temp], axis=1)

In [30]:
final_df["country_code"].dtype

dtype('O')

In [31]:
final_df.to_feather("bert-tiny-1500-final-df-lower.feather")

In [None]:
df.to_feather("bert-tiny-1500-embedings.feather")

In [None]:
temp.to_feather("bert-tiny-1500-info.feather")

In [None]:
final_df

In [None]:
import faiss

In [None]:
embeddings = df.values.astype(np.float32)

In [None]:
embeddings.shape

In [None]:
embeddings = np.ascontiguousarray(embeddings)

In [None]:
d = 312 # длина эмбеддинга
index = faiss.IndexFlatIP(d)
index.add(embeddings) # сами эмбеддинги, нампай массив shape = (n_samples, d)

In [None]:
example_txt = "Ноутбук HP"
example = tokenizer(example_txt, padding=True,
                    max_length=300, truncation=True,
                    return_tensors='pt')

In [None]:
with torch.no_grad():
    bert_cls.eval()
    xq = bert_cls.model(**example.to(device)).pooler_output.detach().cpu().numpy()

In [None]:
k = 50
D, I = index.search(xq, k) # xq shape = (1, d)

In [None]:
from pyjarowinkler import distance

def string_dist(str1, str2):
    return distance.get_jaro_distance(str1, str2,
                                      winkler=False,
                                      winkler_ajustment=False,
                                      scaling=0.2)

In [None]:
abc = temp.reset_index(drop=True).iloc[I[0]]

In [None]:
string_dist(example_txt, 'asda')

In [None]:
abc["dist"] = abc["product_name"].apply(lambda x: string_dist(example_txt, x))

In [None]:
abc

In [None]:
abc.sort_values(by="dist", ascending=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader

import os
import cv2
from PIL import Image
from tqdm.notebook import tqdm
from tqdm.contrib.telegram import tqdm as tgdm_tg
from prettytable import PrettyTable
import random
import numpy as np
import gc
import scipy.io as sio
import pandas as pd
import time

from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef

from torchvision import datasets, transforms as T

from transformers import AutoTokenizer, AutoModel

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

from pyjarowinkler import distance
import faiss

In [None]:
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

In [None]:
class BertCLS(nn.Module):
    def __init__(self, model, n_classes):
        super(BertCLS, self).__init__()
        self.model = model
        self.fc = nn.Linear(312, n_classes)
    
    def forward(self, batch):
        return self.fc(self.model(**batch).pooler_output)

In [None]:
bert_cls = BertCLS(model, 1005)

In [None]:
bert_cls.load_state_dict(torch.load("./BertCLS_epoch_3_1000.pth", map_location=torch.device("cpu")))

In [None]:
bert_cls = bert_cls.to(device)

In [None]:
def get_embeddings(bert_cls, tokenizer, text):
    tokens = tokenizer(text, padding=True,
                       max_length=300, truncation=True,
                       return_tensors='pt')
    tokens = tokens.to(bert_cls.model.device)
    return bert_cls.model(**tokens).pooler_output.detach().cpu().numpy()

In [None]:
def string_dist(str1, str2):
    return distance.get_jaro_distance(str1, str2,
                                      winkler=True,
                                      winkler_ajustment=True,
                                      scaling=0.2)

In [None]:
final_df = pd.read_feather("bert-tiny-1000-final-df.feather")
search_df = final_df[final_df.columns[312:]].reset_index(drop=True)

In [None]:
embeddings = final_df[final_df.columns[:312]].values.astype(np.float32)
embeddings = np.ascontiguousarray(embeddings)
faiss.normalize_L2(embeddings)

In [None]:
d = 312 # длина эмбеддинга
index = faiss.IndexFlatIP(d)
index.add(embeddings) # сами эмбеддинги, нампай массив shape = (n_samples, d)

In [None]:
text_query = "стул деревянный"
xq = get_embeddings(bert_cls, tokenizer, text_query)
faiss.normalize_L2(xq)

In [None]:
k = 100
D, I = index.search(xq, k) # xq shape = (1, d)

In [None]:
faiss_results = search_df.iloc[I[0]].reset_index(drop=True)
faiss_results["string_dist"] = faiss_results["product_name"].apply(lambda x: string_dist(x, text_query))
faiss_results

In [None]:
faiss_results.sort_values(by="string_dist", ascending=False).head(10)

In [None]:
faiss_results.sort_values(by="string_dist", ascending=False).tail(10)