In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

def embed_bert_cls(text, model, tokenizer):
    """Return a normalized embedding taken from the first token position.

    Args:
        text: Input passed straight to ``tokenizer`` (padding and truncation
            enabled, PyTorch tensors requested).
        model: Callable model exposing ``.device`` and returning an object
            with ``last_hidden_state``.
        tokenizer: Callable returning a mapping of tensors.

    Returns:
        1-D NumPy array: row 0 of the normalized first-position hidden states.
        Only the first row is returned, so if ``text`` encodes to several
        rows the remaining ones are discarded.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Move every tokenizer output onto the device the model lives on.
    on_device = {key: tensor.to(model.device) for key, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**on_device)
    # Position 0 of the sequence axis (presumably the [CLS] token, per the function name).
    first_token = output.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(first_token)
    return unit_vectors[0].cpu().numpy()


Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
df = pd.read_json('../data/agora_hack_products.json')

In [None]:
df

Unnamed: 0,product_id,name,props,is_reference,reference_id
0,0007302f2fe1d54d,Классическая сплит-система ROYAL CLIMA PANDORA...,"[Класс энергоэффективности A, Мощность конди...",False,f497219eb0077f84
1,000740b6c1cc763e,Смартфон Xiaomi Redmi Note 10S NFC 6/128 ГБ RU...,"[Экран 6.43"" (2400x1080) AMOLED 60 Гц, 4 камер...",True,
2,0039af5efceac4ab,Холодильник Бирюса 118,[Мощность замораживания 4 кг/сутки],False,28085e941cde1639
3,004f2158acb8165c,ASUS TUF-GTX1660S-O6G-GAMING Видеокарта,"[Объем видеопамяти 6144 МБ, Частота памяти 1...",False,9afe55bb4bf1e8a8
4,005cddb29e1677ec,"Кофемашина Saeco Lirika One Touch Cappuccino, ...","[Приготовление капучино автоматическое, Матер...",False,1f21918ceb5d345c
...,...,...,...,...,...
3246,ff5da4be6fa60c4b,"15.6"" Ноутбук Lenovo IdeaPad Gaming 315IHU6 19...","[Видеокарта NVIDIA GeForce RTX 3050 4 ГБ, Врем...",True,
3247,ff66532467a02652,Моющий робот-пылесос для дома и квартиры Xiaom...,"[Объем контейнера для воды 0.27 л, ШхГхВ 35.30...",False,d4ebc4a26700d5e0
3248,ff75ade409f4da7e,"Huawei Умный браслет Band 7, графитово-черный","[Модификация GPS, Материал корпуса пластик]",False,177ccb3b84125efa
3249,ffb770de0c2feafc,Видеокарта Gigabyte GV-R675XTGAMING OC-12GD 12...,[Комплектация Retail],False,a07d5538ebec8e36


In [None]:
def merge_name_and_properties(df):
    """Return a copy of ``df`` with an extra ``data_string`` column.

    Each ``data_string`` is "Название: " + name + '; Характеристики товара:'
    + the props joined with ', '. Only the joined props part has the literal
    two-character sequence backslash-t replaced by a space and is lower-cased;
    the name keeps its original case. The input frame is not modified.

    Args:
        df: DataFrame with a ``name`` column (strings) and a ``props`` column
            (iterables of strings).

    Returns:
        A new DataFrame with the added ``data_string`` column.
    """
    result = df.copy()
    result['data_string'] = [
        "Название: " + str(name + '; Характеристики товара:' + ', '.join(props).replace("\\t", " ").lower())
        for name, props in zip(df['name'], df['props'])
    ]
    return result



In [None]:
df_new = merge_name_and_properties(df)

In [None]:
df_new.iloc[:,-1:]

Unnamed: 0,data_string
0,Название: Классическая сплит-система ROYAL CLI...
1,Название: Смартфон Xiaomi Redmi Note 10S NFC 6...
2,Название: Холодильник Бирюса 118; Характеристи...
3,Название: ASUS TUF-GTX1660S-O6G-GAMING Видеока...
4,Название: Кофемашина Saeco Lirika One Touch Ca...
...,...
3246,"Название: 15.6"" Ноутбук Lenovo IdeaPad Gaming ..."
3247,Название: Моющий робот-пылесос для дома и квар...
3248,"Название: Huawei Умный браслет Band 7, графито..."
3249,Название: Видеокарта Gigabyte GV-R675XTGAMING ...


In [None]:
# Rows flagged as reference products; copied so later edits do not touch `df`.
references = df.loc[df['is_reference']].copy()

In [None]:
# Non-reference rows are the queries that must be matched to a reference.
unref = df.loc[~df['is_reference']].copy()
# Same rows without the flag and the product's own id.
unref_nonunique = unref.drop(columns=['is_reference', 'product_id'])

In [None]:
# Embed every query row: one vector for the name and one for the lower-cased
# string form of the props list.
embs = []
# iterrows() is a generator without a length, so the total is passed explicitly;
# without it the progress bar cannot show how far the loop has advanced.
for i, row in tqdm(unref_nonunique.iterrows(), total=len(unref_nonunique)):
    name_vector = embed_bert_cls(row['name'], model, tokenizer)
    props_vector = embed_bert_cls(str(row['props']).lower(), model, tokenizer)
    embs.append([name_vector, props_vector])
# Flatten each [name_vector, props_vector] pair into a single row.
embeddings_query = np.array(embs).reshape(len(embs), -1)


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [None]:
# Embed every reference row: one vector for the name and one for the
# lower-cased string form of the props list.
embeddings_ref = []
# iterrows() is a generator without a length, so the total is passed explicitly;
# without it the progress bar cannot show how far the loop has advanced.
for i, row in tqdm(references.iterrows(), total=len(references)):
    name_vector = embed_bert_cls(row['name'], model, tokenizer)
    props_vector = embed_bert_cls(str(row['props']).lower(), model, tokenizer)
    embeddings_ref.append([name_vector, props_vector])
embeddings_ref = np.array(embeddings_ref)
# Flatten each [name_vector, props_vector] pair into a single row.
embeddings_ref = embeddings_ref.reshape(len(embeddings_ref), -1)



HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [None]:
embeddings_query.shape

(2780, 624)

In [None]:
# NOTE(review): `labels` is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh Restart & Run All unless it is
# defined elsewhere. Presumably it was meant to hold the reference_id of each
# query row -- TODO confirm and define it explicitly.
from sklearn.model_selection import train_test_split
# NOTE(review): X_train / X_test / y_train / y_test are not used by any later
# visible cell; no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(embs, labels,stratify = labels, test_size=0.25)

In [None]:
import scann

In [None]:
dataset = embeddings_ref
queries = embeddings_query

In [None]:
# Scale every row to unit length so that a dot product equals cosine similarity.
normalized_dataset = dataset / np.linalg.norm(dataset, axis=1, keepdims=True)
normalized_queries = queries / np.linalg.norm(queries, axis=1, keepdims=True)

# ScaNN index over the reference rows: 10 neighbours per query, dot-product
# similarity, tree partitioning + asymmetric hashing + exact re-ordering.
searcher = (
    scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product")
    .tree(num_leaves=400, num_leaves_to_search=400, training_sample_size=250000)
    .score_ah(2)
    .reorder(100)
    .build()
)

In [None]:

neighbors, distances = searcher.search_batched(normalized_queries)

In [None]:
# `nei` was an undefined name (NameError on Run All); show the first few rows
# of the neighbour index matrix returned by the search instead.
neighbors[:5]

In [None]:
# Column 0 holds the top-ranked neighbour for each query; map that positional
# index in `references` to the reference's product_id.
nearest = neighbors[:, 0]
proposals = [references.iloc[position, :]['product_id'] for position in nearest]

In [None]:
# Share of query rows whose top-1 proposal equals the labelled reference_id.
# Taking the mean removes the hard-coded row count (previously 2780), which
# would silently go stale if the data changed.
(unref['reference_id'] == proposals).mean()

0.7629496402877698

In [None]:
references.iloc[263,:]['props']

['Потребляемая мощность 1500 Вт',
 'Максимальная температура воды 75 °С',
 "Установка вертикальная на стену с нижней подводкой (½')",
 'Управление механическое',
 'Системы защиты предохранительный клапан',
 'Индикация нагрева',
 'Объем  бака 80 л',
 'Полезная   мощность 1.5 кВт',
 'Размеры  (ШxВxГ) 450x758x480 мм',
 'Покрытие  бака титановая эмаль']

In [None]:
unref_nonunique.iloc[-1]

name                             Водонагреватель edisson er 100 v
props           [Максимальная температура воды 75 °С, Покрытие...
reference_id                                     7810daae8a7e7fba
data_string     Название: Водонагреватель edisson er 100 v; Ха...
Name: 3250, dtype: object

In [None]:
str(references[references['product_id']=='7810daae8a7e7fba']['name'].values)

"['Накопительный электрический водонагреватель Edisson ER 100V']"

In [None]:
# NOTE(review): `fails` is only assigned in a later cell of this notebook, so
# on a fresh Restart & Run All this cell raises NameError; the assignment cell
# should run before this display.
fails

Unnamed: 0,name,props,reference_id
10,Видеокарта MSI PCI-E GeForce GTX 1050 Ti 4GT O...,"[Комплектация Retail, Область применения игров...",31862a08efb59198
20,Xiaomi Mi Robot Vacuum Mop 2 Pro White [BHR504...,"[Объем контейнера для пыли 0.45 л, Работа от...",516c4c0cca619ea4
23,Кофемашина Beko CEG5331X (нержавеющая сталь),"[Приготовление капучино автоматическое, Тип ...",698c2d3015a71ee7
31,Видеокарта Gigabyte PCI-E 4.0 GV-N3080GAMING O...,"[Шина обмена с памятью 384 бит, Область примен...",9158a4c6707ede74
38,"ASUS Монитор 24"" ASUS VA24DQLB (90LM0541-B01370)","[Тип матрицы IPS, Экран 1920x1080 (16:9), ...",be85c7f8554f16c0
...,...,...,...
3228,Накопительный электрический водонагреватель ar...,"[Управление механическое, Системы защиты от пе...",f3823ddbc3398737
3235,50' Samsung UE50AU7100UXRU,"[Мощность звука 20 Вт, Платформа Smart TV Tize...",d75d646df7e9e2bb
3248,"Huawei Умный браслет Band 7, графитово-черный","[Модификация GPS, Материал корпуса пластик]",177ccb3b84125efa
3249,Видеокарта Gigabyte GV-R675XTGAMING OC-12GD 12...,[Комплектация Retail],a07d5538ebec8e36


In [None]:
# Query rows whose top-1 proposal differs from the labelled reference_id.
mismatch_mask = proposals != unref['reference_id']
fails = unref_nonunique.loc[mismatch_mask].copy()

In [None]:
##########Final code

In [None]:
##########
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import scann

import torch
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

def embed_bert_cls(text, model, tokenizer):
    """Return a normalized embedding taken from the first token position.

    Args:
        text: Input passed straight to ``tokenizer`` (padding and truncation
            enabled, PyTorch tensors requested).
        model: Callable model exposing ``.device`` and returning an object
            with ``last_hidden_state``.
        tokenizer: Callable returning a mapping of tensors.

    Returns:
        1-D NumPy array: row 0 of the normalized first-position hidden states.
        Only the first row is returned, so if ``text`` encodes to several
        rows the remaining ones are discarded.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Move every tokenizer output onto the device the model lives on.
    on_device = {key: tensor.to(model.device) for key, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**on_device)
    # Position 0 of the sequence axis (presumably the [CLS] token, per the function name).
    first_token = output.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(first_token)
    return unit_vectors[0].cpu().numpy()

def merge_name_and_properties(df):
    """Return a copy of ``df`` with an extra ``data_string`` column.

    Each ``data_string`` is "Название: " + name + '; Характеристики товара:'
    + the props joined with ', '. Only the joined props part has the literal
    two-character sequence backslash-t replaced by a space and is lower-cased;
    the name keeps its original case. The input frame is not modified.

    Args:
        df: DataFrame with a ``name`` column (strings) and a ``props`` column
            (iterables of strings).

    Returns:
        A new DataFrame with the added ``data_string`` column.
    """
    result = df.copy()
    result['data_string'] = [
        "Название: " + str(name + '; Характеристики товара:' + ', '.join(props).replace("\\t", " ").lower())
        for name, props in zip(df['name'], df['props'])
    ]
    return result

def get_candidates(embedding_queries,embedding_dataset,references):
    """Return, for each query embedding, the product_id of its nearest reference.

    Args:
        embedding_queries: 2-D array-like, one query embedding per row.
        embedding_dataset: 2-D array-like, one reference embedding per row;
            row order must match the row order of ``references``.
        references: DataFrame with a ``product_id`` column; neighbour indices
            returned by the searcher are used as positions into it.

    Returns:
        List with one ``product_id`` per query row (the top-ranked neighbour).
    """
    # Bug fix: the body used to read `embeddings_dataset` / `embeddings_queries`,
    # which are not the parameter names, so the call either raised NameError or
    # silently picked up same-named notebook globals instead of the arguments.
    dataset = np.asarray(embedding_dataset)
    queries = np.asarray(embedding_queries)

    # Unit-length rows make the dot product equal to cosine similarity.
    normalized_dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]
    normalized_queries = queries / np.linalg.norm(queries, axis=1)[:, np.newaxis]

    searcher = (
        scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product")
        .tree(num_leaves=400, num_leaves_to_search=400, training_sample_size=250000)
        .score_ah(2)
        .reorder(100)
        .build()
    )
    neighbors, _ = searcher.search_batched(normalized_queries)

    # Column 0 is the best-ranked neighbour; map its position to a product_id.
    product_ids = references['product_id'].to_numpy()
    return [product_ids[position] for position in neighbors[:, 0]]

def create_embeddings(df):
    """Embed every row of ``df`` as one flat vector (name part + props part).

    For each row, ``embed_bert_cls`` is called on ``row['name']`` and on the
    lower-cased ``str()`` of ``row['props']``, using the module-level ``model``
    and ``tokenizer``. The two vectors are laid side by side in one row.

    Args:
        df: DataFrame with ``name`` and ``props`` columns.

    Returns:
        2-D NumPy array with one row per row of ``df``.
    """
    pairs = []
    # iterrows() is a generator without a length, so the total is passed
    # explicitly; otherwise the progress bar cannot show real progress.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        name_vector = embed_bert_cls(row['name'], model, tokenizer)
        props_vector = embed_bert_cls(str(row['props']).lower(), model, tokenizer)
        pairs.append([name_vector, props_vector])
    stacked = np.array(pairs)
    # Collapse each [name_vector, props_vector] pair into a single row.
    return stacked.reshape(len(pairs), -1)







Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
