In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import faiss

# Load the pretrained tokenizer and encoder for the rubert-tiny2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
# Move the model to the GPU when one is available; otherwise stay on the CPU so
# the notebook still runs on machines without CUDA. `.to()` returns the model,
# so the cell keeps displaying its architecture as the last expression.
model.to("cuda" if torch.cuda.is_available() else "cpu")

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(83828, 312, padding_idx=0)
    (position_embeddings): Embedding(2048, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
   

In [5]:
# Load the book table from data_final.csv (path is relative to the working
# directory); later cells read its 'annotation' and 'title' columns.
df_books = pd.read_csv('data_final.csv')

In [6]:
# Remove fully duplicated rows and renumber the index from 0.
df_books = df_books.drop_duplicates().reset_index(drop=True)

# Maximum sequence length handed to the tokenizer (used as max_length in embed_bert_cls).
MAX_LEN = 300

In [2]:
# Text embedding helper built on a BERT-style encoder.
def embed_bert_cls(text, model, tokenizer, max_length=None):
    """Return the L2-normalised hidden state at position 0 for ``text``.

    Args:
        text: Input handed straight to ``tokenizer``.
        model: Encoder called with the tokenizer output; it must expose
            ``.device`` and return an object with ``last_hidden_state``.
        tokenizer: Callable invoked with ``padding=True``, ``truncation=True``,
            ``return_tensors='pt'`` and ``max_length``; must return a mapping
            of tensors.
        max_length: Truncation limit forwarded to the tokenizer. ``None``
            (the default) falls back to the notebook-level ``MAX_LEN``
            constant, which is the original behaviour.

    Returns:
        A 1-D NumPy array holding the normalised position-0 vector of the
        first sequence in the batch.
    """
    if max_length is None:
        # Looked up at call time, so MAX_LEN only has to exist when the
        # function is called, not when this cell is executed.
        max_length = MAX_LEN
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
    # Inputs must live on the same device as the model weights.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        model_output = model(**inputs)
    # Position 0 of every sequence, then unit length per row.
    cls_vectors = model_output.last_hidden_state[:, 0, :]
    cls_vectors = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)
    # Only the first row is returned, matching single-text usage.
    return cls_vectors[0].cpu().numpy()

In [9]:
# Keep only books whose annotation has at least 50 whitespace-separated words.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (avoids SettingWithCopyWarning).
df_books = df_books[df_books['annotation'].str.split().str.len() >= 50].copy()
# Drop semicolons, trim surrounding whitespace and lowercase every annotation.
df_books['annotation'] = (
    df_books['annotation']
    .str.replace(';', '', regex=False)
    .str.strip()
    .str.lower()
)
df_books.reset_index(drop=True, inplace=True)
# Embed each annotation; row i of vector_text corresponds to row i of df_books.
vector_text = np.array([embed_bert_cls(i, model, tokenizer) for i in df_books['annotation']])
# Embedding dimensionality, used to size the Faiss index.
vector_shape = vector_text.shape[1]

In [7]:
# Query text
user_text = 'антиутопия про людей которые пошли против системы'
# Embed the query and add a leading axis so it becomes a single-row 2-D array,
# the shape passed to index.search.
user_text_pred = np.expand_dims(embed_bert_cls(user_text, model, tokenizer), axis=0)

In [34]:
# Nearest-neighbour search for the query vector with Faiss.
# The vectors are L2-normalised in embed_bert_cls, so the inner product used by
# IndexFlatIP equals cosine similarity.
index = faiss.IndexFlatIP(vector_shape)
index.add(vector_text)
D, I = index.search(user_text_pred, 10)
for book_ind, pred in zip(I[0], D[0]):
    # Faiss fills missing neighbours with -1 when fewer than k vectors are indexed.
    if book_ind < 0:
        continue
    # Faiss returns row positions in vector_text, so look the title up by
    # position (.iloc) instead of by index label.
    print(df_books['title'].iloc[book_ind], pred)

In [24]:
# Write the vector array as plain text: one vector per line, components
# separated by single spaces.
with open('vectors.txt', 'w') as out_file:
    out_file.writelines(
        ' '.join(str(component) for component in row) + '\n'
        for row in vector_text
    )