In [2]:
import os
from getpass import getpass

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Path of the CSV file with the video metadata that gets indexed below.
csv_path = '../../../resources/union_final.csv'

# Connection settings are taken from the environment so that the password is
# never stored in the notebook. ES_URL / ES_USER fall back to the values this
# notebook used before; the password is prompted for when ES_PASSWORD is unset.
# NOTE(review): the password that used to be hard-coded here should be rotated.
es_url = os.environ.get('ES_URL', 'http://87.242.93.110:9200')
es_user = os.environ.get('ES_USER', 'elastic')
es_password = os.environ.get('ES_PASSWORD') or getpass(f'Elasticsearch password for {es_user}: ')
es = Elasticsearch(es_url, basic_auth=(es_user, es_password))

# Name of the Elasticsearch index the documents are written to and searched in.
video_index = 'video-index'
# Number of CSV rows sent to Elasticsearch per bulk request.
batch_size = 100

data: pd.DataFrame = pd.read_csv(csv_path, low_memory=False)



In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

# Report whether the Apple-silicon MPS backend can be used on this machine.
print(torch.backends.mps.is_available())

# Pick the best available device instead of assuming MPS: a hard-coded "mps"
# device makes `model.to(device)` fail on hosts without that backend.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions enabled in the attention mask.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings.
        attention_mask: Mask tensor; it is expanded along a new last axis to
            the shape of the token embeddings (presumably 1 for real tokens
            and 0 for padding — confirm against the tokenizer).

    Returns:
        The mask-weighted sum of the embeddings over dimension 1 divided by
        the mask total over the same dimension.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(dim=1)
    # The lower bound keeps the division finite when a row of the mask is all zeros.
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total


# Fetch the pretrained tokenizer and encoder from the Hugging Face model
# repository; the encoder is placed on `device` right after loading.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru").to(device)


def calc_vector(text, max_length=None):
    """Embed ``text`` with the sentence encoder and return one vector.

    Args:
        text: The text to embed (passed straight to the tokenizer).
        max_length: Maximum number of tokens kept after truncation. When
            omitted, the encoder's positional limit
            (``model.config.max_position_embeddings``, 512 if the config does
            not expose it) is used. A larger value lets long inputs through
            untruncated and makes the model fail on them.

    Returns:
        A NumPy array with the mean-pooled embedding of the first (only)
        sequence in the batch.
    """
    if max_length is None:
        # Truncate to what the encoder can actually position-embed.
        max_length = getattr(model.config, 'max_position_embeddings', 512)

    # Tokenize and move the resulting tensors to the same device as the model.
    encoded_input = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    ).to(device)

    # Inference only: no gradients are needed for the token embeddings.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Collapse the per-token embeddings into a single sentence embedding.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings[0].cpu().detach().numpy()


# Smoke test: embed a short Russian word and display the length of the returned vector.
len(calc_vector('Привет'))

True




1024

In [None]:
def _text_or_none(value):
    """Return ``value`` as ``str``, or ``None`` when pandas treats it as missing (NaN/None)."""
    return None if pd.isna(value) else str(value)


def process_data(df: pd.DataFrame):
    """Build one document per row of ``df`` and bulk-index them into Elasticsearch.

    Every document carries ``index``, ``link`` and ``summary`` (``None`` when
    ``short_description_ru`` is missing, instead of the literal string
    ``'nan'``). For each of the text columns ``description_ru``, ``tags`` and
    ``text`` (stored as ``voice``) that has a value, the text itself and its
    embedding (``<field>_vector``) are added; missing values are skipped so
    that NaN is never sent to the tokenizer.

    Args:
        df: Rows to index. Reads the columns ``index``, ``description_ru``,
            ``link``, ``short_description_ru``, ``tags`` and ``text``.

    Returns:
        None. The documents are sent with a single ``helpers.bulk`` call to
        the index named by ``video_index``.
    """
    # (source column in the CSV, field name in the indexed document)
    text_fields = (
        ('description_ru', 'description_ru'),
        ('tags', 'tags'),
        ('text', 'voice'),
    )

    videos = []
    for _, row in df.iterrows():
        video = {
            'index': row['index'],
            'link': row['link'],
            'summary': _text_or_none(row['short_description_ru']),
        }

        for column, field in text_fields:
            text = _text_or_none(row[column])
            if text is None:
                continue
            video[field] = text
            video[f'{field}_vector'] = calc_vector(text)

        videos.append(video)
    helpers.bulk(es, videos, index=video_index)


from concurrent.futures import ThreadPoolExecutor
import os

# NOTE(review): this is set after the tokenizer was already loaded and used
# in an earlier cell — confirm that it still takes effect at this point.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Re-read the CSV so this cell does not depend on earlier mutations of `data`.
# To try the pipeline on a subset, slice `data` here before it is batched.
data: pd.DataFrame = pd.read_csv(csv_path, low_memory=False)

# Positional slicing yields `batch_size` rows per chunk regardless of what the
# frame's index labels look like.
list_of_dfs = [data.iloc[i:i + batch_size] for i in range(0, len(data), batch_size)]

# The context manager shuts the pool down even when a batch raises, so worker
# threads are not leaked on failure.
with ThreadPoolExecutor(max_workers=24) as executor:
    futures = [executor.submit(process_data, batch) for batch in list_of_dfs]

    # result() re-raises the first exception that occurred in a worker thread.
    for future in futures:
        future.result()

In [15]:
# Free-text query that is embedded and matched against the stored vectors.
text = 'тачки'
text_vector = calc_vector(text)

# Boost applied to the kNN clause of each vector field; with these values only
# the tags clause contributes a non-zero boost.
knn_boosts = {
    'description_ru_vector': 0,
    'tags_vector': 1,
    'voice_vector': 0,
}

multi_query = {
    "knn": [
        {
            "field": field,
            "query_vector": text_vector,
            "k": 10,
            "boost": boost,
        }
        for field, boost in knn_boosts.items()
    ],
    'fields': ['description_ru', 'tags', 'link'],
    # Must be the boolean False: the string 'false' is read by Elasticsearch
    # as a source-filter pattern rather than as "do not return _source".
    '_source': False,
}

# Search the same index the ingestion step writes to.
response = es.search(index=video_index, body=multi_query)
items = response['hits']['hits']
items

[{'_index': 'video-index',
  '_id': 'zDozG5ABoItb8IbNmhSf',
  '_score': 0.81202555,
  '_source': {},
  'fields': {'link': ['https://cdn-st.rutubelist.ru/media/1c/c3/a87c36804fd68f7b4f00d9bf7dd9/fhd.mp4'],
   'description_ru': ['Автомобиль едет по улице с машиной на заднем плане. Автомобиль едет по улице рядом с деревом. Уличный знак на шесте в городе.'],
   'tags': ['#тачки #машины #обзоравто']}},
 {'_index': 'video-index',
  '_id': '0TozG5ABoItb8IbNmhSf',
  '_score': 0.81202555,
  '_source': {},
  'fields': {'link': ['https://cdn-st.rutubelist.ru/media/1c/c4/aecc773349e78983bcdbcc81382c/fhd.mp4'],
   'description_ru': ['Черный автомобиль припарковался на парковке.'],
   'tags': ['#тачки #машины #обзоравто']}},
 {'_index': 'video-index',
  '_id': '4DozG5ABoItb8IbNmhSf',
  '_score': 0.81202555,
  '_source': {},
  'fields': {'link': ['https://cdn-st.rutubelist.ru/media/1c/c8/39c70c5141e78e6ca24ec08b39c7/fhd.mp4'],
   'description_ru': ['Человек, сидящий на скейтборде в комнате. Женщина, 