In [1]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers

import os
from getpass import getpass

# CSV with the video metadata that gets embedded and indexed below.
csv_path = '../../../resources/union_4.csv'

# Connection settings are read from the environment so that no secret is stored
# in the notebook. The password is prompted for interactively when ES_PASSWORD
# is not set.
# NOTE(review): the password that used to be hard-coded here should be rotated.
es_url = os.environ.get('ES_URL', 'http://87.242.93.110:9200')
es_user = os.environ.get('ES_USER', 'elastic')
es_password = os.environ.get('ES_PASSWORD') or getpass('Elasticsearch password: ')
es = Elasticsearch(es_url, basic_auth=(es_user, es_password))

# Target index for bulk indexing and the number of CSV rows sent per bulk request.
video_index = 'video-index-4'
batch_size = 10

data: pd.DataFrame = pd.read_csv(csv_path)




In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

# Report whether the MPS backend is usable, then pick the best available device
# instead of unconditionally assuming MPS: forcing "mps" makes every later
# tensor operation fail on machines without it.
mps_available = torch.backends.mps.is_available()
print(mps_available)
if mps_available:
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Mask-aware mean pooling: padded positions must not contribute to the average.
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each input, weighted by the attention mask.

    Args:
        model_output: Indexable model result whose element 0 holds the token
            embeddings (presumably shaped (batch, tokens, hidden) - TODO confirm).
        attention_mask: Mask tensor; it is given a trailing axis and expanded to
            the size of the token embeddings.

    Returns:
        Tensor of the masked sum over dimension 1 divided by the mask sum over
        dimension 1. The divisor is clamped to at least 1e-9, so an all-zero
        mask yields zeros instead of a division by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total


# Load the tokenizer and the encoder for the "ai-forever/sbert_large_nlu_ru"
# checkpoint from the Hugging Face hub; the encoder is placed on `device` right away.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru").to(device)


def get_embedding_3(text):
    """Embed ``text`` with the module-level tokenizer/model and return one vector.

    Args:
        text: Text to embed. Whatever the tokenizer accepts is passed through,
            but only the embedding of the first item is returned.

    Returns:
        NumPy array with the mean-pooled embedding of the first input, moved to
        the CPU.
    """
    # The previous max_length=10000 meant truncation never kicked in at the
    # model's own limit, so long transcripts reached the model with more tokens
    # than it has position embeddings for. Clamp to the limit declared in the
    # model config (presumably 512 for this checkpoint - TODO confirm).
    position_limit = getattr(model.config, "max_position_embeddings", None)
    max_tokens = min(10000, position_limit) if position_limit else 10000

    # Tokenize sentences
    encoded_input = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_tokens,
        return_tensors='pt',
    ).to(device)

    # Compute token embeddings without tracking gradients (inference only)
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling. In this case, mean pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings[0].cpu().detach().numpy()


# Smoke check: embed a short word and display the length of the resulting vector.
get_embedding_3('Привет').shape[0]

True


1024

In [None]:
def _clean_text(value):
    """Return ``value`` as a non-empty string, or ``None`` when it is missing or empty."""
    if value is None:
        return None
    # Empty CSV cells arrive as NaN, which is truthy: `value or ''` does not
    # catch it and str() would turn it into the literal text "nan".
    if not isinstance(value, str) and pd.isna(value):
        return None
    return str(value) or None


def process_data(df: pd.DataFrame):
    """Embed every row of ``df`` and bulk-index the resulting documents.

    For each row the description, voice transcript (``text`` column) and tags
    are embedded with ``get_embedding_3``. Missing or empty cells are stored as
    ``None`` (JSON null) rather than the strings "nan"/"None", and their vectors
    are computed from the empty string so every document carries all three
    vector fields.

    Args:
        df: Rows with the columns ``index``, ``description_ru``, ``link``,
            ``text``, ``tags`` and ``short_description_ru``.

    Returns:
        None. All documents of ``df`` are sent in a single ``helpers.bulk``
        call to the index named by the module-level ``video_index``.
    """
    videos = []
    for _, row in df.iterrows():
        description = _clean_text(row['description_ru'])
        voice = _clean_text(row['text'])
        tags = _clean_text(row['tags'])

        videos.append({
            'index': row['index'],
            'description_ru': description,
            'description_ru_vector': get_embedding_3(description or ''),
            'link': row['link'],
            'voice_vector': get_embedding_3(voice or ''),
            'tags_vector': get_embedding_3(tags or ''),
            'tags': tags,
            'voice': voice,
            'summary': _clean_text(row['short_description_ru']),
        })
    helpers.bulk(es, videos, index=video_index)


from concurrent.futures import ThreadPoolExecutor
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

data: pd.DataFrame = pd.read_csv(csv_path)

# Cut the frame into consecutive chunks of `batch_size` rows. Positional slicing
# gives the same chunks as the earlier label-based `.loc[i:i + batch_size - 1]`
# on a default RangeIndex, and stays correct if the index is ever changed.
list_of_dfs = [data.iloc[start:start + batch_size] for start in range(0, len(data), batch_size)]

# The context manager shuts the pool down when the block exits, so re-running
# this cell no longer leaves idle worker threads behind.
with ThreadPoolExecutor(max_workers=12) as executor:
    futures = [executor.submit(process_data, batch) for batch in list_of_dfs]

    # Wait for every batch; result() re-raises any exception from a worker.
    for future in futures:
        future.result()

In [4]:
# Embed the free-text query once; every kNN clause below searches with this vector.
text = 'тачки'
text_vector = get_embedding_3(text)

# Single-field kNN request (kept as an alternative; it is not sent below).
query = {
    'knn': {
        'field': 'vector',
        # 'field': 'voice_vector',
        # 'field': 'description_ru_vector',
        'query_vector': text_vector,
        # 'k': 10,
        # 'num_candidates': 100
    },
    'fields': ['description_ru', 'link'],
    # Boolean False disables _source. The string 'false' was parsed as a
    # source-filter pattern, which is why hits came back with an empty
    # '_source': {} instead of no _source at all.
    '_source': False
}

# Multi-field kNN request: one clause per vector field. With the current boosts
# only the tags clause contributes to the score; the other two are weighted 0.
multi_query = {
    "knn": [
        {
            "field": "description_ru_vector",
            "query_vector": text_vector,
            "k": 10,
            "boost": 0
        },
        {
            "field": "tags_vector",
            "query_vector": text_vector,
            "k": 10,
            "boost": 1
        },
        {
            "field": "voice_vector",
            "query_vector": text_vector,
            "k": 10,
            "boost": 0
        }
    ],
    'fields': ['description_ru', 'link'],
    '_source': False
}

# NOTE(review): this searches "video-index-3" while the indexing cell writes to
# `video_index` - confirm which index is intended.
response = es.search(index="video-index-3", body=multi_query)
items = response['hits']['hits']
items

[{'_index': 'video-index-3',
  '_id': 'PzexA5ABoItb8IbNe19R',
  '_score': 0.81299543,
  '_source': {},
  'fields': {'link': ['https://cdn-st.rutubelist.ru/media/02/59/21de5421418188fbee19966f4dd3/fhd.mp4'],
   'description_ru': ['Мотоцикл с красным сиденьем и красным рулем. Красный мотоцикл с рулем и ручкой руля. Машина с камерой на заднем сиденье. Женщина в красном топе сидит на красном мотоцикле. Женщина в черном платье, стоящая рядом с мотоциклом.']}},
 {'_index': 'video-index-3',
  '_id': '4zesA5ABoItb8IbNVUnt',
  '_score': 0.8122759,
  '_ignored': ['description_ru.keyword'],
  '_source': {},
  'fields': {'link': ['https://cdn-st.rutubelist.ru/media/01/73/4a9e04e44b46956392585327ca69/fhd.mp4'],
   'description_ru': ['Автомобиль едет по улице с птицей, летящей в воздухе. Автомобиль едет по улице с птицей на лобовом стекле. Черный автомобиль припарковался перед зданием. Автомобиль с черно -белой машиной на заднем плане. Автомобиль припаркован перед зданием с черной машиной. Автомобил