In [1]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import os

csv_path = '../../../resources/union_final.csv'
host = os.environ['ELASTIC_HOST']
login = os.environ['ELASTIC_LOGIN']
password = os.environ['ELASTIC_PASSWORD']
es = Elasticsearch(host, basic_auth=(login, password))
video_index = 'video-index'
batch_size = 100

data: pd.DataFrame = pd.read_csv(csv_path, low_memory=False)



In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

print(torch.backends.mps.is_available())
device = torch.device("mps")


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = model.to(device)


def calc_vector(text):
    #Tokenize sentences
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=10000, return_tensors='pt').to(device)

    #Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    #Perform pooling. In this case, mean pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings[0].cpu().detach().numpy()


len(calc_vector('Привет'))

True




1024

In [3]:
def process_data(df: pd.DataFrame):
    videos = []
    for index, row in df.iterrows():
        video = {
            'index': row['index'],
            'description_ru': row['description_ru'],
            'description_ru_vector': calc_vector(row['description_ru']),
            'link': row['link'],
            'summary': str(row['short_description_ru'] or None)
        }

        tags = str(row['tags'])
        if tags != 'nan':
            video['tags'] = tags
            video['tags_vector'] = calc_vector(tags)

        voice = str(row['text'])
        if voice != 'nan':
            video['voice'] = voice
            video['voice_vector'] = calc_vector(voice)

        videos.append(video)
    helpers.bulk(es, videos, index=video_index)


from concurrent.futures import ThreadPoolExecutor
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"
executor = ThreadPoolExecutor(max_workers=24)

data: pd.DataFrame = pd.read_csv(csv_path, low_memory=False)
# data = data.loc[0:100]

# process_data(data)

list_of_dfs = [data.loc[i:i + batch_size - 1, :] for i in range(0, len(data), batch_size)]

futures = []
for i in range(0, len(list_of_dfs)):
    future = executor.submit(process_data, list_of_dfs[i])
    futures.append(future)

for future in futures:
    future.result()

executor.shutdown()

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [9]:
text = 'тачки'
text_vector = calc_vector(text)

multi_query = {
    "knn": [
        {
            "field": "description_ru_vector",
            "query_vector": text_vector,
            "k": 10,
            "boost": 1
        },
        {
            "field": "tags_vector",
            "query_vector": text_vector,
            "k": 10,
            "boost": 0.85
        },
        {
            "field": "voice_vector",
            "query_vector": text_vector,
            "k": 10,
            "boost": 0.35
        }
    ],
    'fields': ['description_ru', 'tags', 'link'],
    '_source': 'false'
}

response = es.search(index="video-index", body=multi_query)
items = response['hits']['hits']
items

[{'_index': 'video-index',
  '_id': 'Sj3IG5ABoItb8IbNRk3S',
  '_score': 0.8367896,
  '_source': {},
  'fields': {'link': ['https://cdn-st.rutubelist.ru/media/95/44/66f790204ea78901b17d890a1e60/fhd.mp4'],
   'description_ru': ['Машина с человеком на заднем сиденье и мотоцикл на заднем сиденье.'],
   'tags': ['#boobs , #красивыедевушки , #ass']}},
 {'_index': 'video-index',
  '_id': 'nTyfG5ABoItb8IbNFEsP',
  '_score': 0.83467627,
  '_source': {},
  'fields': {'link': ['https://cdn-st.rutubelist.ru/media/74/dd/8d140dc64ffcb151f520ec5dea5c/fhd.mp4'],
   'description_ru': ['Карусель машин и грузовиков в гараже.'],
   'tags': ['#тачки #машины #обзоравто']}},
 {'_index': 'video-index',
  '_id': '-TptG5ABoItb8IbNIfkQ',
  '_score': 0.8152423,
  '_source': {},
  'fields': {'link': ['https://cdn-st.rutubelist.ru/media/84/89/587c358546828bf85732a0e3cae5/fhd.mp4'],
   'description_ru': ['Автомобиль с красной и белой машиной на нем.'],
   'tags': ['#спорт #наспорте #футбол #прикол']}},
 {'_index': '