In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams
from qdrant_client.http.models import Distance

from embedder import get_sentences_embeddings
from text_splitter import CustomTextSplitter
from retriever import CustomRetriever
from utils import folder_into_db_collection
from metrics import average_precision_at_k

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the validation set. Each row holds one question and the documents
# relevant to it, stored as a single comma-separated string.
validation_data = pd.read_csv('../data/validation_dataset.csv')
questions = validation_data['question'].tolist()
# Turn every comma-separated string into a list of document names.
related_documents = [
    documents.split(',')
    for documents in validation_data['related_documents'].tolist()
]

In [3]:
# Client for the Qdrant server that stores the chunk embeddings.
url = "http://localhost:6333"
qdrant_client = QdrantClient(url=url)

In [4]:
# Distance functions to benchmark, each paired with the label that is
# written to the `metric_name` column of the results table.
similarity_metrics = [(Distance.COSINE, 'cosine')]

# Prefix strings per embedding model: 'query' is handed to the retriever,
# 'instance' to the embedding / indexing helpers. Models that need no
# prefix all share the same `no_prefix` mapping.
no_prefix = dict(query='', instance='')

embedder_prefix = {
    'intfloat/multilingual-e5-large': dict(
        query='query: ', instance='passage: '),
    'intfloat/multilingual-e5-base': dict(
        query='query: ', instance='passage: '),
    'intfloat/multilingual-e5-small': dict(
        query='query: ', instance='passage: '),
    'ai-forever/ru-en-RoSBERTa': dict(
        query='search_query: ', instance='search_document: '),
    'deepvk/USER-bge-m3': no_prefix,
    'deepvk/USER-base': no_prefix,
    'sergeyzh/LaBSE-ru-turbo': no_prefix,
    'cointegrated/LaBSE-en-ru': no_prefix,
    'sergeyzh/rubert-tiny-turbo': no_prefix,
    'cointegrated/rubert-tiny2': no_prefix,
}

# Models to evaluate, in the order they are declared above
# (dicts preserve insertion order).
embedder_names = list(embedder_prefix)

# Chunk sizes swept for every model.
chunk_sizes = [512, 1024, 1536, 2048]


In [5]:
# Results table: one row per (embedder, metric, chunk size) combination.
columns = ['AP_K', 'embedder_name', 'metric_name', 'chunk_size']

# Continue from a previous run when metrics.csv exists; otherwise start with
# an empty table so the notebook also runs on a fresh checkout without
# manually switching between two assignments.
try:
    df_metrics = pd.read_csv('metrics.csv')
except FileNotFoundError:
    df_metrics = pd.DataFrame(columns=columns)

In [None]:
# Benchmark sweep: for every (embedder, distance metric, chunk size)
# combination, recreate the Qdrant collection, index the knowledge store,
# run every validation question through the retriever, and append the mean
# average precision at TOP_K to df_metrics (checkpointed to metrics.csv after
# each combination).
#
# NOTE(review): combinations already present in df_metrics are not skipped,
# so re-running this cell appends duplicate rows to metrics.csv.
collection_name = "obsidian-vault"
TOP_K = 10  # hits requested per question, and the k passed to AP@k

for embedder_name in embedder_names:
    sentence_embedder = SentenceTransformer(embedder_name)
    query_prefix = embedder_prefix[embedder_name]['query']
    instance_prefix = embedder_prefix[embedder_name]['instance']
    # The embedding shape depends only on the model, so probe it once per
    # embedder rather than once per chunk size.
    # Index 1 of the shape is used as the vector size below — presumably the
    # helper returns an (n_sentences, dim) array; TODO confirm.
    VECTOR_SIZE = get_sentences_embeddings(
        ['Hello, world!'], sentence_embedder, prefix=instance_prefix).shape
    for metric in similarity_metrics:
        distance, metric_name = metric
        for chunk_size in chunk_sizes:
            # Start from an empty collection for every configuration.
            qdrant_client.recreate_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(
                    size=VECTOR_SIZE[1],
                    distance=distance),
            )
            # NOTE(review): collection_name is not passed to
            # folder_into_db_collection; presumably it targets the same
            # collection by default — confirm against utils.
            folder_into_db_collection(
                '../data/KnowledgeStore',
                qdrant_client,
                CustomTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_size // 2),  # overlap is half a chunk
                sentence_embedder,
                prefix=instance_prefix)
            retriever = CustomRetriever(
                qdrant_client=qdrant_client,
                embedder=sentence_embedder,
                collection_name=collection_name,
                prefix=query_prefix,
                )
            # AP@k for each validation question, scored on the filenames
            # stored in the payload of the retrieved hits.
            ap_K = []
            for question, relevant_documents in zip(questions, related_documents):
                hits = retriever.get_relevant_documents(question, limit=TOP_K)
                retrieved_files = [hit.payload['filename'] for hit in hits]
                ap_K.append(average_precision_at_k(
                    retrieved_files, relevant_documents, TOP_K))
            AP_K = sum(ap_K) / len(ap_K)
            # Build the row as a one-row frame with named columns so numeric
            # values keep their dtype (a transposed column is all-object).
            new_row = pd.DataFrame(
                [[AP_K, embedder_name, metric_name, chunk_size]],
                columns=columns)
            df_metrics = pd.concat(
                [df_metrics, new_row], ignore_index=True)
            # Save after every combination so an interrupted sweep keeps
            # the results computed so far.
            df_metrics.to_csv('metrics.csv', index=False)
    print(f'Finished {embedder_name}')