In [25]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams
from qdrant_client.http.models import Distance

from text_splitter import CustomTextSplitter
from retriever import CustomRetriever
from utils import folder_into_db_collection
from metrics import average_precision_at_k

In [26]:
validation_data = pd.read_csv('../data/validation_dataset.csv')
questions = validation_data.question.to_list()
related_documents = validation_data.related_documents.to_list()
for i in range(len(related_documents)):
    related_documents[i] = related_documents[i].split(',')

In [27]:
url = "http://localhost:6333"
qdrant_client = QdrantClient(url)

def get_sentences_embeddings(sentences, st_model):
    return st_model.encode(sentences)

In [28]:
similarity_metrics = [
    (Distance.COSINE, 'cosine'), 
    (Distance.EUCLID, 'L2'),
    ]
embedder_names = [
    # 'intfloat/multilingual-e5-large',
    # 'intfloat/multilingual-e5-base',
    # 'intfloat/multilingual-e5-small',
    # 'deepvk/USER-bge-m3',
    # 'deepvk/USER-base',
    # 'ai-forever/ru-en-RoSBERTa',
    # 'sergeyzh/LaBSE-ru-turbo',
    'sergeyzh/rubert-tiny-turbo',
    # 'cointegrated/LaBSE-en-ru',
    'cointegrated/rubert-tiny2',
    ]
chunk_sizes = [512, 1024, 1536, 2048]


In [29]:
columns = ['AP_K', 'embedder_name', 'metric_name', 'chunk_size']
df_metrics = pd.DataFrame(columns=columns)

In [30]:
collection_name = "obsidian-vault"
for embedder_name in embedder_names:
    for metric in similarity_metrics:
        metric_name = metric[1]
        for chunk_size in chunk_sizes:
            sentence_embedder = SentenceTransformer(embedder_name, device="cpu")
            VECTOR_SIZE = get_sentences_embeddings(
                'Hello, world!', sentence_embedder).shape[0]
            qdrant_client.recreate_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(
                    size=VECTOR_SIZE,
                    distance=metric[0]),
            )
            folder_into_db_collection(
                '../data/KnowledgeStore',
                qdrant_client,
                CustomTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_size // 2),
                sentence_embedder)
            retriever = CustomRetriever(
                qdrant_client=qdrant_client,
                embedder=sentence_embedder,
                collection_name=collection_name
                )
            ap_K = []
            for i, question in enumerate(questions):
                search_results = retriever.get_relevant_documents(question, limit=10)
                search_results = [result.payload['filename'] for result in search_results]
                ap_K.append(average_precision_at_k(
                    search_results, related_documents[i], 10))
            AP_K = sum(ap_K) / len(ap_K)
            new_row = pd.DataFrame((AP_K, embedder_name, metric_name, chunk_size),
                                 index=columns)
            df_metrics = pd.concat(
                [df_metrics, new_row.T], ignore_index=True)
            df_metrics.to_csv('metrics.csv', index=False)
    print(f'Finished {embedder_name}')

  qdrant_client.recreate_collection(
Created a chunk of size 1106, which is longer than the specified 512
Created a chunk of size 677, which is longer than the specified 512
Created a chunk of size 1182, which is longer than the specified 512
Created a chunk of size 956, which is longer than the specified 512
Created a chunk of size 1619, which is longer than the specified 512
Created a chunk of size 551, which is longer than the specified 512
Created a chunk of size 564, which is longer than the specified 512
Created a chunk of size 697, which is longer than the specified 512
Created a chunk of size 2187, which is longer than the specified 512
Created a chunk of size 570, which is longer than the specified 512
Created a chunk of size 523, which is longer than the specified 512
Created a chunk of size 527, which is longer than the specified 512
Created a chunk of size 1629, which is longer than the specified 512
Created a chunk of size 529, which is longer than the specified 512
Create

Finished sergeyzh/rubert-tiny-turbo


  qdrant_client.recreate_collection(
Created a chunk of size 1106, which is longer than the specified 512
Created a chunk of size 677, which is longer than the specified 512
Created a chunk of size 1182, which is longer than the specified 512
Created a chunk of size 956, which is longer than the specified 512
Created a chunk of size 1619, which is longer than the specified 512
Created a chunk of size 551, which is longer than the specified 512
Created a chunk of size 564, which is longer than the specified 512
Created a chunk of size 697, which is longer than the specified 512
Created a chunk of size 2187, which is longer than the specified 512
Created a chunk of size 570, which is longer than the specified 512
Created a chunk of size 523, which is longer than the specified 512
Created a chunk of size 527, which is longer than the specified 512
Created a chunk of size 1629, which is longer than the specified 512
Created a chunk of size 529, which is longer than the specified 512
Create

Finished cointegrated/rubert-tiny2
