In [1]:
import os
import time

import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams
from qdrant_client.http.models import Distance

from text_splitter import CustomTextSplitter
from metrics import average_precision_at_k
from mistralai import Mistral

In [2]:
def get_sentences_embeddings_mistral(sentences, client_mistral, model_name):
    """Embed a batch of texts through a Mistral client.

    Args:
        sentences: Texts to embed; forwarded unchanged as ``inputs``.
        client_mistral: Object exposing ``embeddings.create(model=, inputs=)``.
        model_name: Embedding model identifier; forwarded as ``model``.

    Returns:
        A list holding the ``embedding`` attribute of every item in the
        response's ``data``. An empty list/tuple of sentences yields ``[]``
        without contacting the service.
    """
    # Nothing to embed: skip the remote round-trip instead of sending an
    # empty batch to the API.
    if isinstance(sentences, (list, tuple)) and not sentences:
        return []
    response = client_mistral.embeddings.create(
        model=model_name,
        inputs=sentences,
    )
    return [item.embedding for item in response.data]

In [3]:
def file_into_db_collection(
        root, filename, db_client, text_splitter, mistral_client,
        embedder_name, collection_name="obsidian-vault"):
    """Chunk one Markdown file, embed the chunks and upload them.

    Files whose name does not end in ``.md`` are ignored.

    Args:
        root: Directory that contains the file.
        filename: Name of the file inside ``root``.
        db_client: Object exposing ``upload_collection(collection_name=,
            vectors=, payload=)``.
        text_splitter: Object exposing ``split_text(str)`` that returns the
            chunks to embed.
        mistral_client: Client passed on to
            ``get_sentences_embeddings_mistral``.
        embedder_name: Embedding model identifier.
        collection_name: Target collection for the upload.

    Returns:
        None. Prints the filename, the content length and the chunk count.
    """
    if not filename.endswith(".md"):
        return

    file_path = os.path.join(root, filename)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding, and release the file handle before
    # the slow network calls below.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Blank notes get placeholder text so there is something to split/embed.
    if content.strip() == '':
        content += 'Bla bla'
    print(filename)
    print(len(content))

    splitted_texts = text_splitter.split_text(content)
    print(f'splitted_texts: {len(splitted_texts)}')
    # No chunks means nothing to embed or upload.
    if not splitted_texts:
        return

    # Pause before each embeddings request -- presumably to stay under the
    # API rate limit; confirm before removing.
    time.sleep(2)
    embeds = get_sentences_embeddings_mistral(
        splitted_texts, mistral_client, embedder_name)
    # One payload entry per chunk, aligned by position with `embeds`.
    payload = [{
        'file_path': file_path,
        'filename': filename,
        'text': text_chunk}
        for text_chunk in splitted_texts]
    db_client.upload_collection(
        collection_name=collection_name,
        vectors=embeds,
        payload=payload
    )


def folder_into_db_collection(
        folder_path, db_client, text_splitter, mistral_client,
        embedder_name, collection_name="obsidian-vault"):
    """Walk ``folder_path`` recursively and ingest every file found.

    Each file encountered by ``os.walk`` is handed to
    ``file_into_db_collection`` together with the clients, splitter,
    embedder name and collection name given here.
    """
    # Lazily flatten the directory walk into (directory, file name) pairs so
    # files are still processed in walk order, one at a time.
    discovered = (
        (directory, name)
        for directory, _subdirs, names in os.walk(folder_path)
        for name in names
    )
    for directory, name in discovered:
        file_into_db_collection(
            directory, name, db_client, text_splitter, mistral_client,
            embedder_name, collection_name)

In [4]:
class CustomRetriever:
    """Vector search over one collection, with queries embedded on the fly.

    Attributes are plain references to what the constructor receives:
    ``qdrant_client`` (object exposing ``search``), ``client`` (passed to
    ``get_sentences_embeddings_mistral``), ``collection_name`` and
    ``model_name``.
    """

    def __init__(self, qdrant_client, client, collection_name: str,
                 model_name):
        """Store the search client, embedding client and their settings."""
        self.qdrant_client, self.client = qdrant_client, client
        self.collection_name, self.model_name = collection_name, model_name

    def get_relevant_documents(self, query, limit=5):
        """Embed ``query`` and return the ``limit`` best search results.

        The return value is whatever ``qdrant_client.search`` produces.
        """
        # The helper works on batches; wrap the query and take its only vector.
        embedded_batch = get_sentences_embeddings_mistral(
            [query], self.client, self.model_name)
        query_vector = embedded_batch[0]
        return self.qdrant_client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit,
        )

In [5]:
# Validation set: one question per row plus a comma-separated string of the
# documents considered relevant to it.
validation_data = pd.read_csv('../data/validation_dataset.csv')
questions = validation_data.question.to_list()
# Turn each comma-separated string into a list of document names.
related_documents = [
    joined_names.split(',')
    for joined_names in validation_data.related_documents.to_list()
]

In [6]:
# Address of the Qdrant instance used for every experiment in this notebook.
url = "http://localhost:6333"
# A plain string passed as the first argument is treated as the URL, so the
# keyword form is equivalent and more explicit.
qdrant_client = QdrantClient(url=url)

In [7]:
# Experiment grid: every embedder is evaluated at every chunk size.
embedder_names = ["mistral-embed"]
chunk_sizes = [
    1024,
    1536,
    2048,
]

In [8]:
# Column layout of the metrics table written by the experiment loop.
columns = ['AP_K', 'embedder_name', 'metric_name', 'chunk_size']
# Resume from previously saved results when they exist; otherwise start with
# an empty table so the notebook also runs on a fresh checkout.
if os.path.exists('metrics_mistral.csv'):
    df_metrics = pd.read_csv('metrics_mistral.csv')
else:
    df_metrics = pd.DataFrame(columns=columns)

In [None]:
collection_name = "obsidian-vault"
# Number of results retrieved per question and the cutoff used by the metric.
top_k = 10

# Credentials must never live in the notebook: read the key from the
# environment. Any key that was previously committed here should be treated
# as compromised and rotated.
api_key = os.environ.get('MISTRAL_API_KEY')
if not api_key:
    raise RuntimeError(
        'Set the MISTRAL_API_KEY environment variable before running '
        'this cell.')
# The client does not depend on the embedder, so build it once.
client = Mistral(api_key=api_key)

for embedder_name in embedder_names:
    # Probe the embedding dimensionality once per embedder, using the model
    # that is actually being evaluated.
    VECTOR_SIZE = len(get_sentences_embeddings_mistral(
        ['Hello, world!'], client, model_name=embedder_name)[0])
    for chunk_size in chunk_sizes:
        # Start from an empty collection so chunks produced with another
        # chunk size do not leak into this run.
        qdrant_client.recreate_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=VECTOR_SIZE,
                distance=Distance.COSINE),
        )
        folder_into_db_collection(
            '../data/KnowledgeStore',
            qdrant_client,
            CustomTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_size // 2),
            mistral_client=client,
            embedder_name=embedder_name,
            # Pass explicitly rather than relying on the default matching.
            collection_name=collection_name)
        retriever = CustomRetriever(
            qdrant_client=qdrant_client,
            client=client,
            collection_name=collection_name,
            model_name=embedder_name,
            )
        ap_K = []
        for question, expected_documents in zip(questions, related_documents):
            search_results = retriever.get_relevant_documents(
                question, limit=top_k)
            # Pause between queries -- presumably to respect the embeddings
            # API rate limit; confirm before removing.
            time.sleep(2)
            # Compare by file name: that is what the validation set lists.
            search_results = [result.payload['filename']
                              for result in search_results]
            ap_K.append(average_precision_at_k(
                search_results, expected_documents, top_k))
        # Mean of the per-question average precision at `top_k`.
        AP_K = sum(ap_K) / len(ap_K)
        # Build the row directly in column order so dtypes stay numeric.
        new_row = pd.DataFrame(
            [dict(zip(columns,
                      (AP_K, embedder_name, 'cosine', chunk_size)))],
            columns=columns)
        df_metrics = pd.concat(
            [df_metrics, new_row], ignore_index=True)
        # Persist after every configuration so a crash keeps finished runs.
        df_metrics.to_csv('metrics_mistral.csv', index=False)
    print(f'Finished {embedder_name}')