In [126]:
import json
import os
import time

import pandas as pd

from langchain_qdrant.qdrant import QdrantVectorStore, RetrievalMode
from mistralai import Mistral
from qdrant_client import QdrantClient
from tqdm import tqdm

from app.core.embeddings import bgem3_dense, bgem3_sparse
from app.core.settings import settings

### Part 1: generate some questions

In [129]:
data_folder = 'data/test_vault'

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# text_dicts (and everything derived from it) reproducible between runs.
files = sorted(os.listdir(data_folder))

text_dicts = []

for file in files:
    path = os.path.join(data_folder, file)
    # A sub-directory inside the vault would make open() raise, so only
    # regular files are read.
    if not os.path.isfile(path):
        continue
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the vault files are UTF-8 — confirm.
    with open(path, 'r', encoding='utf-8') as content:
        text = content.read()
    # Files with 8 characters or fewer are treated as empty and skipped.
    if len(text) > 8:
        text_dicts.append({'filename': file, 'text': text})

print(f'Valid files total: {len(text_dicts)}')

Valid files total: 335


In [9]:
# The Mistral API key is read from the environment so that it is never stored
# in the notebook. Set MISTRAL_API_KEY before running this cell; if it is
# missing, a KeyError is raised here instead of an auth failure on the first
# request.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed in the notebook history and should be revoked.
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

# Chat model used for question generation.
model = "open-mistral-nemo"

# System prompt (in Russian). Translation: "I am developing a question
# answering system and need to test whether the right text is used to obtain
# the answer; generate 10 questions about the text below in Russian and return
# them in json format {"questions":[".."]}".
system = '''я разрабатываю систему для ответа на вопросы, мне нужно протестировать, правильный ли текст используется для получения ответа
сгенерируй 10 вопросов по тексту ниже на русском языке и верни их в формате json {"questions":[".."]}'''

def generate_q(text: str) -> dict:
    """Ask the chat model for questions about ``text`` and return the decoded reply.

    The module-level ``system`` prompt is sent as the system message and
    ``text`` as the user message, using the module-level ``client`` and
    ``model``. The reply is requested as a JSON object and its content is
    decoded with ``json.loads``.

    Args:
        text: The document text the questions should be about.

    Returns:
        The decoded JSON content of the first choice in the completion.

    Raises:
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    conversation = [
        {'role': 'system', 'content': system},
        {'role': 'user', 'content': text},
    ]
    reply = client.chat.complete(
        model=model,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    raw_json = reply.choices[0].message.content
    return json.loads(raw_json)

In [83]:
for text_dict in tqdm(text_dicts, desc='Question generation'):
    # Skip texts that already carry questions, so re-running this cell after an
    # interruption resumes where it stopped instead of re-querying every file.
    # To regenerate from scratch, rebuild text_dicts first.
    if 'questions' in text_dict:
        continue
    # Pause before each request — presumably to stay under the API rate limit.
    time.sleep(5)
    # The decoded reply is stored as-is under 'questions'.
    text_dict['questions'] = generate_q(text_dict['text'])

Question generation: 100%|██████████| 335/335 [49:49<00:00,  8.62s/it]


In [104]:
text_question_dicts = []

for text in text_dicts:
    # The prompt asks for 10 questions, but the model may return fewer. Take at
    # most the first 10 that are actually present instead of indexing 0..9
    # blindly, which raised IndexError on a short reply.
    generated = text['questions'].get('questions', [])[:10]
    text_question_dicts += [{'filename': text['filename'],
                             'text': text['text'],
                             'question': question} for question in generated]

print(f'Questions total: {len(text_question_dicts)}')

Questions total: 3350


In [None]:
# # To skip the full generation step, uncomment the code below
# text_question_dicts = json.load(open('data/metrics_checkpoints/texts_with_questions.json', 'r'))
# text_question_dicts[0]

### Part 2: retrieve relevant chunks

In [123]:
# Vector store in hybrid retrieval mode, backed by the Qdrant instance at
# localhost:6333 and configured with both the dense and the sparse embedding.
qvs = QdrantVectorStore(
    QdrantClient('localhost:6333'),
    collection_name=settings.collection_name,
    retrieval_mode=RetrievalMode.HYBRID,
    embedding=bgem3_dense,
    vector_name="text-dense",
    sparse_embedding=bgem3_sparse,
    sparse_vector_name="text-sparse",
)

# Similarity search with k=10, i.e. ten documents requested per query.
retriever = qvs.as_retriever(
    search_kwargs={"k": 10},
    search_type="similarity",
)

for text_question_dict in tqdm(text_question_dicts, desc='Retrieving chunks'):
    docs = retriever.invoke(text_question_dict['question'])

    # Source path of every retrieved chunk, in the order the retriever returned them.
    sources = [doc.metadata['source'] for doc in docs]

    # The filename is looked up outside the f-string: reusing the same quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    filename = text_question_dict['filename']
    # NOTE(review): '/content/obsiminers/' is presumably the directory the vault
    # was indexed from — confirm it matches the 'source' metadata in the collection.
    target = f'/content/obsiminers/{filename}'

    # 0-based position of the first chunk coming from the target file, or -1
    # when the file is not among the retrieved chunks.
    rank = sources.index(target) if target in sources else -1

    text_question_dict.update({'chunks': sources,
                               'rank': rank})

Retrieving chunks: 100%|██████████| 3360/3360 [21:39<00:00,  2.66it/s]


In [None]:
# # To skip the full retrieval step, uncomment the code below
# text_question_dicts = json.load(open('data/metrics_checkpoints/texts_with_retrieved_chunks.json', 'r'))
# text_question_dicts[0]

### Part 3: calculate recall@k, mrr

In [132]:
df = pd.DataFrame(text_question_dicts)

metrics = {}

# 'rank' is the 0-based integer position of the target file among the retrieved
# chunks, or -1 when it was not retrieved.
ranks = df['rank']
# Reciprocal rank for every question; entries outside the top k (including the
# -1 misses) are zeroed per cutoff below.
reciprocal = 1 / (ranks + 1)

for k in (1, 3, 5, 10):
    in_top_k = ranks.between(0, k - 1)
    # MRR@k: mean reciprocal rank, counting only hits within the first k results.
    metrics[f'mrr@{k}'] = reciprocal.where(in_top_k, 0).mean()
    # recall@k: share of questions whose target file appears within the first k results.
    metrics[f'recall@{k}'] = int(in_top_k.sum()) / df.shape[0]

metrics

{'mrr@1': 0.5782089552238806,
 'recall@1': 0.5782089552238806,
 'mrr@3': 0.6520398009950248,
 'recall@3': 0.7435820895522388,
 'mrr@5': 0.663233830845771,
 'recall@5': 0.7919402985074627,
 'mrr@10': 0.6704295190713101,
 'recall@10': 0.8450746268656717}