# Summarizer - Evaluate - Proof of Concept

In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import json
import time
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm.auto import tqdm

from llm import openai_llm, extract_from_llm_output
from vectordb import (
    MilvusClientFix,
    create_schema,
    create_index_params,
)




In [3]:
nb_path = Path()

In [4]:
load_dotenv(nb_path / "../.env", verbose=True)

True

In [5]:
hn_dump_file = "hn_news.json"
lr_dump_file = "lr_news.json"

In [6]:
started_at = time.time()

## Prepare data

In [7]:
def load_stored(file_path: str) -> list:
    stored = []
    try:
        with open(file_path, "r") as fp:
            stored = json.load(fp)
    except (FileNotFoundError, json.JSONDecodeError):
        pass
    
    return stored

stored_hn = load_stored(hn_dump_file)
stored_lr = load_stored(lr_dump_file)

In [8]:
stored_documents = stored_hn + stored_lr

## Derive embeddings

In [9]:
import embeddings

In [10]:
embedding_dim = embeddings.get_dimensions()
embedding_dim

768

## LLM operations

In [11]:
openai_client = OpenAI()

For every relevant fragment find most related documents from the whole history and provide a perspective on the topic.


## Build ground truth dataset
That needs to be run only once to build the dataset.

In [12]:
import random

In [13]:
ground_truth_filename = "summarizer_ground_truth.json"
eval_subset_filename = "summarizer_eval_subset.json"

Take a subset of stored data, summarize each item using LLM into short tag-like strings, then map summaries to original IDs.

In [14]:
ground_truth_data = load_stored(ground_truth_filename)
eval_subset = load_stored(eval_subset_filename)

NUM_GROUND_TRUTH = 50

if not eval_subset:
    samples = random.sample(stored_documents, NUM_GROUND_TRUTH)
    
    uids = []
    
    # leave only unique fields
    for sample in samples:
        if sample["document_uid"] not in uids:
            uids.append(sample["document_uid"])
            eval_subset.append(sample)
            
    with open(eval_subset_filename, "w") as fp:
        json.dump(eval_subset, fp, indent=2)


Put subset of data into the vector DB for evaluation

In [15]:
collection_name = "llm_summarizer_eval"
collection_db_path = "./milvus_summarizer_eval.db"
MAX_CONTENT_VECTORIZED = 5 * 1000

milvus_client = MilvusClientFix.get_instance(collection_db_path)

if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [16]:
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    schema=create_schema("LLM Summarizer Evaluation", embedding_dim, MAX_CONTENT_VECTORIZED),
    index_params=create_index_params(milvus_client),
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
)

In [17]:
to_insert = []
for doc in tqdm(eval_subset, desc="Creating embeddings for evaluation"):
    # Transforming text into an embedding using the model
    doc["text"] = doc["content"]
    doc["vector"] = embeddings.get_embeddings(doc["content"])
    to_insert.append(doc)

Creating embeddings for evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

In [18]:
milvus_client.insert(collection_name=collection_name, data=to_insert)

{'insert_count': 50, 'ids': ['5683cef805', 'b264c1a1eb', '818ee3f8a6', 'f1c414e08b', 'f2a0638fc2', '9a1527389d', 'aef0e04585', '9350620c94', 'f998f36cf2', '960cf152c8', '8511fd5f79', '62cce97bab', 'd54169bec5', 'e305a722f3', '605b664d56', '3bcbc4117c', 'c95147bb70', 'b73841193f', 'f17d41b63b', '28d75f0ae5', '4482a3d66d', '1739a8b952', 'd3ebf64b12', '9bf61b04fa', '99dfafee8b', 'e57a204f75', '36327d036d', 'a033a57049', 'ae81cf4fc3', 'd5d3474a2b', '7161b65a09', 'baed771279', 'd6df169c03', '21b1c97963', '3210d8a096', 'adb4c4f1f9', '3614585ef1', '74529f6553', '3f7306603b', '0b6da052f8', 'e406a7ef42', '48b7491df1', 'ad6d63cae6', 'a2342caa7f', '24fd4b247d', 'd860c358e5', '3411e55fe0', '1259195d10', 'f7b07bfcb5', '8a0faadb87'], 'cost': 0}

Create ground truth dataset if not exists.

In [19]:
def build_query_prompt(text: str) -> str:
    prompt_template = """
You're the skilled specialist. Provide one or several short summaries each from 2 to 5 words for the TEXT_FRAGMENT (like tags). 
Please disregard html tags or error messages in the TEXT_FRAGMENT. If the TEXT_FRAGMENT is an error message, please disregard it.
Provide the output as json array of strings.

TEXT_FRAGMENT: {text}
""".strip()

    prompt = prompt_template.format(text=text).strip()
    return prompt

In [20]:
if not ground_truth_data:
    ground_truth_data = []
    for entry in eval_subset:
        doc_uid = entry["document_uid"]
        content = entry["content"]
        
        llm_result = openai_llm(openai_client, build_query_prompt(content))
        queries = extract_from_llm_output(llm_result)
        for query in queries:
            ground_truth_data.append({"document_uid": doc_uid, "query": query})
        
        # to avoid LLM rate limits
        time.sleep(5)
        
    with open(ground_truth_filename, "w") as fp:
        json.dump(ground_truth_data, fp, indent=2)

In [21]:
ground_truth_data[0:3]

[{'document_uid': '5683cef805', 'query': 'Hiring importance'},
 {'document_uid': '5683cef805', 'query': 'Recruitment strategy'},
 {'document_uid': '5683cef805', 'query': "Founders' time management"}]

## Evaluate retrieval efficiency
Prepare ground truth data. Then evaluate the retrieval efficiency.

In [22]:
truth_df = pd.DataFrame.from_records(ground_truth_data)
truth_df.head()


Unnamed: 0,document_uid,query
0,5683cef805,Hiring importance
1,5683cef805,Recruitment strategy
2,5683cef805,Founders' time management
3,5683cef805,Candidate experience
4,5683cef805,Startup growth


In [23]:
def hit_rate(relevance_total: list[list[bool]]) -> float:
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [24]:
def mrr(relevance_total: list[list[bool]]) -> float:
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [25]:
def evaluate(ground_truth: dict, search_function: callable) -> dict:
    """Evaluate the search function using the ground truth data
    
    Args:
        ground_truth: list of dictionaries with 'query', and 'document_uid' keys
        search_function: callable that takes a string query and returns a list of search results
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_uid = q['document_uid']
        results = search_function(q['query'])
        relevance = [d['id'] == doc_uid for d in results[0]]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [26]:
ground_truth = truth_df.to_dict(orient="records")
ground_truth

[{'document_uid': '5683cef805', 'query': 'Hiring importance'},
 {'document_uid': '5683cef805', 'query': 'Recruitment strategy'},
 {'document_uid': '5683cef805', 'query': "Founders' time management"},
 {'document_uid': '5683cef805', 'query': 'Candidate experience'},
 {'document_uid': '5683cef805', 'query': 'Startup growth'},
 {'document_uid': '5683cef805', 'query': 'Early recruiting value'},
 {'document_uid': '5683cef805', 'query': 'Hiring process efficiency'},
 {'document_uid': '5683cef805', 'query': 'Technical talent'},
 {'document_uid': '5683cef805', 'query': 'Team building'},
 {'document_uid': '5683cef805', 'query': 'Recruiter partnerships'},
 {'document_uid': 'b264c1a1eb', 'query': 'lab environments'},
 {'document_uid': 'b264c1a1eb', 'query': 'laptop setup'},
 {'document_uid': 'b264c1a1eb', 'query': 'software testing'},
 {'document_uid': 'b264c1a1eb', 'query': 'multi-distro support'},
 {'document_uid': 'b264c1a1eb', 'query': 'improving robustness'},
 {'document_uid': '818ee3f8a6', 

In [27]:
milvus_search_fn = lambda query: milvus_client.search(
    collection_name=collection_name,
    data=[
        embeddings.get_embeddings(query)
    ],
    limit=5,
    search_params={"metric_type": "IP", "params": {}},
    output_fields=["document_uid"],
)

In [28]:
evaluate(ground_truth, milvus_search_fn)

  0%|          | 0/306 [00:00<?, ?it/s]

{'hit_rate': 0.8986928104575164, 'mrr': 0.7782679738562093}

In [29]:
completed_at = time.time()

time_spent = (completed_at - started_at)

print(f"Completed at {datetime.now()}, execution took ~{int(time_spent / 60)} min")
print(f"Number of stored HN entries: {len(stored_hn)}")
print(f"Number of stored Lobste.rs entries: {len(stored_lr)}")

Completed at 2024-10-29 13:38:16.514702, execution took ~0 min
Number of stored HN entries: 220
Number of stored Lobste.rs entries: 70


In [31]:
milvus_client.close()