# Summarizer - User Interface - Proof of Concept

In [None]:
%load_ext autoreload

%autoreload 2

In [41]:
import json
from datetime import date, datetime
from functools import partial
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI
from tqdm.auto import tqdm

from llm import NoRelevantDataFoundError, extract_from_llm_output, openai_llm, rag
from vectordb import MilvusClientFix, milvus_search


In [2]:
# Base directory used to resolve relative paths in this notebook.
# Path() with no arguments is the relative path "." (the current working directory).
nb_path = Path()

In [3]:
# Load environment variables from the .env file one level above nb_path.
# NOTE(review): presumably this provides the credentials for the OpenAI client — confirm.
load_dotenv(nb_path / "../.env", verbose=True)

True

## Configuration and initialization

In [24]:
# JSON dump files with the original posts; both are read with load_stored() further
# down so that summaries can be linked back to their title and URL.
hn_dump_file = "hn_news.json"
lr_dump_file = "lr_news.json"

# Collection name bound into the search function, and the database path handed to
# MilvusClientFix.get_instance().
collection_name = "llm_summarizer_poc"
collection_db_path = "./milvus_summarizer.db"

In [5]:
# Obtain the Milvus client for the database at collection_db_path.
# NOTE(review): get_instance() suggests a shared client instance — confirm in vectordb.
milvus_client = MilvusClientFix.get_instance(collection_db_path)

In [6]:
# OpenAI client created with default arguments.
# NOTE(review): presumably picks up its API key from the environment — confirm.
openai_client = OpenAI()

In [7]:
# Pre-bind the client and collection name so callers only pass the per-query
# keyword arguments (query, num_results, start_dt, end_dt).
milvus_search_fn = partial(milvus_search, milvus_client, collection_name)

In [8]:
# Pre-bind the OpenAI client so the LLM function can be called with just a prompt.
openai_llm_fn = partial(openai_llm, openai_client)

In [9]:
def build_summary_prompt(query: str, search_results: list[dict]) -> str:
    """Build the LLM prompt that asks for a per-fragment summary and relevance flag.

    Every search result is rendered as ``FRAGMENT_<document_uid>: <text>`` and the
    fragments are placed in the CONTEXT section of the prompt, after the QUERY.

    Args:
        query: The topic the summaries should relate to.
        search_results: Documents to summarize; each must provide the
            ``document_uid`` and ``text`` keys.

    Returns:
        The complete prompt with leading and trailing whitespace removed.

    Raises:
        KeyError: If a search result lacks ``document_uid`` or ``text``.
    """
    # The relevance instruction refers to QUERY (not "KEYWORDS"), because QUERY is
    # the only section the prompt actually contains.
    prompt_template = """
You're the skilled specialist. Summarize the most important points from the CONTEXT that might be useful or interesting for a specialist and related to  QUERY. 
Use only the facts from the CONTEXT when finding relevancy but provide some comparative summary with the state-of-the-arts if possible.
If the context fragment does not have close relation to the query, provide a short note why a fragment is not relevant.
Provide the output as JSON with the list of dictionaries with the following fields: fragment_id, summary, is_relevant. Value in is_relevant should be True if the fragment is relevant to the QUERY and False otherwise.

QUERY: {query}

CONTEXT: 
{context}
""".strip()

    # Each fragment is followed by a blank line; the final strip() removes the
    # separator after the last fragment.
    context = "".join(
        f"FRAGMENT_{doc['document_uid']}: {doc['text']}\n\n" for doc in search_results
    )

    return prompt_template.format(query=query, context=context).strip()


In [10]:
# Parameters of the first experiment.
# NOTE(review): None for both dates presumably means "no date filtering" — confirm in rag/milvus_search.
query = "Data Engineering"
start_date = None
end_date = None

In [11]:
# Single-shot RAG: one search, one prompt, one LLM call for the query above.
# NOTE(review): judging by the saved output, the result is the raw LLM answer as a string — confirm in llm.rag.
rag_summary = rag(query, build_summary_prompt, openai_llm_fn, milvus_search_fn, start_dt=start_date, end_dt=end_date)

In [12]:
# Show the answer exactly as returned, before any parsing.
rag_summary

'```json\n[\n    {\n        "fragment_id": "FRAGMENT_47fa612996",\n        "summary": "The trend of automated reasoning in software development, particularly at AWS, shows that formally verified code often outperforms unverified counterparts. Automated reasoning enhances system performance by verifying code correctness under various scenarios, thereby boosting developer confidence in optimization. This approach addresses the complexities and potential bugs in large-scale systems, replacing traditional testing methods with logical proofs of correctness. By moving towards a mathematical view of system specifications, it allows for handling complex, high-scale environments effectively.",\n        "is_relevant": true\n    },\n    {\n        "fragment_id": "FRAGMENT_37278e50d8",\n        "summary": "This fragment discusses a fork of OpenTofu that uses CUE as a substitute for HCL in tools like Terraform and Helmfiles. While CUE may relate to data configuration, it lacks a direct connection t

For every relevant fragment, find the most related documents from the whole history and provide a perspective on the topic.


In [16]:
# Convert the raw LLM answer into Python objects; the saved output shows a list of
# dicts with fragment_id, summary and is_relevant.
rag_cleaned = extract_from_llm_output(rag_summary)
rag_cleaned

[{'fragment_id': 'FRAGMENT_47fa612996',
  'summary': 'The trend of automated reasoning in software development, particularly at AWS, shows that formally verified code often outperforms unverified counterparts. Automated reasoning enhances system performance by verifying code correctness under various scenarios, thereby boosting developer confidence in optimization. This approach addresses the complexities and potential bugs in large-scale systems, replacing traditional testing methods with logical proofs of correctness. By moving towards a mathematical view of system specifications, it allows for handling complex, high-scale environments effectively.',
  'is_relevant': True},
 {'fragment_id': 'FRAGMENT_37278e50d8',
  'summary': 'This fragment discusses a fork of OpenTofu that uses CUE as a substitute for HCL in tools like Terraform and Helmfiles. While CUE may relate to data configuration, it lacks a direct connection to the core aspects of data engineering such as data integration, pr

In [38]:
from more_itertools import chunked

# Default number of search results requested by rag_batched (its num_results default).
MAX_SCOPE = 100
# Default number of search results placed into one prompt (rag_batched's batch_size default).
BATCH_SIZE = 10

def rag_batched(
    query: str,
    prompt_fn: callable,
    llm_fn: callable,
    search_fn: callable,
    num_results: int = MAX_SCOPE,
    batch_size: int = BATCH_SIZE,
    start_dt: datetime | None = None,
    end_dt: datetime | None = None
) -> list[dict]:
    """Return the relevant answers built using RAG over batches of search results.

    The goal is to keep relevant documents and disregard irrelevant ones. A large
    number of documents is fetched in one search and split into batches; every
    batch is summarized by the LLM separately. The search results are assumed to
    be ordered by embedding distance, so once two consecutive batches contain no
    relevant fragment the remaining batches are skipped.

    The LLM response for a batch should be a list of dictionaries with at least
    an "is_relevant" field; items without that field are treated as not relevant.

    Args:
        query: The user query.
        prompt_fn: Called as ``prompt_fn(query, batch)`` to build the prompt.
        llm_fn: Called as ``llm_fn(prompt)`` to get the raw LLM answer.
        search_fn: Called with the keyword arguments ``query``, ``num_results``,
            ``start_dt`` and ``end_dt`` to fetch the documents.
        num_results: Number of documents requested from the search.
        batch_size: Number of documents sent to the LLM in one prompt.
        start_dt: Passed through to ``search_fn``.
        end_dt: Passed through to ``search_fn``.

    Returns:
        The items flagged as relevant by the LLM, in the order they were produced.

    Raises:
        NoRelevantDataFoundError: If the search returns nothing.
    """
    search_results = search_fn(
        query=query,
        num_results=num_results,
        start_dt=start_dt,
        end_dt=end_dt
    )

    if not search_results:
        raise NoRelevantDataFoundError("No relevant results found.")

    prev_batch_relevant = True
    relevant_results = []
    for batch in tqdm(chunked(search_results, batch_size)):
        prompt = prompt_fn(query, batch)
        answer = llm_fn(prompt)

        cleaned = extract_from_llm_output(answer)
        # .get() tolerates an LLM item that omits the flag instead of aborting the run.
        batch_relevant = [item for item in cleaned if item.get("is_relevant")]
        relevant_results.extend(batch_relevant)

        # Stop only when this batch AND the one right before it had nothing relevant.
        if not batch_relevant and not prev_batch_relevant:
            break

        # Track the outcome of the current batch; a relevant batch resets the streak
        # so that only *consecutive* irrelevant batches end the loop.
        prev_batch_relevant = bool(batch_relevant)

    return relevant_results

In [28]:
# Batched RAG for the same query and date range as the single-shot run; uses the
# default num_results and batch_size of rag_batched.
extended_summary = rag_batched(query, build_summary_prompt, openai_llm_fn, milvus_search_fn, start_dt=start_date, end_dt=end_date)

extended_summary

0it [00:00, ?it/s]

[{'fragment_id': '47fa612996',
  'summary': "Automated reasoning has improved the performance and maintainability of AWS's complex distributed systems, allowing for more efficient bug fixes and optimizations beyond traditional testing methods. This approach is especially useful for large-scale, fault-tolerant architectures, enhancing system correctness and developer confidence.",
  'is_relevant': True},
 {'fragment_id': '0ddfee1b01',
  'summary': 'The Supergraph Manifesto outlines an architecture framework for API integration and federated data access, emphasizing self-service platforms for data access. It relates to data engineering in the context of building accessible data architectures and efficient data sharing practices.',
  'is_relevant': True},
 {'fragment_id': 'ad243e2f3b',
  'summary': "The data breach at the Internet Archive highlights significant issues in data security practices, revealing vulnerabilities in log management and user credential protection. The incident, whic

In [36]:
# Number of fragments the LLM flagged as relevant across all processed batches.
len(extended_summary)

33

In [54]:
# add original references to the data found
def format_extended_summary(query: str, extended_summary: list[dict], original_data: list[dict]) -> str:
    """Render the relevant summaries as Markdown with links to the original posts.

    Each summary entry is matched to an original document by its fragment id (an
    optional "FRAGMENT_" prefix is ignored). Entries without a matching document
    are skipped, and each URL is listed only once (the first summary wins).

    Args:
        query: The query the summaries were produced for.
        extended_summary: Entries with the "fragment_id" and "summary" keys.
        original_data: Original documents with the "document_uid", "url" and
            "title" keys.

    Returns:
        Markdown text with one linked title and summary per post, or a
        "No relevant data found." message when there is nothing to show.
    """
    no_data = f"Query: **{query}**\n\nNo relevant data found."
    if not extended_summary:
        return no_data

    # Index the documents once instead of scanning the whole list per entry.
    # setdefault keeps the first document for a duplicated uid, like a linear scan would.
    docs_by_uid = {}
    for doc in original_data:
        docs_by_uid.setdefault(doc.get("document_uid"), doc)

    seen_urls = set()
    posts = []
    for entry in extended_summary:
        # removeprefix is a no-op when the prefix is absent, so no startswith check is needed.
        doc_uid = str(entry["fragment_id"]).removeprefix("FRAGMENT_")
        doc = docs_by_uid.get(doc_uid)
        if doc is None:
            continue

        ref = doc["url"]
        if ref in seen_urls:
            continue

        seen_urls.add(ref)
        posts.append(f"[{doc['title']}]({ref})\n\n{entry['summary']}\n\n")

    # Avoid announcing "posts found" above an empty list when nothing could be matched.
    if not posts:
        return no_data

    return f"Query: **{query}**\n\nThe following posts found:\n\n" + "".join(posts)


In [55]:
def load_stored(file_path: str) -> list:
    """Load previously dumped data from a JSON file.

    Loading is best-effort: a missing file or a file with invalid JSON yields an
    empty list, but a warning is emitted so the problem does not go unnoticed.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty list if the file could not be loaded.
    """
    import warnings

    try:
        # JSON is read as UTF-8 explicitly so the result does not depend on the locale.
        with open(file_path, "r", encoding="utf-8") as fp:
            return json.load(fp)
    except (FileNotFoundError, json.JSONDecodeError) as exc:
        warnings.warn(f"Could not load stored data from {file_path!r}: {exc}", stacklevel=2)
        return []

# Combined pool of original posts from both dumps; passed to format_extended_summary
# to look up the title and URL of each summarized fragment.
stored_data = load_stored(hn_dump_file) + load_stored(lr_dump_file)

## The working example

In [60]:
# Parameters of the working example: a new query restricted to a date window.
# Note that `query` is reassigned here, and the dates use the names start_dt/end_dt
# (the earlier experiment used start_date/end_date).
query = "physics challenges"
start_dt = datetime(2024, 10, 28, 0, 0)
end_dt = datetime(2024, 11, 1, 0, 0)

In [61]:
from IPython.display import display, Markdown

# Run the batched RAG for the date-restricted query, link the relevant summaries
# back to the stored posts, and build the Markdown report.
out = format_extended_summary(
    query,
    rag_batched(query, build_summary_prompt, openai_llm_fn, milvus_search_fn, start_dt=start_dt, end_dt=end_dt),
    stored_data
)

# Render the report as rich Markdown instead of printing the raw string.
display(Markdown(out))

0it [00:00, ?it/s]

Query: **physics challenges**

The following posts found:

[Becoming physically immune to brute-force attacks](https://seirdy.one/posts/2021/01/12/password-strength/)

This fragment discusses the relationship between thermal physics, cosmology, and computer science in determining password strength against brute-force attacks. It introduces the concept of the 'Mother of All Computers' (MOAC), which serves as a theoretical limit for computational power based on mass-energy conservation. The discussion emphasizes the need for updated recommendations for password strength to resist future computational advances, particularly from supercomputers and quantum computers using Grover's algorithm.

