In [58]:
import nest_asyncio
nest_asyncio.apply()
import asyncio


In [36]:
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

mkdir: data: File exists
--2025-04-28 16:59:34--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 151.101.3.42, 151.101.67.42, 151.101.195.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.3.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://arxiv.org/pdf/2307.09288 [following]
--2025-04-28 16:59:34--  http://arxiv.org/pdf/2307.09288
Connecting to arxiv.org (arxiv.org)|151.101.3.42|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘data/llama2.pdf’


2025-04-28 16:59:39 (2.81 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]



In [37]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader

loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")

In [38]:
from dotenv import load_dotenv

load_dotenv()

True

In [39]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
embed_model = OpenAIEmbedding(model="text-embedding-3-small", embed_batch_size=256)

In [40]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(chunk_size=1024)
index = VectorStoreIndex.from_documents(
    documents, transformations=[splitter], emded_model=embed_model)


## Step1 : Query generation/ rewriting

The first step is to generate queries from the original query to better match the query intent, and increase precision/recall of the retrieved results. For instance, we might be able to rewrite the query into smaller queries.

We can do this by prompting ChatGPT.

In [41]:
from llama_index.core import PromptTemplate


In [42]:
query = "How do the models developed in this work compare to open-source chat models based on the benchmarks tested?"

In [43]:
queryGenPromptStr = (
    "You are a helpful assistant that generates multiple search queries based on a "
    "single input query. Generate {num_queries} search queries, one on each line, "
    "related to the following input query:\n"
    "Query: {query}\n"
    "Queries:\n"
)
QueryGenPrompt = PromptTemplate(queryGenPromptStr)

In [44]:
QueryGenPrompt

PromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['num_queries', 'query'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='You are a helpful assistant that generates multiple search queries based on a single input query. Generate {num_queries} search queries, one on each line, related to the following input query:\nQuery: {query}\nQueries:\n')

In [45]:
def generateQueries(llm, query, numQueries=4):
    queryGenPrompt = queryGenPromptStr.format(num_queries=numQueries - 1, query=query)
    response = llm.complete(queryGenPrompt)
    queries = response.text.split("\n")
    return queries

In [46]:
queries = generateQueries(llm, query, numQueries=4)

In [47]:
queries

['1. Comparison of models developed in this work to open-source chat models in terms of benchmark performance',
 '2. Evaluation of open-source chat models against models developed in this work using benchmark tests',
 '3. Analysis of differences between models developed in this work and open-source chat models in benchmark testing results']

### step 2: Perform vector search for each query 
Now we run retrieval for each query. This means that we fetch the top-k most relevant results from each vector store.

NOTE: We can also have multiple retrievers. Then the total number of queries we run is NM, where N is number of retrievers and M is number of generated queries. Hence there will also be NM retrieved lists.

In [48]:
from tqdm.asyncio import tqdm

async def runQuery(queries, retrivers):

    tasks = []
    for query in queries:
        for i, retriever in enumerate(retrivers):
            print(i, "retriver count")
            tasks.append(retriever.aretrieve(query))
    print(tasks)
    taskResults = await tqdm.gather(*tasks)
    resultsDict = {}
    for i, (query, queryResult) in enumerate(zip(queries, taskResults)):
        resultsDict[(query, i)] = queryResult

    return resultsDict

In [49]:
from llama_index.retrievers.bm25 import BM25Retriever

vector_retriever = index.as_retriever(similarity_top_k=2)

bm25_retriever = BM25Retriever.from_defaults(docstore=index.docstore, similarity_top_k=2)

In [50]:
results_dict = await runQuery(queries, [vector_retriever, bm25_retriever])

0 retriver count
1 retriver count
0 retriver count
1 retriver count
0 retriver count
1 retriver count
[<coroutine object Dispatcher.span.<locals>.async_wrapper at 0x1685e6f80>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x1685e6340>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x1685e6ff0>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x1685e7060>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x1685e7140>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x1685e71b0>]


100%|██████████| 6/6 [00:03<00:00,  1.71it/s]


In [67]:
for i,j in results_dict.items():
    print(i)
    for k in j:
        print(k)
    print("***************")

('1. Comparison of models developed in this work to open-source chat models in terms of benchmark performance', 0)
Node ID: 9ef63423-c0a7-408e-959d-bca06160b455
Text: Figure 1: Helpfulness human evaluation results for Llama 2-Chat
compared to other open-source and closed-source models. Human raters
compared model generations on ~4k prompts consisting of both single
and multi-turn prompts. The 95% confidence intervals for this
evaluation are between 1% and 2%. More details in Section 3.4.2. While
reviewing the...
Score:  0.845

Node ID: 95ecc7d2-594f-4121-bcf9-424c906374d6
Text: Figure 12: Human evaluation results for Llama 2-Chat models
compared to open- and closed-source models across ~4,000 helpfulness
prompts with three raters per prompt. The largest Llama 2-Chat model
is competitive with ChatGPT. Llama 2-Chat 70B model has a win rate of
36% and a tie rate of 31.5% relative to ChatGPT. Llama 2-Chat 70B
model outperf...
Score:  0.843

***************
('2. Evaluation of open-source ch

###  Step 3: Perform fusion

--> combine results from all the retrievers into one 
--> remove duplicate that came out along the way
--> re rank the nodes based reciprocal rank fusion
--> Sumation of 1/ (k+r)
--> Reorder nodes by highest to least

In [52]:
from typing import List
from llama_index.core.schema import NodeWithScore

def fuseResults(resultsDict, similarity_top_k=2):
    
    k = 60.0
    fusedScores = {}
    textToNode = {}

    # compute the reciprocal rand scores
    for nodesWithScore in resultsDict.values():
        for rank, nodesWithScore in enumerate(
            sorted(
                nodesWithScore, key=lambda x: x.score or 0.0, reverse=True
            )
        ):
            text = nodesWithScore.node.get_content()
            textToNode[text] = nodesWithScore.node
            if text not in fusedScores:
                fusedScores[text] = 0.0
            fusedScores[text] += 1.0 / (rank + k)
    
    # sort results
    rerankedResults = dict(sorted(fusedScores.items(), key=lambda x:x[1], reverse=True))

    rerankedNodes : List[NodeWithScore] = []
    for text, score in rerankedResults.items():
        rerankedNodes.append(NodeWithScore(node = textToNode[text], score=score))
        # print(rerankedNodes[-1])

    return rerankedNodes[:similarity_top_k]


In [53]:
finalResults = fuseResults(results_dict)

In [None]:


for n in finalResults:
   print(n.score, "\n", n.text, "\n********\n")

0.04972677595628415 
 Figure 1: Helpfulness human evaluation results for Llama
2-Chat compared to other open-source and closed-source
models. Human raters compared model generations on ~4k
prompts consisting of both single and multi-turn prompts.
The 95% confidence intervals for this evaluation are between
1% and 2%. More details in Section 3.4.2. While reviewing
these results, it is important to note that human evaluations
can be noisy due to limitations of the prompt set, subjectivity
of the review guidelines, subjectivity of individual raters,
and the inherent difficulty of comparing generations.
Figure 2: Win-rate % for helpfulness and
safety between commercial-licensed base-
lines and Llama 2-Chat, according to GPT-
4. To complement the human evaluation, we
used a more capable model, not subject to
our own guidance. Green area indicates our
model is better according to GPT-4. To remove
ties, we used win/(win + loss). The orders in
which the model responses are presented to
GPT-4 a

In [68]:
len(finalResults)

2

## Lets plug this into a retriver query engine 

In [55]:
from typing import List
from llama_index.core.schema import NodeWithScore
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever



class FusedRetriever(BaseRetriever):
    """Ensemble retriever with fusion."""

    def __init__(self, llm, retrievers: List[BaseRetriever], similarity_top_k=2):
        self._llm = llm
        self._retrievers = retrievers
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:

        queries = generateQueries(self._llm, query_bundle.query_str, numQueries=4)
        results = asyncio.run(runQuery(queries, self._retrievers))

        finalResults = fuseResults(results, similarity_top_k=self._similarity_top_k)
        return finalResults
        

        

In [56]:
from llama_index.core.query_engine import RetrieverQueryEngine

fusionRetriever = FusedRetriever(llm, [vector_retriever, bm25_retriever], similarity_top_k=2)

query_engine = RetrieverQueryEngine(fusionRetriever)


In [59]:
response = query_engine.query(query)

0 retriver count
1 retriver count
0 retriver count
1 retriver count
0 retriver count
1 retriver count
[<coroutine object Dispatcher.span.<locals>.async_wrapper at 0x1686810e0>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168681150>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x1686811c0>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168681070>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168681230>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168681310>]


100%|██████████| 6/6 [00:00<00:00,  9.70it/s]


In [61]:
str(response)

'The models developed in this work generally perform better than existing open-source models based on the helpfulness and safety benchmarks tested. They also appear to be on par with some of the closed-source models, at least according to the human evaluations conducted.'