In [5]:
import nest_asyncio
nest_asyncio.apply()

In [6]:
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

mkdir: data: File exists
--2025-04-25 18:16:33--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 151.101.195.42, 151.101.131.42, 151.101.67.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://arxiv.org/pdf/2307.09288 [following]
--2025-04-25 18:16:33--  http://arxiv.org/pdf/2307.09288
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘data/llama2.pdf’


2025-04-25 18:16:35 (9.45 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]



In [7]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader

loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")

In [8]:
from dotenv import load_dotenv

load_dotenv()

True

In [9]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
embed_model = OpenAIEmbedding(model="text-embedding-3-small", embed_batch_size=256)

In [10]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(chunk_size=1024)
index = VectorStoreIndex.from_documents(
    documents, transformations=[splitter], emded_model=embed_model)


## Step1 : Query generation/ rewriting

The first step is to generate queries from the original query to better match the query intent, and increase precision/recall of the retrieved results. For instance, we might be able to rewrite the query into smaller queries.

We can do this by prompting ChatGPT.

In [11]:
from llama_index.core import PromptTemplate


In [12]:
query = "How do the models developed in this work compare to open-source chat models based on the benchmarks tested?"

In [13]:
queryGenPromptStr = (
    "You are a helpful assistant that generates multiple search queries based on a "
    "single input query. Generate {num_queries} search queries, one on each line, "
    "related to the following input query:\n"
    "Query: {query}\n"
    "Queries:\n"
)
QueryGenPrompt = PromptTemplate(queryGenPromptStr)

In [14]:
QueryGenPrompt

PromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['num_queries', 'query'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='You are a helpful assistant that generates multiple search queries based on a single input query. Generate {num_queries} search queries, one on each line, related to the following input query:\nQuery: {query}\nQueries:\n')

In [15]:
def generateQueries(llm, query, numQueries=4):
    queryGenPrompt = queryGenPromptStr.format(num_queries=numQueries - 1, query=query)
    response = llm.complete(queryGenPrompt)
    queries = response.text.split("\n")
    return queries

In [16]:
queries = generateQueries(llm, query, numQueries=4)

In [17]:
queries

['1. Comparison of models developed in this work to open-source chat models in terms of benchmark performance',
 '2. Evaluation of open-source chat models against models developed in this work using benchmark tests',
 '3. Analysis of differences between models developed in this work and open-source chat models in benchmark assessments']

### step 2: Perform vector search for each query 
Now we run retrieval for each query. This means that we fetch the top-k most relevant results from each vector store.

NOTE: We can also have multiple retrievers. Then the total number of queries we run is NM, where N is number of retrievers and M is number of generated queries. Hence there will also be NM retrieved lists.

In [18]:
from tqdm.asyncio import tqdm

async def runQuery(queries, retrivers):

    tasks = []
    for query in queries:
        for i, retriever in enumerate(retrivers):
            print(i, "retriver count")
            tasks.append(retriever.aretrieve(query))
    print(tasks)
    taskResults = await tqdm.gather(*tasks)
    resultsDict = {}
    for i, (query, queryResult) in enumerate(zip(queries, taskResults)):
        resultsDict[(query, i)] = queryResult

    return resultsDict

In [19]:
from llama_index.retrievers.bm25 import BM25Retriever

vector_retriever = index.as_retriever(similarity_top_k=2)

bm25_retriever = BM25Retriever.from_defaults(docstore=index.docstore, similarity_top_k=2)

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
results_dict = await runQuery(queries, [vector_retriever, bm25_retriever])

0 retriver count
1 retriver count
0 retriver count
1 retriver count
0 retriver count
1 retriver count
[<coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168a25230>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168a252a0>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168a25310>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168a25380>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168a253f0>, <coroutine object Dispatcher.span.<locals>.async_wrapper at 0x168a25460>]


100%|██████████| 6/6 [00:04<00:00,  1.30it/s]


In [21]:
results_dict.values()

dict_values([[NodeWithScore(node=TextNode(id_='492ad3a7-b1a9-44aa-b219-520131d21a6e', embedding=None, metadata={'total_pages': 77, 'file_path': './data/llama2.pdf', 'source': '3'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='3a269569-d844-44c7-b81e-f67a8f500c5f', node_type='4', metadata={'total_pages': 77, 'file_path': './data/llama2.pdf', 'source': '3'}, hash='b21f6b597af7703d0e85ce9247c719566bbfd7e16fc4c0750511ef1d1f0dbe9b')}, metadata_template='{key}: {value}', metadata_separator='\n', text='Figure 1: Helpfulness human evaluation results for Llama\n2-Chat compared to other open-source and closed-source\nmodels. Human raters compared model generations on ~4k\nprompts consisting of both single and multi-turn prompts.\nThe 95% confidence intervals for this evaluation are between\n1% and 2%. More details in Section 3.4.2. While reviewing\nthese results, it is important to note that human evaluati

###  Step 3: Perform fusion

--> combine results from all the retrievers into one 
--> remove duplicate that came out along the way
--> re rank the nodes based reciprocal rank fusion
--> Sumation of 1/ (k+r)
--> Reorder nodes by highest to least

In [22]:
from typing import List
from llama_index.core.schema import NodeWithScore

def fuseResults(resultsDict, similarity_top_k=2):
    
    k = 60.0
    fusedScores = {}
    textToNode = {}

    # compute the reciprocal rand scores
    for nodesWithScore in resultsDict.values():
        for rank, nodesWithScore in enumerate(
            sorted(
                nodesWithScore, key=lambda x: x.score or 0.0, reverse=True
            )
        ):
            text = nodesWithScore.node.get_content()
            textToNode[text] = nodesWithScore.node
            if text not in fusedScores:
                fusedScores[text] = 0.0
            fusedScores[text] += 1.0 / (rank + k)
    
    # sort results
    rerankedResults = dict(sorted(fusedScores.items(), key=lambda x:x[1], reverse=True))

    rerankedNodes : List[NodeWithScore] = []
    for text, score in rerankedResults.items():
        rerankedNodes.append(NodeWithScore(node = textToNode[text], score=score))
        # print(rerankedNodes[-1])

    return rerankedNodes[:similarity_top_k]


In [23]:
finalResults = fuseResults(results_dict)

In [26]:

 for n in finalResults:
    print(n.score, "\n", n.text, "\n********\n")

0.04972677595628415 
 Figure 1: Helpfulness human evaluation results for Llama
2-Chat compared to other open-source and closed-source
models. Human raters compared model generations on ~4k
prompts consisting of both single and multi-turn prompts.
The 95% confidence intervals for this evaluation are between
1% and 2%. More details in Section 3.4.2. While reviewing
these results, it is important to note that human evaluations
can be noisy due to limitations of the prompt set, subjectivity
of the review guidelines, subjectivity of individual raters,
and the inherent difficulty of comparing generations.
Figure 2: Win-rate % for helpfulness and
safety between commercial-licensed base-
lines and Llama 2-Chat, according to GPT-
4. To complement the human evaluation, we
used a more capable model, not subject to
our own guidance. Green area indicates our
model is better according to GPT-4. To remove
ties, we used win/(win + loss). The orders in
which the model responses are presented to
GPT-4 a