In [93]:
import nest_asyncio

nest_asyncio.apply()

# <font color='blue'> Hybrid Retriever with Re rank Model </font>

### `Setting up OpenAI`

In [94]:
import os
import openai
from llama_index.llms.openai import OpenAI
from dotenv import load_dotenv
load_dotenv('myenv/.env')
a=os.environ.get('OPENAI_API_KEY')
openai.api_key=a

In [95]:
os.environ["OPENAI_API_KEY"] = a
openai.api_key = os.environ["OPENAI_API_KEY"]

### `Importing the necessary libraries`

In [96]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().handlers = []
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

In [97]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from llama_index.core.node_parser import LangchainNodeParser

# parser = LangchainNodeParser(RecursiveCharacterTextSplitter())
# nodes = parser.get_nodes_from_documents(documents)

In [98]:

# load documents
documents = SimpleDirectoryReader("examples_files").load_data()


# from llama_index.core.node_parser import SentenceSplitter

# splitter = SentenceSplitter(
#     chunk_size=524,
#     chunk_overlap=20,
# )
# nodes = splitter.get_nodes_from_documents(documents)
# print(nodes[0].text)

# len(nodes[0].text)

In [99]:
# initialize LLM + node parser
llm = OpenAI(model="gpt-4")
splitter = SentenceSplitter(chunk_size=1024)

nodes = splitter.get_nodes_from_documents(documents)

In [100]:

# initialize storage context (by default it's in-memory)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

In [101]:
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
)

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


## <font color='clue'> BM25 Retriever </font>

In [102]:
# We can pass in the index, doctore, or list of nodes to create the retriever
retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=3)

In [103]:
from llama_index.core.response.notebook_utils import display_source_node

# will retrieve context from specific companies
nodes = retriever.retrieve("What happened at Viaweb and Interleaf?")
for node in nodes:
    display_source_node(node)

**Node ID:** 76cbcc3f-3a7b-4d74-83c3-867ede6292b3<br>**Similarity:** 1.2070521577440287<br>**Text:** All that seemed left for philosophy were edge cases that people in other fields felt could safely...<br>

**Node ID:** c67175fe-1b95-4b46-b4b8-bffb8c5779f1<br>**Similarity:** 1.1590197725657112<br>**Text:** In the summer of 2016 we moved to England. We wanted our kids to see what it was like living in a...<br>

**Node ID:** 86c38d61-87d0-41dd-813f-51e7a4bc435c<br>**Similarity:** 1.0520373941953076<br>**Text:** Its brokenness did, as so often happens, generate a lot of opportunities to write papers about va...<br>

In [104]:

nodes = retriever.retrieve("What did Paul Graham do after RISD?")
for node in nodes:
    display_source_node(node)

**Node ID:** 5c052e9c-1200-497d-9c67-f54339cfda92<br>**Similarity:** 5.376672394183613<br>**Text:** The Lisp that John McCarthy invented, or more accurately discovered, is an answer to that questio...<br>

**Node ID:** 319cdb00-b0a0-495d-8f3c-a695decceb8d<br>**Similarity:** 1.1486157415517497<br>**Text:** But alas it was more like the Accademia than not. Better organized, certainly, and a lot more exp...<br>

**Node ID:** 5adabc0d-230c-4cfd-b935-7002a81672d0<br>**Similarity:** 1.1466504294464328<br>**Text:** The students and faculty in the painting department at the Accademia were the nicest people you c...<br>

## <font color='clue'>Hybrid Retriever </font>

#### `Now we will combine bm25 retriever with vector index retriever.`

In [105]:
from llama_index.core.tools import RetrieverTool

vector_retriever = VectorIndexRetriever(index)
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)

retriever_tools = [
    RetrieverTool.from_defaults(
        retriever=vector_retriever,
        description="Useful in most cases",
    ),
    RetrieverTool.from_defaults(
        retriever=bm25_retriever,
        description="Useful if searching about specific information",
    ),
]

In [106]:
from llama_index.core.retrievers import RouterRetriever

retriever = RouterRetriever.from_defaults(
    retriever_tools=retriever_tools,
    llm=llm,
    select_multi=True,
)

In [107]:
# will retrieve all context from the author's life
nodes = retriever.retrieve(
    "Can you give me all the context regarding the author's life?"
)
for node in nodes:
    display_source_node(node)

HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Selecting retriever 0: This choice seems to be the most comprehensive and would likely include information about the author's life..
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


**Node ID:** 4243ceea-498d-492f-ad31-37686e0f3c21<br>**Similarity:** 0.7896572255830281<br>**Text:** If he even knew about the strange classes I was taking, he never said anything.

So now I was in ...<br>

**Node ID:** 5adabc0d-230c-4cfd-b935-7002a81672d0<br>**Similarity:** 0.7886701355544645<br>**Text:** The students and faculty in the painting department at the Accademia were the nicest people you c...<br>

## <font color='clue'>Advanced - Hybrid Retriever + Re-Ranking </font>

In [109]:
# !curl https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf --output IPCC_AR6_WGII_Chapter03.pdf

#### `Setting up the data`

In [None]:
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    SimpleDirectoryReader,
    Document,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

# load documents
documents = SimpleDirectoryReader(
    input_files=["IPCC_AR6_WGII_Chapter03.pdf"]
).load_data()

In [None]:
llm = OpenAI(model="gpt-3.5-turbo")
splitter = SentenceSplitter(chunk_size=256)
# limit to a smaller section
nodes = splitter.get_nodes_from_documents(
    [Document(text=documents[0].get_content()[:1000000])]
)

In [None]:
# initialize storage context (by default it's in-memory)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

In [None]:
index = VectorStoreIndex(nodes, storage_context=storage_context)


HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [None]:
from llama_index.retrievers.bm25 import BM25Retriever

# retireve the top 10 most similar nodes using embeddings
vector_retriever = index.as_retriever(similarity_top_k=10)

# retireve the top 10 most similar nodes using bm25
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=10)

#### `Custom Retriever Implementation`

In [None]:
from llama_index.core.retrievers import BaseRetriever


class HybridRetriever(BaseRetriever):
    def __init__(self, vector_retriever, bm25_retriever):
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        super().__init__()

    def _retrieve(self, query, **kwargs):
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
        vector_nodes = self.vector_retriever.retrieve(query, **kwargs)

        # combine the two lists of nodes
        all_nodes = []
        node_ids = set()
        for n in bm25_nodes + vector_nodes:
            if n.node.node_id not in node_ids:
                all_nodes.append(n)
                node_ids.add(n.node.node_id)
        return all_nodes

In [None]:
index.as_retriever(similarity_top_k=5)

hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

### `Re-Ranker Setup`

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")

### `Retrieve`

In [110]:
from llama_index.core import QueryBundle

retrieved_nodes = hybrid_retriever.retrieve(
    "What is the impact of climate change on the ocean?"
)
reranked_nodes = reranker.postprocess_nodes(
    retrieved_nodes,
    query_bundle=QueryBundle(
        "What is the impact of climate change on the ocean?"
    ),
)

print("Initial retrieval: ", len(retrieved_nodes), " nodes")
print("Re-ranked retrieval: ", len(reranked_nodes), " nodes")

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it]


Initial retrieval:  20  nodes
Re-ranked retrieval:  4  nodes


In [113]:
for node in retrieved_nodes:
    display_source_node(node)

**Node ID:** 8ccb2a04-bd30-4008-97be-fd83b2e74875<br>**Similarity:** 3.733758785529062e-05<br>**Text:** Our model turned out to live just down the street from me. She made a living from a combination o...<br>

**Node ID:** 91ebf0d3-7d79-4146-94cc-40ee6d3cb88f<br>**Similarity:** 4.252616054145619e-05<br>**Text:** Plus it wasn't startup founders we wanted to reach. It was future startup founders. So I changed ...<br>

**Node ID:** 2fd6f09b-f9dd-4811-9128-bc8f3df085b7<br>**Similarity:** 3.734994606929831e-05<br>**Text:** He meant it both descriptively and prescriptively, and it was the second part that scared me. I w...<br>

**Node ID:** 849f57b7-7acc-46ef-96fd-1157e48c210b<br>**Similarity:** 9.689940634416416e-05<br>**Text:** We talked to Robert and Trevor and we agreed to make it a complete changing of the guard. Up till...<br>

**Node ID:** 38fadb12-3df9-46ce-8899-01b8f871e4fc<br>**Similarity:** 3.729146919795312e-05<br>**Text:** In principle our Viaweb stock was valuable. It was a share in a business that was profitable and ...<br>

**Node ID:** ed14fb68-3aaf-4850-b064-0400146cdec0<br>**Similarity:** 3.72881258954294e-05<br>**Text:** And as long as every change you made to McCarthy's Lisp was a discoveredness-preserving transform...<br>

**Node ID:** 10592435-cc50-472e-b6a1-00d9b5c3cef7<br>**Similarity:** 3.734289202839136e-05<br>**Text:** What I Worked On

February 2021

Before college the two main things I worked on, outside of schoo...<br>

**Node ID:** afec22a8-3680-4b6f-a3b7-f9b1160c7fff<br>**Similarity:** 0.0027530642691999674<br>**Text:** It comes with the territory. An essay must tell readers things they don't already know, and some ...<br>

**Node ID:** cfde4a65-0b3e-433a-8bef-4fb24d922ea8<br>**Similarity:** 3.734887650352903e-05<br>**Text:** [1]

The first of my friends to get a microcomputer built it himself. It was sold as a kit by Hea...<br>

**Node ID:** f00ecbf7-788c-4f3e-b080-21624c0820cb<br>**Similarity:** 3.732202821993269e-05<br>**Text:** It seemed only a matter of time before we'd have Mike, and when I saw Winograd using SHRDLU, it s...<br>

**Node ID:** e039283d-844d-431d-968e-ee652584ce5b<br>**Similarity:** 0.0009219381026923656<br>**Text:** Ghebrehiwet, S.-I.  Ito, W.  Kiessling, P .  Martinetto, E.  Ojea, 
M.-F . Racault, B.  Rost, and...<br>

**Node ID:** 4246f41d-f1a1-4619-895c-c593912048ce<br>**Similarity:** 0.0006790432380512357<br>**Text:** SPM379
3
Oceans and Coastal 
Ecosystems and Their Services
This chapter should be cited as:
Coole...<br>

**Node ID:** 16793449-be44-43b2-880b-b35e3caf6611<br>**Similarity:** 0.0006047335918992758<br>**Text:** In: Climate 
Change 2022: Impacts, Adaptation and Vulnerability. Contribution of Working Group II...<br>

**Node ID:** b92700a4-11a1-4638-8292-0de0b4c49f44<br>**Similarity:** 0.0001834242866607383<br>**Text:** Möller, A.  Okem, B.  Rama (eds.)]. Cambridge University Press, Cambridge, 
UK and New York, NY ,...<br>

**Node ID:** 67579e82-66b4-4339-8226-7a5d8afc4593<br>**Similarity:** 7.797543366905302e-05<br>**Text:** Yunus Mgaya (Tanzania), Coleen Moloney (South Africa), Aditi Mukherji (Nepal), Norma 
Patricia Mu...<br>

**Node ID:** 8f0c5213-a416-42cc-9a8b-e3b9e0b9521e<br>**Similarity:** 0.00039533976814709604<br>**Text:** Mette Skern-Mauritzen (Norway), Dawit 
Yemane Ghebrehiwet (South Africa/Eritrea)
Contributing Aut...<br>

**Node ID:** 3af53979-0fec-4893-b26c-cdb73a99c2a4<br>**Similarity:** 0.00029073379118926823<br>**Text:** 005.Coordinating Lead Authors: Sarah R. Cooley (USA) and David S. Schoeman (Australia)
Lead Autho...<br>

**Node ID:** b925ad8f-a28c-4ac8-836a-3575be252ff0<br>**Similarity:** 6.435224349843338e-05<br>**Text:** Stephanie Dutkiewicz (USA), Thomas Frölicher 
(Switzerland), Juan Diego Gaitán-Espitia (Hong Kong...<br>

**Node ID:** cafa7fc1-4b9e-478e-810a-064f83d3c3f4<br>**Similarity:** 0.0003428798518143594<br>**Text:** Pörtner, D.C.  Roberts, M.  Tignor, E.S.  Poloczanska, K.  Mintenbeck, 
A. Alegría, M.  Craig, S....<br>

**Node ID:** 0f965b9d-4c8d-4927-a1a0-13a1a306c0ae<br>**Similarity:** 0.00010701524297473952<br>**Text:** Shin-
Ichi Ito (Japan), Wolfgang Kiessling (Germany), Paulina Martinetto (Argentina), Elena Ojea ...<br>

In [112]:
for node in reranked_nodes:
    display_source_node(node)

**Node ID:** afec22a8-3680-4b6f-a3b7-f9b1160c7fff<br>**Similarity:** 0.0027530642691999674<br>**Text:** It comes with the territory. An essay must tell readers things they don't already know, and some ...<br>

**Node ID:** e039283d-844d-431d-968e-ee652584ce5b<br>**Similarity:** 0.0009219381026923656<br>**Text:** Ghebrehiwet, S.-I.  Ito, W.  Kiessling, P .  Martinetto, E.  Ojea, 
M.-F . Racault, B.  Rost, and...<br>

**Node ID:** 4246f41d-f1a1-4619-895c-c593912048ce<br>**Similarity:** 0.0006790432380512357<br>**Text:** SPM379
3
Oceans and Coastal 
Ecosystems and Their Services
This chapter should be cited as:
Coole...<br>

**Node ID:** 16793449-be44-43b2-880b-b35e3caf6611<br>**Similarity:** 0.0006047335918992758<br>**Text:** In: Climate 
Change 2022: Impacts, Adaptation and Vulnerability. Contribution of Working Group II...<br>

In [115]:
for node in reranked_nodes:
    print(node.score)

0.0027530643
0.0009219381
0.00067904324
0.0006047336
