In [21]:
import nest_asyncio

nest_asyncio.apply()

# <font color='blue'> Hybrid Retriever with Re rank Model </font>

### `Setting up OpenAI`

In [22]:
import os
import openai
from llama_index.llms.openai import OpenAI
from dotenv import load_dotenv
load_dotenv('myenv/.env')
a=os.environ.get('OPENAI_API_KEY')
openai.api_key=a

In [23]:
os.environ["OPENAI_API_KEY"] = a
openai.api_key = os.environ["OPENAI_API_KEY"]

### `Importing the necessary libraries`

In [24]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().handlers = []
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

In [25]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from llama_index.core.node_parser import LangchainNodeParser

# parser = LangchainNodeParser(RecursiveCharacterTextSplitter())
# nodes = parser.get_nodes_from_documents(documents)

In [26]:

# load documents
documents = SimpleDirectoryReader("examples_files").load_data()


# from llama_index.core.node_parser import SentenceSplitter

# splitter = SentenceSplitter(
#     chunk_size=524,
#     chunk_overlap=20,
# )
# nodes = splitter.get_nodes_from_documents(documents)
# print(nodes[0].text)

# len(nodes[0].text)

In [27]:
# initialize LLM + node parser
llm = OpenAI(model="gpt-4")
splitter = SentenceSplitter(chunk_size=1024)

nodes = splitter.get_nodes_from_documents(documents)

In [28]:

# initialize storage context (by default it's in-memory)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

In [29]:
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
)

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


## <font color='clue'> BM25 Retriever </font>

In [30]:
# We can pass in the index, doctore, or list of nodes to create the retriever
retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=3)

In [31]:
from llama_index.core.response.notebook_utils import display_source_node

# will retrieve context from specific companies
nodes = retriever.retrieve("What happened at Viaweb and Interleaf?")
for node in nodes:
    display_source_node(node)

**Node ID:** c348f024-d3da-406b-9b8b-b1eae569e906<br>**Similarity:** 1.2070521577440287<br>**Text:** All that seemed left for philosophy were edge cases that people in other fields felt could safely...<br>

**Node ID:** 3e0c2424-1021-432b-9420-449f837e9637<br>**Similarity:** 1.1590197725657112<br>**Text:** In the summer of 2016 we moved to England. We wanted our kids to see what it was like living in a...<br>

**Node ID:** 694eb866-b261-4132-ae12-49ca92d34ac0<br>**Similarity:** 1.0520373941953076<br>**Text:** Its brokenness did, as so often happens, generate a lot of opportunities to write papers about va...<br>

In [32]:

nodes = retriever.retrieve("What did Paul Graham do after RISD?")
for node in nodes:
    display_source_node(node)

**Node ID:** 60236d8e-c90b-4baa-99e0-37af1e3b267d<br>**Similarity:** 5.376672394183613<br>**Text:** The Lisp that John McCarthy invented, or more accurately discovered, is an answer to that questio...<br>

**Node ID:** b97f1c55-000d-461f-bb6c-f149d7541965<br>**Similarity:** 1.1486157415517497<br>**Text:** But alas it was more like the Accademia than not. Better organized, certainly, and a lot more exp...<br>

**Node ID:** 52cb323a-13e0-423d-9281-66d884f06d4e<br>**Similarity:** 1.1466504294464328<br>**Text:** The students and faculty in the painting department at the Accademia were the nicest people you c...<br>

## <font color='clue'>Hybrid Retriever </font>

#### `Now we will combine bm25 retriever with vector index retriever.`

In [33]:
from llama_index.core.tools import RetrieverTool

vector_retriever = VectorIndexRetriever(index)
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)

retriever_tools = [
    RetrieverTool.from_defaults(
        retriever=vector_retriever,
        description="Useful in most cases",
    ),
    RetrieverTool.from_defaults(
        retriever=bm25_retriever,
        description="Useful if searching about specific information",
    ),
]

In [34]:
from llama_index.core.retrievers import RouterRetriever

retriever = RouterRetriever.from_defaults(
    retriever_tools=retriever_tools,
    llm=llm,
    select_multi=True,
)

In [35]:
# will retrieve all context from the author's life
nodes = retriever.retrieve(
    "Can you give me all the context regarding the author's life?"
)
for node in nodes:
    display_source_node(node)

HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Selecting retriever 0: This choice is relevant as it suggests that the information provided will be useful in most cases, which could include providing context about the author's life..
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


**Node ID:** 4a1a2bd0-f6e0-4cd8-9ba1-d9af197d98af<br>**Similarity:** 0.7897447800334005<br>**Text:** If he even knew about the strange classes I was taking, he never said anything.

So now I was in ...<br>

**Node ID:** 52cb323a-13e0-423d-9281-66d884f06d4e<br>**Similarity:** 0.788768431441921<br>**Text:** The students and faculty in the painting department at the Accademia were the nicest people you c...<br>

## <font color='clue'>Advanced - Hybrid Retriever + Re-Ranking </font>

In [36]:
#  !curl https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf --output IPCC_AR6_WGII_Chapter03.pdf

#### `Setting up the data`

In [37]:
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    SimpleDirectoryReader,
    Document,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

# load documents
documents = SimpleDirectoryReader(
    input_files=["IPCC_AR6_WGII_Chapter03.pdf"]
).load_data()

In [38]:
llm = OpenAI(model="gpt-3.5-turbo")
splitter = SentenceSplitter(chunk_size=256)
# limit to a smaller section
nodes = splitter.get_nodes_from_documents(
    [Document(text=documents[0].get_content()[:1000000])]
)

In [39]:
# initialize storage context (by default it's in-memory)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

In [40]:
index = VectorStoreIndex(nodes, storage_context=storage_context)


HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [41]:
from llama_index.retrievers.bm25 import BM25Retriever

# retireve the top 10 most similar nodes using embeddings
vector_retriever = index.as_retriever(similarity_top_k=10)

# retireve the top 10 most similar nodes using bm25
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=10)

#### `Custom Retriever Implementation`

In [42]:
from llama_index.core.retrievers import BaseRetriever


class HybridRetriever(BaseRetriever):
    def __init__(self, vector_retriever, bm25_retriever):
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        super().__init__()

    def _retrieve(self, query, **kwargs):
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
        vector_nodes = self.vector_retriever.retrieve(query, **kwargs)

        # combine the two lists of nodes
        all_nodes = []
        node_ids = set()
        for n in bm25_nodes + vector_nodes:
            if n.node.node_id not in node_ids:
                all_nodes.append(n)
                node_ids.add(n.node.node_id)
        return all_nodes

In [43]:
index.as_retriever(similarity_top_k=5)

hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

### `Re-Ranker Setup`

In [44]:
from llama_index.core.postprocessor import SentenceTransformerRerank

reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")

### `Retrieve`

In [45]:
from llama_index.core import QueryBundle

retrieved_nodes = hybrid_retriever.retrieve(
    "What is the impact of climate change on the ocean?"
)
reranked_nodes = reranker.postprocess_nodes(
    retrieved_nodes,
    query_bundle=QueryBundle(
        "What is the impact of climate change on the ocean?"
    ),
)

print("Initial retrieval: ", len(retrieved_nodes), " nodes")
print("Re-ranked retrieval: ", len(reranked_nodes), " nodes")

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Initial retrieval:  11  nodes
Re-ranked retrieval:  4  nodes


In [46]:
for node in retrieved_nodes:
    display_source_node(node)

**Node ID:** 44f4d21b-edd2-467c-b7a5-cd4df6c50a86<br>**Similarity:** 0.0006790422485210001<br>**Text:** SPM379
3
Oceans and Coastal 
Ecosystems and Their Services
This chapter should be cited as:
Coole...<br>

**Node ID:** 611740ad-2a56-4666-a8cf-5d9e06d27d2c<br>**Similarity:** 0.0009219407220371068<br>**Text:** Ghebrehiwet, S.-I.  Ito, W.  Kiessling, P .  Martinetto, E.  Ojea, 
M.-F . Racault, B.  Rost, and...<br>

**Node ID:** a68f7645-8aec-4d31-a03d-9d59922d79c9<br>**Similarity:** 0.0006047315546311438<br>**Text:** In: Climate 
Change 2022: Impacts, Adaptation and Vulnerability. Contribution of Working Group II...<br>

**Node ID:** 557692f4-b233-47a7-aa61-6f36d268ec48<br>**Similarity:** 0.0003428792115300894<br>**Text:** Pörtner, D.C.  Roberts, M.  Tignor, E.S.  Poloczanska, K.  Mintenbeck, 
A. Alegría, M.  Craig, S....<br>

**Node ID:** d49e8765-8522-4609-8e25-0dc6dea6c9cd<br>**Similarity:** 0.0001834248105296865<br>**Text:** Möller, A.  Okem, B.  Rama (eds.)]. Cambridge University Press, Cambridge, 
UK and New York, NY ,...<br>

**Node ID:** d2402b65-882e-431b-9d53-bded12363d4f<br>**Similarity:** 0.00029073350015096366<br>**Text:** 005.Coordinating Lead Authors: Sarah R. Cooley (USA) and David S. Schoeman (Australia)
Lead Autho...<br>

**Node ID:** 4badd802-0b23-4d45-8e8c-a54cea306101<br>**Similarity:** 0.00010701474093366414<br>**Text:** Shin-
Ichi Ito (Japan), Wolfgang Kiessling (Germany), Paulina Martinetto (Argentina), Elena Ojea ...<br>

**Node ID:** c4391308-9d2d-4664-ad35-95bfa3aa8bed<br>**Similarity:** 0.00039533976814709604<br>**Text:** Mette Skern-Mauritzen (Norway), Dawit 
Yemane Ghebrehiwet (South Africa/Eritrea)
Contributing Aut...<br>

**Node ID:** 086f849a-ea1b-4023-9618-fd8349682a2f<br>**Similarity:** 0.0003288036386948079<br>**Text:** Bell (Australia), Julia Blanchard 
(Australia), Jessica Bolin (Australia), William W. L. Cheung (...<br>

**Node ID:** 10f0a7ca-ca55-45a6-a76e-aaf64928cdb5<br>**Similarity:** 6.43523017060943e-05<br>**Text:** Stephanie Dutkiewicz (USA), Thomas Frölicher 
(Switzerland), Juan Diego Gaitán-Espitia (Hong Kong...<br>

**Node ID:** d83cc732-d229-40d4-a486-9414e705d809<br>**Similarity:** 7.797536090947688e-05<br>**Text:** Yunus Mgaya (Tanzania), Coleen Moloney (South Africa), Aditi Mukherji (Nepal), Norma 
Patricia Mu...<br>

In [47]:
for node in reranked_nodes:
    display_source_node(node)

**Node ID:** 611740ad-2a56-4666-a8cf-5d9e06d27d2c<br>**Similarity:** 0.0009219407220371068<br>**Text:** Ghebrehiwet, S.-I.  Ito, W.  Kiessling, P .  Martinetto, E.  Ojea, 
M.-F . Racault, B.  Rost, and...<br>

**Node ID:** 44f4d21b-edd2-467c-b7a5-cd4df6c50a86<br>**Similarity:** 0.0006790422485210001<br>**Text:** SPM379
3
Oceans and Coastal 
Ecosystems and Their Services
This chapter should be cited as:
Coole...<br>

**Node ID:** a68f7645-8aec-4d31-a03d-9d59922d79c9<br>**Similarity:** 0.0006047315546311438<br>**Text:** In: Climate 
Change 2022: Impacts, Adaptation and Vulnerability. Contribution of Working Group II...<br>

**Node ID:** c4391308-9d2d-4664-ad35-95bfa3aa8bed<br>**Similarity:** 0.00039533976814709604<br>**Text:** Mette Skern-Mauritzen (Norway), Dawit 
Yemane Ghebrehiwet (South Africa/Eritrea)
Contributing Aut...<br>

In [48]:
for node in reranked_nodes:
    print(node.score)

0.0009219407
0.00067904225
0.00060473155
0.00039533977


## `Code in 1 shot`

In [49]:
from llama_index.core.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine.from_args(
    retriever=hybrid_retriever,
    node_postprocessors=[reranker],
    llm=llm,
)

response = query_engine.query(
    "What is the impact of climate change on the ocean?"
)

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [50]:
from llama_index.core.response.notebook_utils import display_response

display_response(response)

**`Final Response:`** Climate change has significant impacts on the ocean, affecting marine ecosystems and their services. These impacts can include changes in ocean temperature, sea level rise, ocean acidification, and alterations in marine biodiversity. Such changes can disrupt marine food webs, coral reef health, and the distribution of marine species, ultimately impacting the overall health and functioning of ocean ecosystems.