# Recursive Retriever + Query Engine

- The concept of recursive retrieval is that we not only explore the directly most relevant nodes, but also explore `node relationships to additional retrievers/query engines and execute them`.
- For instance, a node may represent a concise summary of a structured table, and link to a SQL/Pandas query engine over that structured table. Then if the node is retrieved, we want to also query the underlying query engine for the answer.

# When is it useful?

 - documents with hierarchical relationships
 - documents that contain both `text and a variety of embedded structured tables`, for example

In [33]:
from typing import List
import yaml, os, camelot
from pathlib import Path
from llama_index.schema import IndexNode
from llama_index import Document, SummaryIndex
from llama_index.llms import AzureOpenAI, OpenAI
from llama_index.llm_predictor import LLMPredictor
from llama_index import set_global_service_context
from llama_index.retrievers import RecursiveRetriever
from llama_hub.file.pymu_pdf.base import PyMuPDFReader
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.text_splitter import TokenTextSplitter
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.query_engine import PandasQueryEngine, RetrieverQueryEngine
from llama_index import ServiceContext, load_index_from_storage, StorageContext, VectorStoreIndex

In [2]:
# !wget "https://www.dropbox.com/scl/fi/waoz9bo9yiemnhnqvu0cc/billionaires_page.pdf?rlkey=4i08msa7zr1lpnuq2y1vs2xgw&dl=1" -O data/billionaires_page.pdf

# Configure LLMs

In [3]:
# Load API keys and endpoint settings used by the LLM configuration below.
# NOTE(review): 'cadentials.yaml' looks like a misspelling of
# 'credentials.yaml'; it is left untouched because it has to match the file
# on disk — confirm.
# The encoding is given explicitly so parsing does not depend on the
# platform's default text encoding.
with open('cadentials.yaml', encoding='utf-8') as f:
    # NOTE(review): the keys read from this mapping are plain strings, so
    # yaml.safe_load would presumably be sufficient and is the stricter
    # loader — confirm the file holds no custom tags before switching.
    credentials = yaml.load(f, Loader=yaml.FullLoader)

In [4]:
# Backend selector: 'AZURE' routes through Azure OpenAI; any other value
# uses the plain OpenAI client.
llm_flag = 'DIRECT'
use_azure = llm_flag == 'AZURE'

# Embedding model shared by both backends.
# NOTE(review): device='mps' presumably targets Apple-Silicon GPUs — confirm
# before running this cell on other hardware.
embedding_llm = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5", device='mps')

if use_azure:
    llm = AzureOpenAI(
        model=credentials['AZURE_ENGINE'],
        api_key=credentials['AZURE_OPENAI_API_KEY'],
        deployment_name=credentials['AZURE_DEPLOYMENT_ID'],
        api_version=credentials['AZURE_OPENAI_API_VERSION'],
        azure_endpoint=credentials['AZURE_OPENAI_API_BASE'],
        temperature=0.3,
    )
    chat_llm = LLMPredictor(llm)
    # The Azure model is wrapped in a predictor, so it is handed over
    # through the `llm_predictor` keyword.
    llm_kwargs = {"llm_predictor": chat_llm}
else:
    chat_llm = OpenAI(api_key=credentials['DEMO_OPENAI_API_KEY'], temperature=0.3)
    # The bare OpenAI client is handed over through the `llm` keyword.
    llm_kwargs = {"llm": chat_llm}

# Token-based chunking: 1024-token chunks with a 20-token overlap.
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=1024,
    chunk_overlap=20,
    backup_separators=["\n"],
)

# Both backends share the splitter and embedding model; only the LLM keyword
# differs, which is why it is carried in `llm_kwargs`.
service_context = ServiceContext.from_defaults(
    text_splitter=text_splitter,
    embed_model=embedding_llm,
    **llm_kwargs,
)

# Register the context globally.
set_global_service_context(service_context)

# Load Documents

In [15]:
data_path = './data/billionaires_page.pdf'

In [16]:
reader = PyMuPDFReader()
docs = reader.load(data_path)
len(docs)

33

In [14]:
def get_tables(path: str, pages: List[int]):
    """Extract the first table found on each requested PDF page.

    Args:
        path: Location of the PDF file, passed straight to
            ``camelot.read_pdf``.
        pages: Page numbers to scan. Each one is converted with ``str``
            before being handed to camelot.

    Returns:
        A list with one DataFrame per entry in ``pages``, in the same order.
        The first extracted row of each table is promoted to the column
        header and removed from the data; the remaining rows are re-indexed
        from zero.

    Raises:
        IndexError: If camelot finds no table on one of the pages.
    """
    table_dfs = []
    for page in pages:
        found = camelot.read_pdf(path, pages=str(page))
        # Indexing an empty result would raise a bare "index out of range";
        # keep the same exception type but say which page had no table.
        if len(found) == 0:
            raise IndexError(f"no table found on page {page} of {path!r}")
        # Only the first table detected on the page is used.
        raw = found[0].df
        header = raw.iloc[0]
        table_dfs.append(
            raw.rename(columns=header)
            .drop(raw.index[0])
            .reset_index(drop=True)
        )
    return table_dfs

In [18]:
table_dfs = get_tables(
                        data_path, 
                        pages=[3, 25]
                        )
len(table_dfs)

2

In [19]:
table_dfs[0]

Unnamed: 0,No.,Name,Net worth\n(USD),Age,Nationality,Primary source(s) of wealth
0,1,Bernard Arnault &\nfamily,$211 billion,74,France,LVMH
1,2,Elon Musk,$180 billion,51,United\nStates,"Tesla, SpaceX, X Corp."
2,3,Jeff Bezos,$114 billion,59,United\nStates,Amazon
3,4,Larry Ellison,$107 billion,78,United\nStates,Oracle Corporation
4,5,Warren Buffett,$106 billion,92,United\nStates,Berkshire Hathaway
5,6,Bill Gates,$104 billion,67,United\nStates,Microsoft
6,7,Michael Bloomberg,$94.5 billion,81,United\nStates,Bloomberg L.P.
7,8,Carlos Slim & family,$93 billion,83,Mexico,"Telmex, América Móvil, Grupo\nCarso"
8,9,Mukesh Ambani,$83.4 billion,65,India,Reliance Industries
9,10,Steve Ballmer,$80.7 billion,67,United\nStates,Microsoft


In [20]:
table_dfs[1]

Unnamed: 0,Year,Number of billionaires,Group's combined net worth
0,2023[2],2640.0,$12.2 trillion
1,2022[6],2668.0,$12.7 trillion
2,2021[11],2755.0,$13.1 trillion
3,2020,2095.0,$8.0 trillion
4,2019,2153.0,$8.7 trillion
5,2018,2208.0,$9.1 trillion
6,2017,2043.0,$7.7 trillion
7,2016,1810.0,$6.5 trillion
8,2015[18],1826.0,$7.1 trillion
9,2014[67],1645.0,$6.4 trillion


# Create Pandas Query Engines

In [21]:
# One PandasQueryEngine per extracted table, in the same order as
# `table_dfs`, so list positions identify the table.
df_query_engines = [
    PandasQueryEngine(frame, service_context=service_context)
    for frame in table_dfs
]

#### sample querying

In [22]:
response = df_query_engines[0].query(
    "What's the net worth of the second richest billionaire in 2023?"
)
print(str(response))

$180 billion


In [23]:
response = df_query_engines[1].query(
    "How many billionaires were there in 2009?"
)
print(str(response))

14    793
Name: Number of billionaires, dtype: object


# Build Vector Index

In [25]:
len(docs)

33

In [27]:
doc_nodes = service_context.node_parser.get_nodes_from_documents(docs)
len(doc_nodes)

40

In [44]:
# Natural-language descriptions of the tables, listed in table order.
# Each one becomes the text of an IndexNode below.
summaries = [
    (
        "This node provides information about the world's richest billionaires"
        " in 2023"
    ),
    (
        "This node provides information on the number of billionaires and"
        " their combined net worth from 2000 to 2023."
    ),
]

# Summary nodes carry an id of the form "table <position>".
df_nodes = [
    IndexNode(text=description, index_id=f"table {position}")
    for position, description in enumerate(summaries)
]

# The same "table <position>" ids key the query engines, so a summary node's
# index_id can be looked up here to find the engine for that table.
df_id_query_engine_mapping = dict(
    (f"table {position}", engine)
    for position, engine in enumerate(df_query_engines)
)

In [45]:
# Index the document chunks together with the table-summary nodes, so either
# kind can be returned for a query.
vector_index = VectorStoreIndex([*doc_nodes, *df_nodes])
# similarity_top_k=1: the retriever is asked for a single best match.
vector_retriever = vector_index.as_retriever(similarity_top_k=1)

# Build Query Engine

In [46]:
# Synthesizer that turns the retrieved nodes into the final answer.
response_synthesizer = get_response_synthesizer(response_mode="compact")

# "vector" is both the first argument and the only key of `retriever_dict`,
# so retrieval starts at the vector retriever. Table ids are resolved
# through `query_engine_dict`; verbose=True prints the retrieval trace.
recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever},
    query_engine_dict=df_id_query_engine_mapping,
    verbose=True,
)

query_engine = RetrieverQueryEngine.from_args(
    recursive_retriever,
    response_synthesizer=response_synthesizer,
)

# Querying

In [47]:
response = query_engine.query(
            "What's the net worth of the second richest billionaire in 2023?"
)

[1;3;34mRetrieving with query id None: What's the net worth of the second richest billionaire in 2023?
[0m[1;3;38;5;200mRetrieved node with id, entering: table 0
[0m[1;3;34mRetrieving with query id table 0: What's the net worth of the second richest billionaire in 2023?
[0m

[1;3;32mGot response: $180 billion
[0m

In [48]:
print(response.source_nodes[0])

Node ID: a2ad14ec-31a6-4813-a828-2ea95b24dad1
Text: Query: What's the net worth of the second richest billionaire in
2023? Response: $180 billion
Score:  0.761

