In [None]:
# https://docs.llamaindex.ai/en/stable/examples/query_engine/pdf_tables/recursive_retriever/
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [6]:
import camelot

from llama_index.core import VectorStoreIndex
from llama_index.experimental.query_engine import PandasQueryEngine
from llama_index.core.schema import IndexNode
from llama_index.llms.google_genai import GoogleGenAI

from llama_index.readers.file import PyMuPDFReader
from typing import List

In [7]:
from llama_index.core import Settings

# Google's embedding model is exposed through LangChain, so it is wrapped in the
# LlamaIndex adapter and then registered as the notebook-wide embedding model.
google_embedding = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
embed_model = LangchainEmbedding(google_embedding)
Settings.embed_model = embed_model

# Notebook-wide default LLM; later cells that do not pass an LLM explicitly are
# expected to fall back to this one.
Settings.llm = GoogleGenAI(model='gemini-2.0-flash')

INFO:httpx:HTTP Request: GET https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash "HTTP/1.1 200 OK"
HTTP Request: GET https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash "HTTP/1.1 200 OK"


In [8]:
# Path of the PDF that every later cell works on (text chunks and tables).
file = 'billionaires.pdf'

# Read the PDF into LlamaIndex documents with the PyMuPDF-backed reader.
reader = PyMuPDFReader()
docs = reader.load_data(file_path=file)

In [16]:
def get_tables(path: str, pages: List[int]):
    """Extract the first table found on each requested page of a PDF.

    Args:
        path: Location of the PDF, passed straight to ``camelot.read_pdf``.
        pages: Page numbers to scan. Each one is converted to a string and
            handed to camelot's ``pages`` argument in its own call.

    Returns:
        A list with one DataFrame per entry in ``pages``, in the same order.
        The first extracted row is promoted to the column header, that row is
        dropped, and the remaining rows are re-indexed from 0.

    Raises:
        IndexError: If camelot detects no table on one of the pages. The
            message names the offending page and file.
    """
    table_dfs = []

    for page in pages:
        table_list = camelot.read_pdf(path, pages=str(page))

        try:
            # Only the first table detected on the page is used.
            first_table = table_list[0]
        except IndexError as exc:
            # Same exception type as before, but with enough context to see
            # which page of which file produced no table.
            raise IndexError(
                f"camelot found no table on page {page} of {path!r}"
            ) from exc

        raw_df = first_table.df
        # camelot returns the header as an ordinary first row: use it as the
        # column names, then remove it from the data.
        table_df = (
            raw_df.rename(columns=raw_df.iloc[0])
            .drop(raw_df.index[0])
            .reset_index(drop=True)
        )
        table_dfs.append(table_df)

    return table_dfs

In [25]:
# Page numbers are specific to this PDF: in the saved outputs the first entry
# yields the top-10 ranking table and the second the per-year totals table.
table_dfs = get_tables(file, [3, 34])

Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)


In [18]:
# Preview the first extracted table (the ranking of individual billionaires).
table_dfs[0]

Unnamed: 0,No.,Name,Net worth\n(USD),Age,Nationality,Primary source(s) of\nwealth
0,1,Elon Musk,$342 billion,53,South Africa\n Canada\n United\nStates,"Tesla, SpaceX"
1,2,Mark Zuckerberg,$216 billion,40,United\nStates,Meta Platforms
2,3,Jeff Bezos,$215 billion,61,United\nStates,Amazon
3,4,Larry Ellison,$192 billion,80,United\nStates,Oracle Corporation
4,5,Bernard Arnault &\nfamily,$178 billion,76,France,LVMH
5,6,Warren Buffett,$154 billion,94,United\nStates,Berkshire Hathaway
6,7,Larry Page,$144 billion,52,United\nStates,Google
7,8,Sergey Brin,$138 billion,51,United\nStates,Google
8,9,Amancio Ortega,$124 billion,89,Spain,"Inditex, Zara"
9,10,Steve Ballmer,$118 billion,69,United\nStates,Microsoft


In [26]:
# Preview the second extracted table (billionaire count and combined net worth per year).
table_dfs[1]

Unnamed: 0,Year,Number of billionaires,Group's combined net worth
0,2025[2],3028.0,$16.1 trillion
1,2024[2],2781.0,$14.2 trillion
2,2023[8],2640.0,$12.2 trillion
3,2022[7],2668.0,$12.7 trillion
4,2021[13],2755.0,$13.1 trillion
5,2020,2095.0,$8.0 trillion
6,2019,2153.0,$8.7 trillion
7,2018,2208.0,$9.1 trillion
8,2017,2043.0,$7.7 trillion
9,2016,1810.0,$6.5 trillion


In [27]:
# Reuse the LLM already registered on Settings instead of constructing a second
# client for the same model: one source of truth for the model name and no
# extra model-metadata request.
llm = Settings.llm

# One query engine per extracted table, in the same order as `table_dfs`.
# NOTE(review): PandasQueryEngine comes from the experimental package; per the
# LlamaIndex docs it runs LLM-generated pandas code against the DataFrame, so
# only use it with trusted data and prompts.
df_query_engines = [
    PandasQueryEngine(table, llm=llm) for table in table_dfs
]

INFO:httpx:HTTP Request: GET https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash "HTTP/1.1 200 OK"
HTTP Request: GET https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash "HTTP/1.1 200 OK"


In [22]:
# Ask the ranking-table engine directly. The table has no hometown column, so
# this question only works if the LLM itself maps "the man from Omaha" to a row.
response = df_query_engines[0].query("What's the net worth of the man from Omaha?")
print(response)

INFO:google_genai.models:AFC is enabled with max remote calls: 10.
AFC is enabled with max remote calls: 10.
INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
INFO:google_genai.models:AFC remote call 1 is done.
AFC remote call 1 is done.
$154 billion


In [28]:
# Ask the per-year totals engine directly.
response = df_query_engines[1].query("How many billionaires were there in 2009?")
print(response)

INFO:google_genai.models:AFC is enabled with max remote calls: 10.
AFC is enabled with max remote calls: 10.
INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
INFO:google_genai.models:AFC remote call 1 is done.
AFC remote call 1 is done.
793


In [29]:
# Split the loaded documents into nodes with the node parser configured on Settings.
doc_nodes = Settings.node_parser.get_nodes_from_documents(docs)

In [30]:
# Natural-language descriptions of the extracted tables, one per query engine
# and in the same order. This text is all that gets embedded for the table
# nodes, so it has to describe the data accurately for table questions to be
# routed to the pandas engines. The years match the extracted data (a 2025
# snapshot whose yearly table spans 2000-2025).
summaries = [
    (
        "This node provides information about the world's richest billionaires"
        " in 2025"
    ),
    (
        "This node provides information on the number of billionaires and"
        " their combined net worth from 2000 to 2025."
    ),
]

# Node ids and engine ids are both derived from list position; fail early if
# the two lists drift apart rather than routing to a missing or wrong engine.
assert len(summaries) == len(df_query_engines), (
    "each table query engine needs exactly one summary"
)

# Index nodes carry the summary text plus the id of the engine they point to.
df_nodes = [
    IndexNode(text=summary, index_id=f"pandas{i}")
    for i, summary in enumerate(summaries)
]

# Lookup from node id to the query engine that answers questions on that table.
df_id_query_engine_mapping = {
    f"pandas{i}": engine for i, engine in enumerate(df_query_engines)
}

In [31]:
# Single vector index over the raw text chunks and the table-summary nodes, so
# one similarity search can land on either kind of node.
vector_index = VectorStoreIndex(nodes=[*doc_nodes, *df_nodes])

# Only the single best match is retrieved per query.
vector_retriever = vector_index.as_retriever(similarity_top_k=1)

In [32]:
# Text-only baseline: the same chunks without the table-summary nodes. It is not
# queried in the cells shown below.
vector_index0 = VectorStoreIndex(nodes=doc_nodes)
vector_query_engine0 = vector_index0.as_query_engine()

In [33]:
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer

In [34]:
# Start every query at the combined vector retriever. Retrieved nodes whose
# index id matches a key of `df_id_query_engine_mapping` are meant to be handed
# to that table's query engine; verbose mode prints each retrieval step.
recursive_retriever = RecursiveRetriever(
    root_id="vector",
    retriever_dict={"vector": vector_retriever},
    query_engine_dict=df_id_query_engine_mapping,
    verbose=True,
)

# "compact" response mode for turning retrieved content into the final answer.
response_synthesizer = get_response_synthesizer(response_mode="compact")

# End-to-end engine: recursive retrieval followed by answer synthesis.
query_engine = RetrieverQueryEngine(
    retriever=recursive_retriever,
    response_synthesizer=response_synthesizer,
)

In [35]:
# End-to-end question through the recursive retriever. In the saved run the
# verbose trace reports a plain text node being retrieved for this query.
response = query_engine.query(
    "What's the net worth of 2025 billionaires?"
)

[1;3;34mRetrieving with query id None: What's the net worth of 2025 billionaires?
[0m[1;3;38;5;200mRetrieving text node: 6/16/25, 12:50
The World's Billionaires - Wikipedia
Page 34 of 43
https://en.wikipedia.org/wiki/The_World%27s_Billionaires
Number and combined net worth of billionaires by year[68]
Year
Number of billionaires
Group's combined net worth
2025[2]
3,028
$16.1 trillion
2024[2]
2,781
$14.2 trillion
2023[8]
2,640
$12.2 trillion
2022[7]
2,668
$12.7 trillion
2021[13]
2,755
$13.1 trillion
2020
2,095
$8.0 trillion
2019
2,153
$8.7 trillion
2018
2,208
$9.1 trillion
2017
2,043
$7.7 trillion
2016
1,810
$6.5 trillion
2015[20]
1,826
$7.1 trillion
2014[69]
1,645
$6.4 trillion
2013[70]
1,426
$5.4 trillion
2012
1,226
$4.6 trillion
2011
1,210
$4.5 trillion
2010
1,011
$3.6 trillion
2009
793
$2.4 trillion
2008
1,125
$4.4 trillion
2007
946
$3.5 trillion
2006
793
$2.6 trillion
2005
691
$2.2 trillion
2004
587
$1.9 trillion
2003
476
$1.4 trillion
2002
497
$1.5 trillion
2001
538
$1.8 trillio

In [36]:
# Show the content of the first source node attached to the response.
response.source_nodes[0].node.get_content()

"6/16/25, 12:50\nThe World's Billionaires - Wikipedia\nPage 34 of 43\nhttps://en.wikipedia.org/wiki/The_World%27s_Billionaires\nNumber and combined net worth of billionaires by year[68]\nYear\nNumber of billionaires\nGroup's combined net worth\n2025[2]\n3,028\n$16.1 trillion\n2024[2]\n2,781\n$14.2 trillion\n2023[8]\n2,640\n$12.2 trillion\n2022[7]\n2,668\n$12.7 trillion\n2021[13]\n2,755\n$13.1 trillion\n2020\n2,095\n$8.0 trillion\n2019\n2,153\n$8.7 trillion\n2018\n2,208\n$9.1 trillion\n2017\n2,043\n$7.7 trillion\n2016\n1,810\n$6.5 trillion\n2015[20]\n1,826\n$7.1 trillion\n2014[69]\n1,645\n$6.4 trillion\n2013[70]\n1,426\n$5.4 trillion\n2012\n1,226\n$4.6 trillion\n2011\n1,210\n$4.5 trillion\n2010\n1,011\n$3.6 trillion\n2009\n793\n$2.4 trillion\n2008\n1,125\n$4.4 trillion\n2007\n946\n$3.5 trillion\n2006\n793\n$2.6 trillion\n2005\n691\n$2.2 trillion\n2004\n587\n$1.9 trillion\n2003\n476\n$1.4 trillion\n2002\n497\n$1.5 trillion\n2001\n538\n$1.8 trillion\n2000\n470\n$898 billion\nSources: Forb

In [37]:
# The synthesized answer as a plain string.
str(response)

'The combined net worth of billionaires in 2025 is $16.1 trillion.\n'