## Tool Selection - PoC
Here, we will use the LLM to select the most relevant tables (up to 3) out of a filtered set of (at most) 20 candidate tables.

Let's get started.

In [1]:
from pathlib import Path
import os

# Remember the directory the kernel started in the first time this cell runs.
# Without this guard, re-running the cell would resolve `parents[1]` against the
# already-changed working directory and climb two more levels on every re-run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = Path().absolute()

# Two levels above the start directory; presumably the project root, so that the
# `src` package imported in the following cells can be found — confirm layout.
root = NOTEBOOK_DIR.parents[1]
os.chdir(root)

In [2]:
from src.helpers.hybrid_retrieval import HybridRetrieval

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Cut-offs for the two retrieval stages. Their exact semantics live in
# HybridRetrieval and are not visible from this notebook — TODO confirm.
retriever_kwargs = {"top_k_stage_1": 200, "top_k_stage_2": 20}
retriever = HybridRetrieval(**retriever_kwargs)

In [4]:
# 4) Query
# Exactly one query is active at a time. The other lines are candidate queries
# kept for manual experimentation: to try one, uncomment it and comment out the
# currently active assignment.
# question = "does the fish purr like a cat?"

# question = "What's the electricity generation mix in Ireland? for renewable and non-renewable energy?"
# question = "What is the share of renewable energy in Ireland?"
# question = "What's the breakup between renewable and non-renewable energy production in Ireland?"
# question = "Ireland rural vs urban population mix"
# question = "What are ireland's top exports?"
# question = "What are Prodcom sales for skincare beauty and makeup products in 2023?"
# question = "Give me a breakup of Ireland's share of transportation sector."
# question = "Sold productions - quantity (kg) for beauty, makeup and skincare preparations in Ireland in 2023?"
question = "beauty makeup & skincare production in prodcom data for ireland in 2023"
# question = "PRODCOM production quantity."
# question = "What's the mining and quarrying production in Ireland?"
# question = "nano cellulose production in ireland"
# question = "give me pharmaceuticals production in ireland"
# question = "what are the different types of pharmaceuticals products produced in ireland"


# Run the retrieval for the active query and display the raw result.
response = retriever.search(query=question)
response

I0000 00:00:1755786910.045979 1337384 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers
                                                     

Unnamed: 0,id,stage_1_score,stage_2_score
0,CPM13,8.789187,0.460212
1,PCA23,4.585402,0.328005
2,IAIP13,7.194671,0.396397
3,CPM16,7.114656,0.444113
4,CPM18,7.114656,0.436573
...,...,...,...
71,NSA88,4.598532,0.422409
72,NSA97,4.598532,0.429908
73,MIP26,4.596580,0.407813
74,MIP10,4.577620,0.408266


In [5]:
# Order the retrieval result by its stage-2 score, smallest first, and keep the
# ids of the first 20 rows.
# NOTE(review): this treats a lower stage_2_score as a better match — confirm.
ranked_by_stage_2 = response.sort_values(by="stage_2_score", ascending=True)
top_20_table_ids = ranked_by_stage_2.iloc[:20]["id"].tolist()

In [6]:
# Display the candidate table ids that will be offered to the LLM.
top_20_table_ids

['PCA23',
 'IPEADS15',
 'ICA234',
 'IPEADS02',
 'PCA02',
 'PCA09',
 'IAIP14',
 'PCA22',
 'IPEADS10',
 'PCA17',
 'PCA18',
 'PCA19',
 'PCA15',
 'PCA03',
 'TRA168',
 'TRA174',
 'TRA172',
 'MIP23',
 'PCA05',
 'TRA165']

In [9]:
# Look up one table's stored document by id and show it as a plain dict, to see
# which metadata fields are available for building the prompt context.
inspected_doc = retriever.vector_store.docstore.search("PCA23")
inspected_doc.model_dump()

{'id': 'PCA23',
 'metadata': {'id': 'PCA23',
  'description': 'This table provides Prodcom Sales data for 2023, including sales figures in Euro Thousand and Volume for various product codes.',
  'sample_questions': ['What were the Prodcom Sales in 2023?',
   'What is the value of Prodcom Sales 2023 (Volume) for product code 20147320?',
   'Show me the Euro Thousand values for Prodcom Sales 2023.',
   'Which products had the highest Prodcom Sales in 2023?',
   'What is the unit of measurement for Prodcom Sales 2023 (Volume)?',
   'Can you provide data on Prodcom Sales for the year 2023?',
   'How is the data on Prodcom Sales 2023 categorized?',
   'Where can I find the methodology for Prodcom Sales data?',
   'What is the meaning of the product code PCA23C01?',
   'What is the source of this Prodcom Sales data?'],
  'subject': 'Industry',
  'product': 'Prodcom Statistics',
  'table_name': 'Prodcom Sales 2023',
  'columns': ['Year', 'Product']},
 'page_content': 'This table provides Prod

### Table-Selection PoC

In [10]:
from langchain_google_genai import ChatGoogleGenerativeAI


# Settings shared by all three chat models; only the model name differs.
shared_llm_kwargs = {
    "temperature": 1,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
}

# Three Gemini models, exposed under the low / med / high names used below.
llm_low = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", **shared_llm_kwargs)
llm_med = ChatGoogleGenerativeAI(model="gemini-2.5-flash", **shared_llm_kwargs)
llm_high = ChatGoogleGenerativeAI(model="gemini-2.5-pro", **shared_llm_kwargs)

In [None]:
def create_context(table_ids: list, docstore=None) -> str:
    """Render the stored metadata of the given tables as one markdown-style text block.

    Each table becomes a section listing its id, name with subject/product,
    summary, fields and sample questions; sections are separated by a blank line.

    Args:
        table_ids: Ids of the tables to describe, in the order they should appear.
        docstore: Object whose ``search(table_id)`` returns a document exposing
            ``id`` and a ``metadata`` mapping with the keys ``table_name``,
            ``subject``, ``product``, ``description``, ``columns`` and
            ``sample_questions``. Defaults to the notebook-level
            ``retriever.vector_store.docstore``.

    Returns:
        The concatenated sections, or an empty string when ``table_ids`` is empty.
    """
    if docstore is None:
        # Fall back to the notebook-level retriever so existing calls keep working.
        docstore = retriever.vector_store.docstore

    sections = []
    for table_id in table_ids:
        doc = docstore.search(table_id)
        meta = doc.metadata
        # Pull the values out first: reusing double quotes inside an f-string
        # replacement field is a SyntaxError on Python versions before 3.12.
        table_name = meta["table_name"]
        subject = meta["subject"]
        product = meta["product"]
        description = meta["description"]
        fields = ", ".join(meta["columns"])
        # The first bullet marker comes from the line template below; the
        # separator supplies the marker for every following question.
        sample_questions_str = "\n  - ".join(meta["sample_questions"])
        section_lines = [
            f"**Table ID**: {doc.id}",
            f"**Table Name (and Category)**: {table_name} ({subject}: {product})",
            f"**Table Summary**: {description}",
            f"**Fields**: {fields}",
            "**Sample Questions**:",
            f"  - {sample_questions_str}",
        ]
        sections.append("\n".join(section_lines))
    return "\n\n".join(sections)

In [32]:
from pydantic import BaseModel, Field


# One selected table together with a short justification; used as the item type
# of `TableSelection.relevant_tables`.
# NOTE(review): documented with `#` comments rather than a docstring on purpose.
# pydantic copies a class docstring into the model's JSON schema description,
# which would presumably change what structured output sends to the LLM — confirm
# before adding one.
class TableSelectionSubclass(BaseModel):
    # Identifier of the selected table.
    table_id: str = Field(description="Table ID.")
    # Short reason the table was judged relevant.
    explanation: str = Field(description="Concise 1-liner explanation behind why this table is relevant.")

# Top-level structured-output schema: the list of tables picked for a question.
# NOTE(review): kept docstring-free on purpose — pydantic copies a class docstring
# into the JSON schema description, which would presumably alter the schema sent
# to the LLM; confirm before adding one.
class TableSelection(BaseModel):
    # Selected tables, each with its explanation.
    relevant_tables: list[TableSelectionSubclass] = Field(description="List of relevant tables with explanations.")


In [45]:
from textwrap import dedent

# NOTE(review): `top_20_table_ids` was retrieved for the query in the earlier
# cell, not for this one. Swapping the question here presumably checks that the
# model returns no tables for an out-of-scope request — confirm this is intended.
question = "beauty and makeup production data for india"

context = create_context(top_20_table_ids)

# Dedent the template *before* substituting values. The rendered context has
# lines starting at column 0, so dedenting after interpolation finds no common
# indentation and leaves the instruction lines indented by four spaces.
prompt_template = dedent(
    """\
    Given the following tables context, select up to 3 of the possible relevant tables based on the question asked.

    Table context:
    {context}

    question: {question}
    """
)
prompt = prompt_template.format(context=context, question=question)


# Ask the mid-tier model for a TableSelection-shaped answer.
res = llm_med.with_structured_output(TableSelection).invoke(prompt)

In [46]:
# Show the structured selection as a plain dict.
res.model_dump()

{'relevant_tables': []}