In [None]:
# Import core packages
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

## 2. Create a database

In [None]:
# db_creator.py

import json
import sqlalchemy as sa
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Create the base class for our models
Base = declarative_base()

# Define our State model
class State(Base):
    __tablename__ = 'states'
    
    id = sa.Column(sa.Integer, primary_key=True)
    object_id = sa.Column(sa.String(50), unique=True)
    name = sa.Column(sa.String(50), unique=True, nullable=False)
    flag_url = sa.Column(sa.String(255))
    link = sa.Column(sa.String(100))
    postal_abbreviation = sa.Column(sa.String(2), unique=True)
    capital = sa.Column(sa.String(50))
    largest_city = sa.Column(sa.String(50))
    established = sa.Column(sa.String(50))
    population = sa.Column(sa.Integer)
    total_area_square_miles = sa.Column(sa.Integer)
    total_area_square_kilometers = sa.Column(sa.Integer)
    land_area_square_miles = sa.Column(sa.Integer)
    land_area_square_kilometers = sa.Column(sa.Integer)
    water_area_square_miles = sa.Column(sa.Integer)
    water_area_square_kilometers = sa.Column(sa.Integer)
    number_representatives = sa.Column(sa.Integer)
    created_at = sa.Column(sa.String(100))
    updated_at = sa.Column(sa.String(100))
    capitals_object_id = sa.Column(sa.String(50))

def main():
    # Connect to the database
    engine = sa.create_engine('sqlite:///states.db', echo=True)
    
    # Create tables
    Base.metadata.create_all(engine)
    
    # Create a session
    Session = sessionmaker(bind=engine)
    session = Session()
    
    # Load the JSON data
    with open('states.json', 'r') as f:
        data = json.load(f)
        states_data = data.get('results', [])
    
    # Insert data
    for state_data in states_data:
        # Create a new State object
        state = State(
            object_id=state_data.get('objectId'),
            name=state_data.get('name'),
            flag_url=state_data.get('flag'),
            link=state_data.get('link'),
            postal_abbreviation=state_data.get('postalAbreviation'),
            capital=state_data.get('capital'),
            largest_city=state_data.get('largestCity'),
            established=state_data.get('established'),
            population=state_data.get('population'),
            total_area_square_miles=state_data.get('totalAreaSquareMiles'),
            total_area_square_kilometers=state_data.get('totalAreaSquareKilometers'),
            land_area_square_miles=state_data.get('landAreaSquareMiles'),
            land_area_square_kilometers=state_data.get('landAreaSquareKilometers'),
            water_area_square_miles=state_data.get('waterAreaSquareMiles'),
            water_area_square_kilometers=state_data.get('waterAreaSquareKilometers'),
            number_representatives=state_data.get('numberRepresentatives'),
            created_at=state_data.get('createdAt'),
            updated_at=state_data.get('updatedAt')
        )
        
        # Handle the capitals pointer
        if 'capitals' in state_data and isinstance(state_data['capitals'], dict):
            state.capitals_object_id = state_data['capitals'].get('objectId')
        
        # Add to session
        session.add(state)
    
    # Commit the session
    try:
        session.commit()
        print(f"Successfully imported {len(states_data)} states to the database.")
    except Exception as e:
        session.rollback()
        print(f"Error: Failed to commit data to database. {e}")
    finally:
        session.close()

if __name__ == "__main__":
    main()

In [7]:
# Test if db was created and populated correctly
import sqlalchemy as sa
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import inspect
from db_creator import State
def test_database():
    """Test if the database was created and populated correctly."""
    print("\n----- TESTING DATABASE -----")
    
    # Connect to the database
    engine = sa.create_engine('sqlite:///states.db')
    Session = sessionmaker(bind=engine)
    session = Session()
    
    try:
        # Check if the table exists
        inspector = inspect(session.bind)
        tables = inspector.get_table_names()
        print(f"Tables in database: {tables}")
        
        if 'states' not in tables:
            print("Error: 'states' table not found in database!")
            return
        
        # Count total records
        state_count = session.query(State).count()
        print(f"Total states in database: {state_count}")
        
        # Check a few specific states
        print("\nSample state data:")
        sample_states = ['Alabama', 'Alaska', 'Wyoming']
        for state_name in sample_states:
            state = session.query(State).filter_by(name=state_name).first()
            if state:
                print(f"- {state.name}: Capital: {state.capital}, Population: {state.population}")
            else:
                print(f"- {state_name}: Not found in database!")
        
        # Show schema details
        print("\nTable columns:")
        for column in inspector.get_columns('states'):
            print(f"- {column['name']}: {column['type']}")
            
        # Run some additional queries
        print("\nStates with population over 5 million:")
        populous_states = session.query(State).filter(State.population > 5000000).all()
        for state in populous_states:
            print(f"{state.name}: {state.population}")
            
        print("\nStates with largest city same as capital:")
        capital_largest = session.query(State).filter(State.capital == State.largest_city).all()
        for state in capital_largest:
            print(f"{state.name}: {state.capital}")
    
    finally:
        session.close()

In [8]:
test_database()


----- TESTING DATABASE -----
Tables in database: ['states']
Total states in database: 50

Sample state data:
- Alabama: Capital: Montgomery, Population: 4874747
- Alaska: Capital: Juneau, Population: 739795
- Wyoming: Capital: Cheyenne, Population: 579315

Table columns:
- id: INTEGER
- object_id: VARCHAR(50)
- name: VARCHAR(50)
- flag_url: VARCHAR(255)
- link: VARCHAR(100)
- postal_abbreviation: VARCHAR(2)
- capital: VARCHAR(50)
- largest_city: VARCHAR(50)
- established: VARCHAR(50)
- population: INTEGER
- total_area_square_miles: INTEGER
- total_area_square_kilometers: INTEGER
- land_area_square_miles: INTEGER
- land_area_square_kilometers: INTEGER
- water_area_square_miles: INTEGER
- water_area_square_kilometers: INTEGER
- number_representatives: INTEGER
- created_at: VARCHAR(100)
- updated_at: VARCHAR(100)
- capitals_object_id: VARCHAR(50)

States with population over 5 million:
Arizona: 7016270
California: 39536653
Colorado: 5607154
Florida: 20984400
Georgia: 10429379
Illinois: 1

## 3. Enable Searching of SQL engine in plain english

In [71]:
# Create sql engine 
import sqlalchemy as sa

from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core import SQLDatabase, Settings
from llama_index.llms.google_genai import GoogleGenAI

import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


# Initialize LLM
llm = GoogleGenAI(
    model="models/gemini-2.5-pro-exp-03-25", 
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)
Settings.llm = llm

# Connect to the database
engine = sa.create_engine('sqlite:///states.db', future=True)


# Create the SQL database wrapper
sql_database = SQLDatabase(engine, include_tables=["states"])

# Generate sample data description to help the LLM understand the data structure
# This creates better context for the NLSQLTableQueryEngine
table_schema_str = """
Database Schema:
- states table:
  - object_id: Object ID like 
  - name: State name (e.g., "California", "Texas")
  - flag_url: link to states flag png image
  - link: url link to states page
  - postal_abbreviation: Two-letter state code (e.g., "CA", "TX")
  - capital: Capital city name
  - largest_city: name of largest city
  - established: date when state was established
  - population: state's population
  - total_area_square_miles= state total area in square miles
  - total_area_square_kilometers: total area in square kilometers
  - land_area_square_miles: land area in square miles
  - land_area_square_kilometers: land area in square kilometers
  - water_area_square_miles: water area in square miles
  - water_area_square_kilometers: water area in square kilometers
  - number_representatives: number of representatives
  - created_at: date of database creation
  - updated_at: date of database update
"""

# Create the natural language to SQL query engine with more context
sql_query_engine = NLSQLTableQueryEngine(
    sql_database=sql_database,
    tables=["states"],
    sample_rows_in_table_info=2,  # Include sample rows to provide better context
    llm=llm,
    embed_model=llm,
    synthesize_response=True,
    table_schema_str=table_schema_str,
    verbose=True  # This enables debug output to see the generated SQL
)


### Test query engine

In [None]:
# Test sql engine  
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker 
from sqlalchemy import inspect

from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core import SQLDatabase, Settings
from llama_index.llms.google_genai import GoogleGenAI 

import os
from dotenv import load_dotenv 

# Load environment variables
load_dotenv()

print("Step 1: Checking environment variables...")
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    print("ERROR: GOOGLE_API_KEY not found in environment variables")
else:
    print("✓ GOOGLE_API_KEY found")

print("\nStep 2: Initializing LLM...")
try:
    llm = GoogleGenAI(    
        model="models/gemini-2.5-exp-pro-03-25",     
        google_api_key=google_api_key,    
        temperature=0.3
    )
    Settings.llm = llm
    print("✓ LLM initialized successfully")
except Exception as e:
    print(f"ERROR initializing LLM: {e}")

print("\nStep 3: Connecting to database...")
try:
    engine = sa.create_engine('sqlite:///states.db', future=True)
    # Test the connection
    connection = engine.connect()
    connection.close()
    print("✓ Database connection successful")
    
    # Check if the 'states' table exists
    inspector = inspect(engine)
    if 'states' in inspector.get_table_names():
        print("✓ 'states' table found in database")
    else:
        print("ERROR: 'states' table not found in database")
        print(f"Available tables: {inspector.get_table_names()}")
except Exception as e:
    print(f"ERROR connecting to database: {e}")

print("\nStep 4: Creating SQL database wrapper...")
try:
    sql_database = SQLDatabase(engine, include_tables=["states"])
    print("✓ SQL database wrapper created")
    print(f"Tables included: {sql_database.get_usable_table_names()}")
except Exception as e:
    print(f"ERROR creating SQL database wrapper: {e}")

print("\nStep 5: Creating query engine...")
try:
    sql_query_engine = NLSQLTableQueryEngine(    
        sql_database=sql_database,    
        llm=llm,
        embed_model=llm,
        tables=["states"]
    )
    print("✓ Query engine created successfully")
except Exception as e:
    print(f"ERROR creating query engine: {e}")
    import traceback
    traceback.print_exc()

print("\nStep 6: Testing a simple query...")
if 'sql_query_engine' in locals():
    try:
        # Try with a very simple query first
        simple_query = "SELECT * FROM states LIMIT 5"
        print(f"Executing direct SQL query: {simple_query}")
        direct_results = sql_database.run_sql(simple_query)
        print(f"Direct SQL results: {direct_results[:100]}...")  # Print first 100 chars
        
        # Now try the natural language query
        nl_query = "What are the five most populous states?"
        print(f"Executing natural language query: {nl_query}")
        response = sql_query_engine.query(nl_query)
        print(f"Response type: {type(response)}")
        print(f"Response content: {response}")
    except Exception as e:
        print(f"ERROR executing query: {e}")
        import traceback
        traceback.print_exc()

Step 1: Checking environment variables...
✓ GOOGLE_API_KEY found

Step 2: Initializing LLM...
ERROR initializing LLM: [Errno 11001] getaddrinfo failed

Step 3: Connecting to database...
✓ Database connection successful
✓ 'states' table found in database

Step 4: Creating SQL database wrapper...
✓ SQL database wrapper created
Tables included: ['states']

Step 5: Creating query engine...
✓ Query engine created successfully

Step 6: Testing a simple query...
Executing direct SQL query: SELECT * FROM states LIMIT 5
Direct SQL results: ("[(1, 'B9IfALchYP', 'Alabama', 'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5c/Flag_of_Alabama.svg/23px-Flag_of_Alabama.svg.png', '/wiki/Alabama', 'AL', 'Montgomery', 'Birmingham', 'Dec 14, 1819', 4874747, 52420, 135767, 50645, 131171, 1775, 4597, 7, '2019-12-06T19:36:50.117Z', '2019-12-09T23:20:36.439Z', 't0wuThlqBx'), (2, 'zq8GVoD1n0', 'Alaska', 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Flag_of_Alaska.svg/21px-Flag_of_Alaska.svg

## Setting up LlamaCloud

In [70]:
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex

# Initialize the LlamaCloud index
index = LlamaCloudIndex(
    name="US-States-Wiki",
    project_name="sql_rag",
    organization_id=os.getenv("LLAMA_CLOUD_ORG_ID"),
    api_key=os.getenv("LLAMA_CLOUD_API_KEY")
)
# Create a query engine for the `US-States-Wiki` index
llama_cloud_query_engine = index.as_query_engine()

## Create query tool around the engines

In [72]:
# Step 3: Create query tools around engines
from llama_index.core.tools import QueryEngineTool

# Create a tool for SQL queries
sql_tool = QueryEngineTool.from_defaults(
    query_engine=sql_query_engine,
    description=(
        "Useful for answering factual questions about US states like population, "
        "capital, land size, postal abbreviation, and other demographic statistics "
        "stored in a structured database."
    ),
    name="sql_tool"
)
# Create a tool for document-based queries
llama_cloud_tool = QueryEngineTool.from_defaults(
    query_engine=llama_cloud_query_engine,
    description=(
        f"Useful for answering questions about US states' history, attractions, culture, "
        f"geography, and other information that requires searching through documents. "
        f"This tool contains Wikipedia information about all US states."
    ),
    name="llama_cloud_tool"
)


## Creating a ReAct Agentic workflow

In [None]:
# define tools
tools = [sql_tool, llama_cloud_tool]
# Create the agent with tools
from llama_index.core.agent import ReActAgent
agent = ReActAgent.from_tools(
    tools,
    llm=llm,
    verbose=True,
    system_prompt=(
        "You are an expert US States information system. "
        "You have access to two sources of information:\n\n"
        "1. A SQLite database with factual demographic data about US states in the 'states' table "
        "containing fields: object_id, name, flag_url, link, postal_abbreviation, capital, largest_city, "
        "established, population, total_area_square_miles, total_area_square_kilometers, land_area_square_miles, "
        "land_area_square_kilometers, water_area_square_miles, "
        "water_area_square_kilometers, number_representatives, created_at, updated_at\n\n"
        "2. Document retrieval for detailed information about history, attractions, and more\n\n"
        "Choose the appropriate tool based on the user's question. "
        "For the SQL tool, formulate clear SQL queries that match the database schema. "
        "Use the SQL tool for factual queries about population, area, capitals, etc. "
        "Use the document tool for questions about history, attractions, culture, and detailed information. "
        "If needed, you can use both tools and combine the information."
    )
)

# Step 5: Function to handle user queries
def answer_state_question(query):
    """
    Answer questions about US states using the appropriate tools.
    Args:
        query (str): User's natural language query
    Returns:
        str: Response to the user's query
    """
    try:
        # Use the agent to answer the query
        response = agent.query(query)
        #return str(response) 
        return response
    except Exception as e:
        return f"Sorry, I couldn't process your query. Error: {str(e)}"

### Testing the ReActAgent

In [78]:
# Population question (should use SQL)
query = ("What is the population of California?")
print(f"\nQuestion: {query}")
print(f"Answer: {answer_state_question(query)}")



Question: What is the population of California?
> Running step 1a22de77-350d-409f-9ff4-612c53bbd6ee. Step input: What is the population of California?
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question about the population of California. The `sql_tool` is suitable for retrieving specific demographic statistics like population for a US state.
Action: sql_tool
Action Input: {'input': 'What is the population of California?'}
[0m> Table Info: Table 'states' has columns: id (INTEGER), object_id (VARCHAR(50)), name (VARCHAR(50)), flag_url (VARCHAR(255)), link (VARCHAR(100)), postal_abbreviation (VARCHAR(2)), capital (VARCHAR(50)), largest_city (VARCHAR(50)), established (VARCHAR(50)), population (INTEGER), total_area_square_miles (INTEGER), total_area_square_kilometers (INTEGER), land_area_square_miles (INTEGER), land_area_square_kilometers (INTEGER), water_area_square_miles (INTEGER), water_area_square_kilometers (INTE

In [76]:
# Semantic question (should use LlamaCloud tool)
query = ("What are popular tourist attractions in Hawaii?")
print(f"\nQuestion: {query}")
print(f"Answer: {answer_state_question(query)}")


Question: What are popular tourist attractions in Hawaii?
> Running step fe448001-8e20-4566-926f-2560655bc455. Step input: What are popular tourist attractions in Hawaii?
[1;3;38;5;200mThought: The current language of the user is: English. I need to find information about popular tourist attractions in Hawaii. The `llama_cloud_tool` is suitable for this as it can search through documents containing information about state attractions.
Action: llama_cloud_tool
Action Input: {'input': 'What are popular tourist attractions in Hawaii?'}
[0m[1;3;34mObservation: Based on the provided information, Hawaii offers various attractions and events for visitors:

*   **National Parks and Monuments:** These include Haleakalā National Park on Maui, featuring a dormant volcano, and Hawaii Volcanoes National Park on Hawaiʻi Island, which includes the active Kīlauea volcano. Other sites under the National Park Service are Kalaupapa National Historical Park on Molokaʻi, Kaloko-Honokōhau National Histo

In [80]:
# Contextual question (should use both SQL and LlamaCloud tools)
query = ("Tell me about the availability of water in Arizona. What is the water area in Arizona")
print(f"\nQuestion: {query}")
print(f"Answer: {answer_state_question(query)}")


Question: Tell me about the availability of water in Arizona. What is the water area in Arizona
> Running step 8ecb4540-9076-4d8b-8dc9-9de98c45f813. Step input: Tell me about the availability of water in Arizona. What is the water area in Arizona
The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.
[1;3;38;5;200mThought: The user is asking two questions about Arizona: 1) general information about water availability, and 2) the specific water area statistic. The first question requires descriptive information likely found in documents, so I'll use the `llama_cloud_tool`. The second question asks for a specific statistic (water area), which is likely in the structured database, so I'll use the `sql_tool` for that. I'll start with the `llama_cloud_tool` for the general information. The user's language is English.
Action: llama_cloud_tool
Action Input: {'input': 'Describe the availability of water in

## Putting it all together

In [64]:
# Step 1: Set up the SQL Query Engine
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
import os
import asyncio

from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core import SQLDatabase, Settings
from llama_index.llms.google_genai import GoogleGenAI

# Load environment variables
load_dotenv()

# Initialize LLM
llm = GoogleGenAI(
    model="models/gemini-2.5-pro-exp-03-25",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)
Settings.llm = llm

# Connect to the database
engine = sa.create_engine('sqlite:///states.db', future=True)
Session = sessionmaker(bind=engine)
session = Session()

# Create the SQL database wrapper
sql_database = SQLDatabase(engine, include_tables=["states"])

# Create the natural language to SQL query engine
sql_query_engine = NLSQLTableQueryEngine(
    sql_database=sql_database,
    embed_model=llm,
    llm=llm,
    tables=["states"]
)

# Step 2: Initialize the LlamaCloud index for document retrieval
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex

# Initialize the LlamaCloud index
index = LlamaCloudIndex(
    name="US-States-Wiki",
    project_name="sql_rag",
    organization_id=os.getenv("LLAMA_CLOUD_ORG_ID"),
    api_key=os.getenv("LLAMA_CLOUD_API_KEY")
)

# Create a query engine for the `US-States-Wiki` index
llama_cloud_query_engine = index.as_query_engine()

# Step 3: Create query tools around engines
from llama_index.core.tools import QueryEngineTool

# Create a tool for SQL queries
sql_tool = QueryEngineTool.from_defaults(
    query_engine=sql_query_engine,
    description=(
        "Useful for answering factual questions about US states like population, "
        "capital, land size, postal abbreviation, and other demographic statistics "
        "stored in a structured database."
    ),
    name="sql_tool"
)
# Create a tool for document-based queries
llama_cloud_tool = QueryEngineTool.from_defaults(
    query_engine=llama_cloud_query_engine,
    description=(
        f"Useful for answering questions about US states' history, attractions, culture, "
        f"geography, and other information that requires searching through documents. "
        f"This tool contains Wikipedia information about all US states."
    ),
    name="llama_cloud_tool"
)

# Step 4: Build the Agent with the tools
from llama_index.core.agent import ReActAgent

# Create a list of tools
tools = [sql_tool, llama_cloud_tool]

# Create the agent with tools
agent = ReActAgent.from_tools(
    tools,
    llm=llm,
    verbose=True,
    system_prompt=(
        "You are an expert US States information system. "
        "You have access to two sources of information:\n\n"
        "1. A SQLite database with factual demographic data about US states in the 'states' table "
        "containing fields: object_id, name, flag_url, link, postal_abbreviation, capital, largest_city, "
        "established, population, total_area_square_miles, total_area_square_kilometers, land_area_square_miles, "
        "land_area_square_kilometers, water_area_square_miles, "
        "water_area_square_kilometers, number_representatives, created_at, updated_at\n\n"
        "2. Document retrieval for detailed information about history, attractions, and more\n\n"
        "Choose the appropriate tool based on the user's question. "
        "For the SQL tool, formulate clear SQL queries that match the database schema. "
        "Use the SQL tool for factual queries about population, area, capitals, etc. "
        "Use the document tool for questions about history, attractions, culture, and detailed information. "
        "If needed, you can use both tools and combine the information."
    )
)

# Step 5: Function to handle user queries
def answer_state_question(query):
    """
    Answer questions about US states using the appropriate tools.
    
    Args:
        query (str): User's natural language query
        
    Returns:
        str: Response to the user's query
    """
    try:
        # Use the agent to answer the query
        response = agent.query(query)
        #return str(response) 
        return response
    except Exception as e:
        return f"Sorry, I couldn't process your query. Error: {str(e)}"

# Example usage
if __name__ == "__main__":
    # Test queries
    queries = [
        "What is the population of California?",
        "What are popular tourist attractions in Hawaii?",
        "Which state has the largest land area?",
        "Tell me about the history of New York."
    ]
    
    for query in queries:
        print(f"\nQuestion: {query}")
        print(f"Answer: {answer_state_question(query)}")
        print("-" * 80)


Question: What is the population of California?
> Running step 8a6e3608-0ae4-4317-9822-ba2a4a261c56. Step input: What is the population of California?
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question. The user is asking for the population of California. This is a factual question about a US state's demographic statistics, which the `sql_tool` can answer.
Action: sql_tool
Action Input: {'input': 'What is the population of California?'}
[0m[1;3;34mObservation: The population of California is 39,536,653.
[0m> Running step a98a81be-40c0-4c05-97a9-c8b568801e11. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer
Answer: The population of California is 39,536,653.
[0mAnswer: The population of California is 39,536,653.
--------------------------------------------------------------------------------

Question: What are popular tourist attractions 

In [66]:
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
from sqlalchemy import inspect
from dotenv import load_dotenv
import os
from IPython.display import display, Markdown
import asyncio

from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core import SQLDatabase, Settings
from llama_index.llms.google_genai import GoogleGenAI

# Load environment variables
load_dotenv()

# Initialize LLM
llm = GoogleGenAI(
    model="models/gemini-2.5-pro-exp-03-25",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)
Settings.llm = llm

# Connect to the database
engine = sa.create_engine('sqlite:///states.db', future=True)
Session = sessionmaker(bind=engine)
session = Session()

# Get detailed table schema information
inspector = inspect(engine)
table_columns = {}
for table_name in inspector.get_table_names():
    columns = inspector.get_columns(table_name)
    table_columns[table_name] = [column['name'] for column in columns]

# Display available tables and columns for debugging
print("Available tables and columns:")
for table, columns in table_columns.items():
    print(f"Table: {table}")
    print(f"Columns: {', '.join(columns)}")
    print("-" * 80)

# Create the SQL database wrapper with explicit table info
sql_database = SQLDatabase(engine, include_tables=["states"])

# Generate sample data description to help the LLM understand the data structure
# This creates better context for the NLSQLTableQueryEngine
table_schema_str = """
Database Schema:
- states table:
  - object_id: Object ID like 
  - name: State name (e.g., "California", "Texas")
  - flag_url: link to states flag png image
  - link: url link to states page
  - postal_abbreviation: Two-letter state code (e.g., "CA", "TX")
  - capital: Capital city name
  - largest_city: name of largest city
  - established: date when state was established
  - population: state's population
  - total_area_square_miles= state total area in square miles
  - total_area_square_kilometers: total area in square kilometers
  - land_area_square_miles: land area in square miles
  - land_area_square_kilometers: land area in square kilometers
  - water_area_square_miles: water area in square miles
  - water_area_square_kilometers: water area in square kilometers
  - number_representatives: number of representatives
  - created_at: date of database creation
  - updated_at: date of database update
"""

# Create the natural language to SQL query engine with more context
sql_query_engine = NLSQLTableQueryEngine(
    sql_database=sql_database,
    tables=["states"],
    sample_rows_in_table_info=2,  # Include sample rows to provide better context
    llm=llm,
    embed_model=llm,
    synthesize_response=True,
    table_schema_str=table_schema_str,
    verbose=True  # This enables debug output to see the generated SQL
)

# Step 2: Initialize the LlamaCloud index for document retrieval
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex

# Initialize the LlamaCloud index
index = LlamaCloudIndex(
    name="US-States-Wiki",
    project_name="sql_rag",
    organization_id=os.getenv("LLAMA_CLOUD_ORG_ID"),
    api_key=os.getenv("LLAMA_CLOUD_API_KEY")
)

# Create a query engine for the `US-States-Wiki` index
llama_cloud_query_engine = index.as_query_engine()

# Step 3: Create query tools around engines
from llama_index.core.tools import QueryEngineTool

# Create a tool for SQL queries with more specific description
sql_tool = QueryEngineTool.from_defaults(
    query_engine=sql_query_engine,
    description=(
        "Useful for answering factual questions about US states like population, "
        "capital, land size, postal abbreviation, and other demographic statistics "
        "stored in a structured database. The states table contains fields: "
        "name, abbreviation, capital, population, area_sq_miles, region, "
        "admitted_to_union, and median_household_income."
    ),
    name="sql_tool"
)

# Create a tool for document-based queries
llama_cloud_tool = QueryEngineTool.from_defaults(
    query_engine=llama_cloud_query_engine,
    description=(
        f"Useful for answering questions about US states' history, attractions, culture, "
        f"geography, and other information that requires searching through documents. "
        f"This tool contains Wikipedia information about all US states."
    ),
    name="llama_cloud_tool"
)

# Step 4: Build the Agent with the tools
from llama_index.core.agent import ReActAgent

# Create a list of tools
tools = [sql_tool, llama_cloud_tool]

# Create the agent with tools and improved system prompt
agent = ReActAgent.from_tools(
    tools,
    llm=llm,
    verbose=True,
    system_prompt=(
        "You are an expert US States information system. "
        "You have access to two sources of information:\n\n"
        "1. A SQLite database with factual demographic data about US states in the 'states' table "
        "containing fields: object_id, name, flag_url, link, postal_abbreviation, capital, largest_city, "
        "established, population, total_area_square_miles, total_area_square_kilometers, land_area_square_miles, "
        "land_area_square_kilometers, water_area_square_miles, "
        "water_area_square_kilometers, number_representatives, created_at, updated_at\n\n"
        "2. Document retrieval for detailed information about history, attractions, and more\n\n"
        "Choose the appropriate tool based on the user's question. "
        "For the SQL tool, formulate clear SQL queries that match the database schema. "
        "Use the SQL tool for factual queries about population, area, capitals, etc. "
        "Use the document tool for questions about history, attractions, culture, and detailed information. "
        "If needed, you can use both tools and combine the information."
    )
)

# Step 5: Function to handle user queries with better error handling
def answer_state_question(query):
    """
    Answer questions about US states using the appropriate tools.
    Args:
        query (str): User's natural language query 
    Returns:
        str: Response to the user's query
    """
    try:
        # Use the agent to answer the query
        response = agent.query(query)
        return response
    except Exception as e:
        print(f"Error details: {str(e)}")
        # Try a more direct approach if the agent fails
        try:
            # For simple factual queries, use the SQL tool directly
            if any(word in query.lower() for word in ['population', 'capital', 'area', 'largest', 'smallest']):
                print("Falling back to direct SQL query...")
                return sql_query_engine.query(query)
            else:
                # For other queries, use the document tool directly
                print("Falling back to direct document query...")
                return llama_cloud_query_engine.query(query)
        except Exception as fallback_error:
            return f"Sorry, I couldn't process your query. Error: {str(e)}\nFallback error: {str(fallback_error)}"

# Example usage
if __name__ == "__main__":
    # Test queries focusing on SQL capabilities
    queries = [
        "What is the population of California?",
        "Which state has the largest land area?",
        "List the top 5 most populous states",
        "What is the capital of Texas?",
        "Which states have a population greater than 10 million?",
    ]
    
    for query in queries:
        print(f"\nQuestion: {query}")
        print(f"Answer: {answer_state_question(query)}")
        print("-" * 80)

Available tables and columns:
Table: states
Columns: id, object_id, name, flag_url, link, postal_abbreviation, capital, largest_city, established, population, total_area_square_miles, total_area_square_kilometers, land_area_square_miles, land_area_square_kilometers, water_area_square_miles, water_area_square_kilometers, number_representatives, created_at, updated_at, capitals_object_id
--------------------------------------------------------------------------------

Question: What is the population of California?
> Running step 3862da18-729e-41d5-a1a6-608feda321df. Step input: What is the population of California?
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question about the population of California. The `sql_tool` is suitable for retrieving specific demographic statistics like population for a US state.
Action: sql_tool
Action Input: {'input': 'What is the population of California?'}
[0m> Table Info: Table 'states'

## Final version

In [68]:
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
from sqlalchemy import inspect
from dotenv import load_dotenv
import os

import asyncio

from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core import SQLDatabase, Settings
from llama_index.llms.google_genai import GoogleGenAI

# Load environment variables
load_dotenv()

# 1. Initialize LLM
llm = GoogleGenAI(
    model="models/gemini-2.5-pro-exp-03-25",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)
Settings.llm = llm

# 2. Connect to the database
engine = sa.create_engine('sqlite:///states.db', future=True)

# Create the SQL database wrapper with explicit table info
sql_database = SQLDatabase(engine, include_tables=["states"])

# Generate sample data description to help the LLM understand the data structure
# This creates better context for the NLSQLTableQueryEngine
table_schema_str = """
Database Schema:
- states table:
  - object_id: Object ID like 
  - name: State name (e.g., "California", "Texas")
  - flag_url: link to states flag png image
  - link: url link to states page
  - postal_abbreviation: Two-letter state code (e.g., "CA", "TX")
  - capital: Capital city name
  - largest_city: name of largest city
  - established: date when state was established
  - population: state's population
  - total_area_square_miles= state total area in square miles
  - total_area_square_kilometers: total area in square kilometers
  - land_area_square_miles: land area in square miles
  - land_area_square_kilometers: land area in square kilometers
  - water_area_square_miles: water area in square miles
  - water_area_square_kilometers: water area in square kilometers
  - number_representatives: number of representatives
  - created_at: date of database creation
  - updated_at: date of database update
"""

# Create the natural language to SQL query engine with more context
sql_query_engine = NLSQLTableQueryEngine(
    sql_database=sql_database,
    tables=["states"],
    sample_rows_in_table_info=1,  # Include sample rows to provide better context
    llm=llm,
    embed_model=llm,
    synthesize_response=True,
    markdown_response=True,
    table_schema_str=table_schema_str,
    verbose=True  # This enables debug output to see the generated SQL
)

# Step 2: Initialize the LlamaCloud index for document retrieval
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex

# Initialize the LlamaCloud index
index = LlamaCloudIndex(
    name="US-States-Wiki",
    project_name="sql_rag",
    organization_id=os.getenv("LLAMA_CLOUD_ORG_ID"),
    api_key=os.getenv("LLAMA_CLOUD_API_KEY")
)

# Create a query engine for the `US-States-Wiki` index
llama_cloud_query_engine = index.as_query_engine()

# Step 3: Create query tools around engines
from llama_index.core.tools import QueryEngineTool

# Create a tool for SQL queries with more specific description
sql_tool = QueryEngineTool.from_defaults(
    query_engine=sql_query_engine,
    description=(
        "Useful for answering factual questions about US states like population, "
        "capital, land size, postal abbreviation, and other demographic statistics "
        "stored in a structured database. The states table contains fields: "
        "name, abbreviation, capital, population, area_sq_miles, region, "
        "admitted_to_union, and median_household_income."
    ),
    name="sql_tool"
)

# Create a tool for document-based queries
llama_cloud_tool = QueryEngineTool.from_defaults(
    query_engine=llama_cloud_query_engine,
    description=(
        f"Useful for answering questions about US states' history, attractions, culture, "
        f"geography, and other information that requires searching through documents. "
        f"This tool contains Wikipedia information about all US states."
    ),
    name="llama_cloud_tool"
)

# Step 4: Build the Agent with the tools
from llama_index.core.agent import ReActAgent

# Create a list of tools
tools = [sql_tool, llama_cloud_tool]

# Create the agent with tools and improved system prompt
agent = ReActAgent.from_tools(
    tools,
    llm=llm,
    verbose=True,
    system_prompt=(
        "You are an expert US States information system. "
        "You have access to two sources of information:\n\n"
        "1. A SQLite database with factual demographic data about US states in the 'states' table "
        "containing fields: object_id, name, flag_url, link, postal_abbreviation, capital, largest_city, "
        "established, population, total_area_square_miles, total_area_square_kilometers, land_area_square_miles, "
        "land_area_square_kilometers, water_area_square_miles, "
        "water_area_square_kilometers, number_representatives, created_at, updated_at\n\n"
        "2. Document retrieval for detailed information about history, attractions, and more\n\n"
        "Choose the appropriate tool based on the user's question. "
        "For the SQL tool, formulate clear SQL queries that match the database schema. "
        "Use the SQL tool for factual queries about population, area, capitals, etc. "
        "Use the document tool for questions about history, attractions, culture, and detailed information. "
        "If needed, you can use both tools and combine the information."
    )
)

# Step 5: Function to handle user queries
def answer_state_question(query):
    """
    Answer questions about US states using the appropriate tools.
    Args:
        query (str): User's natural language query 
    Returns:
        str: Response to the user's query
    """
    try:
        # Use the agent to answer the query
        response = agent.query(query)
        return response
    except Exception as e:
        print(f"Error details: {str(e)}")
        # Try a more direct approach if the agent fails
        try:
            # For simple factual queries, use the SQL tool directly
            if any(word in query.lower() for word in ['population', 'capital', 'area', 'largest', 'smallest']):
                print("Falling back to direct SQL query...")
                return sql_query_engine.query(query)
            else:
                # For other queries, use the document tool directly
                print("Falling back to direct document query...")
                return llama_cloud_query_engine.query(query)
        except Exception as fallback_error:
            return f"Sorry, I couldn't process your query. Error: {str(e)}\nFallback error: {str(fallback_error)}"

# Example usage
if __name__ == "__main__":
    # Test queries focusing on SQL capabilities
    queries = [
        "What is the population of California?",
        "Which state has the largest land area?",
        "List the top 5 most populous states",
        "What is the capital of Texas?",
        "Which states have a population greater than 10 million?",
    ]
    
    for query in queries:
        print(f"\nQuestion: {query}")
        print(f"Answer: {answer_state_question(query)}")
        print("-" * 80)


Question: What is the population of California?
> Running step dfad9c12-e8a8-4dd8-b543-88cfe0b4b9a5. Step input: What is the population of California?
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question about the population of California. The `sql_tool` is suitable for retrieving specific demographic data like population from a structured database about US states.
Action: sql_tool
Action Input: {'input': 'What is the population of California?'}
[0m> Table Info: Table 'states' has columns: id (INTEGER), object_id (VARCHAR(50)), name (VARCHAR(50)), flag_url (VARCHAR(255)), link (VARCHAR(100)), postal_abbreviation (VARCHAR(2)), capital (VARCHAR(50)), largest_city (VARCHAR(50)), established (VARCHAR(50)), population (INTEGER), total_area_square_miles (INTEGER), total_area_square_kilometers (INTEGER), land_area_square_miles (INTEGER), land_area_square_kilometers (INTEGER), water_area_square_miles (INTEGER), water_area_s

In [32]:
import sqlalchemy as sa
from sqlalchemy import text
import pandas as pd

# Connect to the database
engine = sa.create_engine('sqlite:///states.db', future=True)

def execute_query(query_str, params=None):
    """Execute a SQL query and return the results"""
    try:
        with engine.connect() as conn:
            if params:
                result = conn.execute(text(query_str), params)
            else:
                result = conn.execute(text(query_str))
            
            # Convert results to DataFrame for nice display
            df = pd.DataFrame(result.fetchall())
            if not df.empty:
                df.columns = result.keys()
            return df
    except Exception as e:
        return f"Error: {str(e)}"

# First, let's see the table structure
def get_table_info():
    """Get the table structure and sample data"""
    schema_query = "PRAGMA table_info(states)"
    schema_df = execute_query(schema_query)
    print("Table Structure:")
    print(schema_df)
    print("\n")
    
    # Get sample data
    sample_query = "SELECT * FROM states LIMIT 5"
    sample_df = execute_query(sample_query)
    print("Sample Data:")
    print(sample_df)
    print("\n")
    
    # Count total rows
    count_query = "SELECT COUNT(*) as count FROM states"
    count_df = execute_query(count_query)
    print("Total States:")
    print(count_df)
    print("\n")

# Test different ways to query states by name
def test_name_queries():
    print("Testing different ways to query state names:")
    
    # Test 1: Double quotes
    print("\nTest 1: Double quotes")
    query1 = 'SELECT capital FROM states WHERE name = "Texas"'
    print(f"Query: {query1}")
    print(execute_query(query1))
    
    # Test 2: Single quotes
    print("\nTest 2: Single quotes")
    query2 = "SELECT capital FROM states WHERE name = 'Texas'"
    print(f"Query: {query2}")
    print(execute_query(query2))
    
    # Test 3: Parameter binding
    print("\nTest 3: Parameter binding")
    query3 = "SELECT capital FROM states WHERE name = :state_name"
    print(f"Query: {query3} with params: {{'state_name': 'Texas'}}")
    print(execute_query(query3, {"state_name": "Texas"}))
    
    # Test 4: LIKE operator
    print("\nTest 4: LIKE operator")
    query4 = "SELECT capital FROM states WHERE name LIKE '%Texas%'"
    print(f"Query: {query4}")
    print(execute_query(query4))
    
    # Test 5: Case insensitive comparison
    print("\nTest 5: Case insensitive comparison")
    query5 = "SELECT capital FROM states WHERE LOWER(name) = LOWER('Texas')"
    print(f"Query: {query5}")
    print(execute_query(query5))

# Test working queries
def test_working_queries():
    print("Testing working queries:")
    
    # Population query
    print("\nPopulation query:")
    query1 = "SELECT name, population FROM states WHERE population > 10000000 ORDER BY population DESC"
    print(f"Query: {query1}")
    print(execute_query(query1))
    
    # Top 5 populous states
    print("\nTop 5 populous states:")
    query2 = "SELECT name, population FROM states ORDER BY population DESC LIMIT 5"
    print(f"Query: {query2}")
    print(execute_query(query2))
    
    # Area query
    print("\nLargest states by area:")
    query3 = "SELECT name, area_sq_miles FROM states ORDER BY area_sq_miles DESC LIMIT 5"
    print(f"Query: {query3}")
    print(execute_query(query3))

# Check for exact state name formats
def check_state_names():
    print("Checking exact state name formats in the database:")
    query = "SELECT name FROM states"
    state_names = execute_query(query)
    print(state_names)
    print("\n")
    
    # Check for Texas specifically
    print("Looking for Texas with exact matching:")
    for name in state_names['name'].values:
        if 'texas' in name.lower():
            print(f"Found: '{name}'")

# Run the tests
if __name__ == "__main__":
    print("=== SQLite States Database Test ===\n")
    get_table_info()
    check_state_names()
    test_name_queries()
    test_working_queries()
    
    # Based on test results, recommend fixes
    print("\n=== RECOMMENDATIONS ===")
    print("If all tests with state names failed, use this query function in your agent:")
    print("""
def fixed_sql_query(query_str):
    \"\"\"Fix and execute SQL queries for states database\"\"\"
    # Replace double quotes with single quotes for string literals
    query_str = query_str.replace('\"', '\'')
    
    # When using state names or text fields, use parameter binding
    # Example:
    # Instead of: SELECT * FROM states WHERE name = 'Texas'
    # Use: SELECT * FROM states WHERE LOWER(name) = LOWER(:state_name)
    # And pass parameters: {"state_name": "Texas"}
    
    # For the agent integration, consider using text comparison functions
    # to handle case sensitivity and possible formatting issues
    return query_str
    """)

=== SQLite States Database Test ===

Table Structure:
    cid                          name          type  notnull dflt_value  pk
0     0                            id       INTEGER        1       None   1
1     1                     object_id   VARCHAR(50)        0       None   0
2     2                          name   VARCHAR(50)        1       None   0
3     3                      flag_url  VARCHAR(255)        0       None   0
4     4                          link  VARCHAR(100)        0       None   0
5     5           postal_abbreviation    VARCHAR(2)        0       None   0
6     6                       capital   VARCHAR(50)        0       None   0
7     7                  largest_city   VARCHAR(50)        0       None   0
8     8                   established   VARCHAR(50)        0       None   0
9     9                    population       INTEGER        0       None   0
10   10       total_area_square_miles       INTEGER        0       None   0
11   11  total_area_square_kilomet

## Test some queries

In [50]:
with engine.connect() as conn:
    result = conn.execute(text("SELECT population FROM states WHERE name = 'Texas';"))
    print(result.fetchall())  # Expected output: [('Austin',)]

[(28304596,)]


In [53]:
with engine.connect() as conn:
    result = conn.execute(text("SELECT capital FROM states WHERE name = 'California';"))
    print(result.fetchall())  # Expected output: [('Austin',)]

[('Sacramento',)]


In [57]:
with engine.connect() as conn:
    result = conn.execute(text("SELECT capital FROM states WHERE name = 'Texas'"))
    print(result.fetchall())  # Expected output: [('Austin',)]

[('Austin',)]


In [62]:
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
from sqlalchemy import inspect
from dotenv import load_dotenv
import os
from IPython.display import display, Markdown
import asyncio
import re

from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core import SQLDatabase, Settings
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
from llama_index.core.tools import QueryEngineTool
from llama_index.core.agent import ReActAgent

# Load environment variables
load_dotenv()

# Initialize LLM
llm = GoogleGenAI(
    model="models/gemini-2.5-pro-exp-03-25",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)
Settings.llm = llm

# Connect to the database
engine = sa.create_engine('sqlite:///states.db', future=True)
Session = sessionmaker(bind=engine)
session = Session()

# Get detailed table schema information
inspector = inspect(engine)
table_columns = {}
for table_name in inspector.get_table_names():
    columns = inspector.get_columns(table_name)
    table_columns[table_name] = [column['name'] for column in columns]

# Display available tables and columns for debugging
print("Available tables and columns:")
for table, columns in table_columns.items():
    print(f"Table: {table}")
    print(f"Columns: {', '.join(columns)}")
    print("-" * 80)

# Create the SQL database wrapper with explicit table info
sql_database = SQLDatabase(engine, include_tables=["states"])

# Generate sample data description to help the LLM understand the data structure
table_schema_str = """
Database Schema:
- states table:
  - object_id: Object ID like 
  - name: State name (e.g., "California", "Texas")
  - flag_url: link to states flag png image
  - link: url link to states page
  - postal_abbreviation: Two-letter state code (e.g., "CA", "TX")
  - capital: Capital city name
  - largest_city: name of largest city
  - established: date when state was established
  - population: state's population
  - total_area_square_miles= state total area in square miles
  - total_area_square_kilometers: total area in square kilometers
  - land_area_square_miles: land area in square miles
  - land_area_square_kilometers: land area in square kilometers
  - water_area_square_miles: water area in square miles
  - water_area_square_kilometers: water area in square kilometers
  - number_representatives: number of representatives
  - created_at: date of database creation
  - updated_at: date of database update
"""

# Create the natural language to SQL query engine with more context
sql_query_engine = NLSQLTableQueryEngine(
    sql_database=sql_database,
    tables=["states"],
    sample_rows_in_table_info=2,  # Include sample rows to provide better context
    llm=llm,
    embed_model=llm,
    synthesize_response=True,
    table_schema_str=table_schema_str,
    verbose=True  # This enables debug output to see the generated SQL
)

# Initialize the LlamaCloud index
index = LlamaCloudIndex(
    name="US-States-Wiki",
    project_name="sql_rag",
    organization_id="4134a95e-28e9-4996-8f81-c7ffebd05f99",
    api_key="llx-2U4S9cG2ii7YQFRsKOTkOmTtxJlMTnv8kHrpYzkT3gWJUEfK"
)

# Create a query engine for the `US-States-Wiki` index
llama_cloud_query_engine = index.as_query_engine()

# Create query tools around engines
sql_tool = QueryEngineTool.from_defaults(
    query_engine=sql_query_engine,
    description=(
        "Useful for answering factual questions about US states like population, "
        "capital, land size, postal abbreviation, and other demographic statistics "
        "stored in a structured database. The states table contains fields: "
        "name, abbreviation, capital, population, area_sq_miles, region, "
        "admitted_to_union, and median_household_income."
    ),
    name="sql_tool"
)

states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", 
    "Connecticut", "Delaware", "Florida", "Georgia (U.S. state)", "Hawaii", "Idaho", 
    "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", 
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", 
    "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", 
    "New Hampshire", "New Jersey", "New Mexico", "New York", 
    "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", 
    "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", 
    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", 
    "West Virginia", "Wisconsin", "Wyoming"
]

llama_cloud_tool = QueryEngineTool.from_defaults(
    query_engine=llama_cloud_query_engine,
    description=(
        f"Useful for answering questions about US states' history, attractions, culture, "
        f"geography, and other information that requires searching through documents. "
        f"This tool contains Wikipedia information about all US states: {', '.join(states)}."
    ),
    name="llama_cloud_tool"
)

# Build the Agent with the tools
tools = [sql_tool, llama_cloud_tool]

agent = ReActAgent.from_tools(
    tools,
    llm=llm,
    verbose=True,
    system_prompt=(
        "You are an expert US States information system. "
        "You have access to two sources of information:\n\n"
        "1. A SQLite database with factual demographic data about US states in the 'states' table "
        "containing fields: object_id, name, flag_url, link, postal_abbreviation, capital, largest_city, "
        "established, population, total_area_square_miles, total_area_square_kilometers, land_area_square_miles, "
        "land_area_square_kilometers, water_area_square_miles, "
        "water_area_square_kilometers, number_representatives, created_at, updated_at\n\n"
        "2. Document retrieval for detailed information about history, attractions, and more\n\n"
        "Choose the appropriate tool based on the user's question. "
        "For the SQL tool, formulate clear SQL queries that match the database schema. "
        "Use the SQL tool for factual queries about population, area, capitals, etc. "
        "Use the document tool for questions about history, attractions, culture, and detailed information. "
        "If needed, you can use both tools and combine the information."
    )
)

# Function to clean SQL queries by removing unwanted prefixes like 'ite'
def clean_sql_query(query: str) -> str:
    """
    Fix malformed SQL queries by:
    - Removing unwanted prefixes (e.g., 'ite' or other incorrect text before SELECT)
    - Ensuring proper formatting before execution
    """
    query = query.strip()

    # Remove any leading non-SQL words (like 'ite' or unexpected characters)
    query = re.sub(r'^\s*\b\w+\b\s*', '', query, count=1)

    return query.strip()

# Function to handle user queries with better error handling
def answer_state_question(query):
    """
    Answer questions about US states using the appropriate tools.
    Args:
        query (str): User's natural language query 
    Returns:
        str: Response to the user's query
    """
    try:
        response = agent.query(query)
        
        # If the response contains an SQL error, attempt to clean the query
        if "error" in str(response).lower():
            print("🔄 Retrying after fixing SQL query...")

            # Extract the predicted SQL query (if available)
            extracted_query = response.metadata.get("sql_query", "")

            if extracted_query:
                fixed_query = clean_sql_query(extracted_query)
                return sql_query_engine.query(fixed_query)

        return response

    except Exception as e:
        print(f"⚠️ Error details: {str(e)}")
        
        try:
            # Direct SQL fallback if necessary
            if any(word in query.lower() for word in ['population', 'capital', 'area', 'largest']):
                print("🔄 Falling back to direct SQL query...")
                return sql_query_engine.query(clean_sql_query(query))
            else:
                print("🔄 Falling back to direct document query...")
                return llama_cloud_query_engine.query(query)
        except Exception as fallback_error:
            return f"❌ Sorry, I couldn't process your query. Error: {str(e)}\nFallback error: {str(fallback_error)}"

# Test queries focusing on SQL capabilities
if __name__ == "__main__":
    queries = [
        "What is the population of California?",
        "Which state has the largest land area?",
        "List the top 5 most populous states",
        "What is the capital of Texas?",
        "Which states have a population greater than 10 million?",
    ]
    
    for query in queries:
        print(f"\nQuestion: {query}")
        print(f"Answer: {answer_state_question(query)}")
        print("-" * 80)


Available tables and columns:
Table: states
Columns: id, object_id, name, flag_url, link, postal_abbreviation, capital, largest_city, established, population, total_area_square_miles, total_area_square_kilometers, land_area_square_miles, land_area_square_kilometers, water_area_square_miles, water_area_square_kilometers, number_representatives, created_at, updated_at, capitals_object_id
--------------------------------------------------------------------------------

Question: What is the population of California?
> Running step ffe81ced-7743-4469-a93e-c43dbc5aee6b. Step input: What is the population of California?
[1;3;38;5;200mThought: The user is asking for the population of California. This is a factual question about a US state statistic. The `sql_tool` is described as useful for answering factual questions about US states like population. I should use the `sql_tool` to get this information.
Action: sql_tool
Action Input: {'input': 'What is the population of California?'}
[0m> Ta