In [None]:
# Install required packages
!pip install --upgrade --quiet langchain langchain-core langchain-groq langchain-community langchain-openai

# Import necessary libraries
import os
import re
import json
import time
import math
import ast
from google.colab import userdata
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.utilities import SQLDatabase
from langchain_community.agent_toolkits import create_sql_agent
from langchain_openai import ChatOpenAI

Environment Setup and API Configuration

In [None]:
# Expose the Groq API key, stored as the Colab secret 'GROQ_API_KEY_4',
# through the GROQ_API_KEY environment variable.
os.environ['GROQ_API_KEY'] = userdata.get('GROQ_API_KEY_4')

# Load Table Metadata
# The file carries a .txt extension but is parsed as JSON. query_data() reads
# a 'description' and a 'columns' mapping for every table in it.
# The encoding is passed explicitly so parsing does not depend on the
# platform's default locale encoding.
with open("/content/table_metadata.txt", "r", encoding="utf-8") as metadata_file:
    table_metadata = json.load(metadata_file)

# Connect to SQLite Database
# NOTE(review): unlike the metadata file, the database path is relative, so it
# is resolved against the notebook's current working directory.
db = SQLDatabase.from_uri("sqlite:///SAVI_AI.db")

LLM Configuration and Prompt Templates

In [None]:
# Initialize the chat model shared by the functions in this notebook: it is
# used both to generate SQL and to explain query results.
# NOTE(review): temperature=0 is presumably chosen to keep the generated SQL
# as repeatable as possible - confirm.
llm = ChatGroq(
    temperature=0,
    model_name="llama-3.3-70b-versatile"
)

# Create SQL Query Prompt Template
# The system message lists the rules for converting natural language to SQL
# (column quoting, percentage values, year handling, column selection).
# The human message is filled from the single {input} variable; query_data()
# passes the schema description followed by the user's question there.
# NOTE(review): the rules never tell the model how to delimit the SQL in its
# reply, while extract_sql_query() first looks for a ```sql fenced block -
# consider adding an explicit output-format rule.
sql_prompt = ChatPromptTemplate.from_messages([
    ("system", """
    You are an AI assistant specializing in SQL queries for demographic and social data analysis.
    Follow these specific rules when converting natural language to SQL:

    1. Column Naming:
       - CRITICAL: Column names with hyphens MUST be wrapped in double quotes like "Poverty Rate - Marion County"
       - NEVER use underscores to replace hyphens in column names (do NOT use Poverty_Rate_Marion_County)
       - Example correct format: "Poverty Rate - Marion - Black", NOT Poverty_Rate_-_Marion_-_Black
       - When referencing counties, use only the base name in the COUNTY column (e.g., 'Adams' not 'Adams County')
       - For county-specific queries, use COUNTY = 'CountyName' AND STATE = 'StateName'

    2. Percentage Values:
       - Percentage columns store values as whole numbers (e.g., 25.5 means 25.5%)
       - When filtering with percentage thresholds, use the actual number (e.g., > 25, not > 0.25)

    3. Time-based Queries:
       - Always include YEAR column in WHERE clause when years are mentioned
       - CRITICAL: Always include YEAR in the SELECT clause when filtering by years
       - When showing data across years, YEAR should always be the first column in the results
       - For year ranges, use YEAR BETWEEN start_year AND end_year

    4. Column Selection and Result Formatting:
       - Include ONLY the columns specifically mentioned in the query or needed for the answer
       - For "highest" or "top" requests, use ORDER BY column DESC LIMIT n
       - For "lowest" requests, use ORDER BY column ASC LIMIT n
       - When asked for "last n years", order by YEAR DESC LIMIT n
       - When asked for differences between values, ensure proper column name quoting

    5. Database-Specific:
       - Always return exactly what is requested - don't add extra columns unless needed for context
    """),
    ("human", "{input}")
])

# Create Results Explanation Prompt Template
# The system message describes how results should be explained. The human
# message has three variables - {user_query}, {sql_query} and {query_result} -
# which format_result_with_llm() fills in.
explanation_prompt = ChatPromptTemplate.from_messages([
    ("system", """
    You are an AI assistant specializing in explaining data analysis results.
    Given a user's question, the SQL query that was run, and the results returned,
    create a clear, concise, and helpful natural language explanation of the results.

    Guidelines:
    - Format numerical values appropriately (add commas, $ for money, % for percentages)
    - Directly answer the user's original question
    - When results include multiple rows, summarize the key findings

    """),
    ("human", """
    Original question: {user_query}

    SQL query: {sql_query}

    Query results: {query_result}

    Please provide a natural language explanation of these results.
    """)
])

# Create SQL Agent
# Builds a LangChain SQL agent over the same llm and db objects.
# NOTE(review): none of the functions defined in this notebook reference
# agent_executor; they call llm and db directly. Confirm whether the agent
# is still needed.
agent_executor = create_sql_agent(llm, db=db, agent_type="openai-tools", verbose=True)

Query Processing Functions

In [None]:
def generate_sql_response(user_input):
    """Ask the LLM to translate a natural-language request into SQL.

    Args:
        user_input: Text substituted for ``{input}`` in ``sql_prompt``.

    Returns:
        The raw text content of the model's reply. It is not yet reduced to
        bare SQL; callers pass it through ``extract_sql_query``.
    """
    # format_messages() keeps the system rules and the human request as
    # separate chat messages. format() would flatten the whole template into
    # one string, which the chat model then receives as a single human
    # message, losing the system role.
    messages = sql_prompt.format_messages(input=user_input)
    response = llm.invoke(messages)
    return response.content

def extract_sql_query(response_text):
    """Extract the SQL statement from an LLM response.

    Three strategies are tried in order:

    1. The contents of the first ```sql fenced code block.
    2. The first ``SELECT ... FROM ...`` statement found in the text, running
       up to and including the first semicolon, or to the end of the text if
       there is no semicolon.
    3. The whole response, stripped of surrounding whitespace.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The extracted SQL as a string with surrounding whitespace removed.
    """
    # The fence tag is matched case-insensitively and any whitespace may
    # follow it, so "```SQL", CRLF line endings and single-line fences work.
    fenced = re.search(r'```sql\b\s*(.*?)```', response_text, re.DOTALL | re.IGNORECASE)
    if fenced:
        return fenced.group(1).strip()

    # The match must be anchored at a terminator (";" or end of text).
    # A pattern ending in a bare lazy ".*?" matches the empty string there
    # and cuts the statement off right after the FROM keyword.
    bare = re.search(
        r'\bSELECT\b.*?\bFROM\b.*?(?:;|\Z)',
        response_text,
        re.DOTALL | re.IGNORECASE,
    )
    if bare:
        return bare.group(0).strip()

    return response_text.strip()

def format_result_with_llm(user_query, sql_query, query_result):
    """Use the LLM to generate a natural language explanation of the SQL results.

    Args:
        user_query: The user's original question.
        sql_query: The SQL statement that was executed.
        query_result: The value returned by the database for that statement.

    Returns:
        The text content of the model's explanation.
    """
    # A list of 1-10 rows is shown one row per line for readability. Anything
    # else (empty list, longer list, non-list value) uses its plain str() form.
    if isinstance(query_result, list) and 0 < len(query_result) <= 10:
        formatted_result = "\n".join(str(row) for row in query_result)
    else:
        formatted_result = str(query_result)

    # format_messages() keeps the system and human parts of the template as
    # separate chat messages. format() would flatten them into one string
    # that the model receives as a single human message.
    messages = explanation_prompt.format_messages(
        user_query=user_query,
        sql_query=sql_query,
        query_result=formatted_result,
    )

    response = llm.invoke(messages)
    return response.content

def query_data(user_input):
    """Turn a natural-language question into SQL and run it on the database.

    The table metadata is rendered as a schema description and placed ahead
    of the user's question before the text is sent to the LLM.

    Args:
        user_input: The user's question.

    Returns:
        ``(sql_query, query_response)`` on success, or
        ``(None, error_message)`` if generating or executing the SQL raised.
    """
    # Describe every known table and its columns so the model can choose
    schema_parts = ["You can choose from the following tables and columns:\n"]
    for table_name, meta in table_metadata.items():
        schema_parts.append(f"Table: {table_name}\n")
        schema_parts.append(f"Description: {meta['description']}\n")
        schema_parts.append("Columns:\n")
        schema_parts.extend(
            f"  - {column_name}: {column_meta['description']}\n"
            for column_name, column_meta in meta['columns'].items()
        )
        schema_parts.append("\n")
    schema_instruction = "".join(schema_parts)

    full_input = f"{schema_instruction}\n{user_input}"

    try:
        # Natural language -> raw model reply -> bare SQL
        sql_query = extract_sql_query(generate_sql_response(full_input))
        # Run the extracted statement against the database
        return sql_query, db.run(sql_query)
    except Exception as e:
        return None, f"Error processing request: {str(e)}"

def process_query(user_query, show_sql=False):
    """Answer a question with a natural-language explanation of the query results.

    Args:
        user_query: The user's question.
        show_sql: When true, the executed SQL is prepended to the explanation.

    Returns:
        The explanation text (optionally preceded by the SQL), or an apology
        message containing the error text if the query could not be processed.
    """
    sql_query, response = query_data(user_query)

    # query_data reports failure as a None query with the error text as response
    if sql_query is None:
        return f"I'm sorry, but I encountered an error when processing your request: {response}"

    explanation = format_result_with_llm(user_query, sql_query, response)

    if not show_sql:
        return explanation
    return f"SQL Query: {sql_query}\n\n{explanation}"

Find the year with the highest poverty rate for Black individuals in Marion County

In [None]:
def run_interactive_query():
    """Prompt for one question, process it, and print the SQL and explanation."""
    user_query = input("Enter your question about the data: ")

    # Whitespace-only input is treated as an empty query
    if not user_query.strip():
        print("Empty query. Please try again with a valid question.")
        return

    print("\nProcessing your query...\n")
    print(process_query(user_query, show_sql=True))

# Run an interactive query: reads one question from standard input and prints
# the generated SQL followed by the explanation.
run_interactive_query()

Enter your question about the data: Find the year with the highest poverty rate for Black individuals in Marion County

Processing your query...

SQL Query: SELECT YEAR, "Poverty Rate - Marion - Black" 
FROM savi_basic_needs_data 
ORDER BY "Poverty Rate - Marion - Black" DESC 
LIMIT 1;

The year with the highest poverty rate for Black individuals in Marion County was 2015, with a poverty rate of 29.5%. This means that approximately 29.5% of the Black population in Marion County lived in poverty that year.


Which counties have the largest area in square miles and return the top 3 results?

In [None]:
def run_interactive_query():
    """Read one question from the user and print the SQL plus its explanation.

    NOTE(review): this repeats the definition from the previous example cell
    unchanged; re-defining it here is redundant.
    """
    user_query = input("Enter your question about the data: ")

    # Nothing but whitespace was entered: report it and stop
    if not user_query.strip():
        print("Empty query. Please try again with a valid question.")
        return

    print("\nProcessing your query...\n")
    answer = process_query(user_query, show_sql=True)
    print(answer)

# Run an interactive query: reads one question from standard input and prints
# the generated SQL followed by the explanation.
run_interactive_query()

Enter your question about the data: Which counties have the largest area in square miles and return the top 3 results?

Processing your query...

SQL Query: SELECT COUNTY, AREA_SQMI
FROM svi_indiana_in
ORDER BY AREA_SQMI DESC
LIMIT 3;

Based on the data, the top 3 counties in Indiana with the largest area in square miles are: 

1. Allen County, with an area of 657.30 square miles, 
2. LaPorte County, with an area of 598.29 square miles, and 
3. Jasper County, with an area of 559.68 square miles.

These three counties have the largest land areas in the state, with Allen County being the largest.
