In [None]:
from google.cloud import bigquery
from vertexai.generative_models import GenerativeModel, GenerationConfig
import warnings
import time

# Initialize tools
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Prerequisite: pip install google-cloud-bigquery-storage and db_dtypes
# Silence only the warning whose text matches the message below; every other
# warning is still shown.
warnings.filterwarnings(
    "ignore", 
    message="Your application has authenticated using end user credentials from Google Cloud SDK without a quota project."
)

In [None]:
def generate_response(prompt, model_name="gemini-2.0-flash", temperature=0.4, top_k=30, top_p=0.3):
    """Send ``prompt`` to a Vertex AI generative model and return the reply text.

    A fresh ``GenerativeModel`` is created on every call, configured with the
    given sampling parameters. Only the first candidate of the response is
    used.

    Args:
        prompt: Content handed to ``generate_content``.
        model_name: Name of the model to instantiate.
        temperature: Sampling temperature forwarded to ``GenerationConfig``.
        top_k: Top-k value forwarded to ``GenerationConfig``.
        top_p: Top-p value forwarded to ``GenerationConfig``.

    Returns:
        The ``content.text`` of the first candidate.
    """
    sampling = GenerationConfig(temperature=temperature, top_k=top_k, top_p=top_p)
    model = GenerativeModel(model_name=model_name, generation_config=sampling)
    reply = model.generate_content(prompt)
    first_candidate = reply.candidates[0]
    return first_candidate.content.text

In [None]:
# Setup: BigQuery client, target table and a one-off schema lookup.
# Everything defined here is a notebook-level global read by the functions below.
project_name = "snowplow-cto-office"
bigquery_client = bigquery.Client(project=project_name)
# Product whose session-chunk table is analysed; the commented line is the alternative.
product = "smartbill"
#product = "vnet_autopay"
# Dataset that owns the table (also used to address its INFORMATION_SCHEMA).
dataset_id = f"""snowplow-cto-office.snowplow_{product}"""
#full_responses_table_name = f"""snowplow-cto-office.snowplow_{product}.Vismanet_AutoPay_events_filtered_L60D"""
# Fully qualified table name that is interpolated into the LLM prompts.
full_responses_table_name = f"""snowplow-cto-office.snowplow_{product}.{product}_session_chunks"""
#Vismanet_AutoPay_events_filtered_L60D
# Schema query: name, data type and description for the session-chunk table.
query_headers = f"""SELECT
        column_name AS field_name,
        data_type AS field_type,
        description AS field_description,
    FROM
        `{dataset_id}.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE
        table_name = '{product}_session_chunks';
    """

# Runs the schema query when the cell executes and waits for its result.
query_job = bigquery_client.query(query_headers)
table = query_job.result()

# Materialise the result so it can be iterated once per lookup below.
rows = list(table)

# Lookups keyed by column name (row[0]); field_selection() reads all three.
# NOTE(review): COLUMN_FIELD_PATHS may return several rows per column_name for
# nested fields, in which case later rows overwrite earlier ones here - confirm
# against the actual table schema.
all_fields_type = {row[0]: row[1] for row in rows}
all_fields_desc = {row[0]: row[2] for row in rows}
# Maps each column name to itself; only used to print the names in the prompt.
all_fields_names = {row[0]: row[0] for row in rows}

In [None]:
def field_selection(bigquery_client, prompt, keywords, expected):
    """Ask the model which schema fields are needed to answer ``prompt``.

    The model receives the table schema held in the module-level
    ``all_fields_names`` / ``all_fields_type`` / ``all_fields_desc`` lookups and
    is asked to reply with one ``field_name, use case`` line per relevant
    field, or with ``False, <explanation>`` when required data is missing.

    Args:
        bigquery_client: Not used by this function; kept so existing callers
            keep working.
        prompt: The user's question.
        keywords: Keywords extracted from the question.
        expected: Description of the expected result.

    Returns:
        A ``(fields_type, fields_desc)`` tuple of dicts keyed by field name:
        ``fields_type[name] == {'data_type': ...}`` (taken from the schema) and
        ``fields_desc[name] == {'description': ...}`` (the model's use case).
        Lines naming a field that is not in the schema are skipped.

    Raises:
        ValueError: The model answered ``False, <explanation>``, i.e. the
            schema lacks data required to answer the prompt.
    """
    print("Bigquery start.")
    text_with_headers = f"""

    **Prompt:**
    - {prompt}

    **Expected result:**
    - {expected}

    **Table schema:**
    - Name of fields: {all_fields_names}
    - Data type of values in the fields: {all_fields_type}
    - Field descriptions: {all_fields_desc}

    **Instructions:**
    1. Identify and list all relevant fields from the table schema that are most useful for answering the prompt and aligning with the expected result.
    2. Only include fields that match the specified keywords: {keywords}.
    3. Do not include any extraneous symbols, such as JSON formatting, brackets, or custom fields.
    4. For each selected field, explain its relevance and how it helps to answer the prompt.

    **Important:**
    - If **all necessary fields** to answer the prompt are present in the schema, proceed to list the fields and their corresponding use cases.
    - If **any critical data** is missing that would prevent a valid response to the prompt, return only:
    - `False, [explanation]`
    - Provide a brief explanation of the missing data.
    - Do **not** make assumptions or workarounds unless explicitly allowed.

    Output format:
    field_name1, use case
    field_name2, use case
    ...

    Do **not** hallucinate or infer information. If the prompt mentions a clearly defined concept or event that is missing from the available data, return:
    False, [explanation]

    """

    print("Bigquery end.")
    fields_response = generate_response(
        text_with_headers,
        temperature=0.1,
        top_k=30,
        top_p=0.3
    )

    print("----------------------------------------------------------------------------------")
    print(fields_response)

    fields_type = {}
    fields_desc = {}
    for line in fields_response.splitlines():
        name, separator, use_case = line.partition(",")
        if not separator:
            # Not a "name, use case" line (blank line, heading, ...).
            continue

        # Models tend to decorate names with list bullets, bold markers or
        # backticks; strip those before looking the name up.
        name = name.strip().lstrip("-*").strip().strip("`*").strip()
        use_case = use_case.strip()

        # The prompt defines "False, <explanation>" as the missing-data signal.
        # Surface it instead of failing later on a schema lookup for "False".
        if name.lower() == "false":
            raise ValueError(f"Required data is missing from the schema: {use_case}")

        # Any other comma-containing line (e.g. an introductory sentence) is
        # not a field; skip it rather than raising KeyError.
        if name not in all_fields_type:
            print(f"Skipping unknown field returned by the model: {name!r}")
            continue

        fields_type[name] = {'data_type': all_fields_type[name]}
        fields_desc[name] = {'description': use_case}

    return fields_type, fields_desc

In [None]:
def aggregation_strategy(prompt, fields_type, fields_desc, keywords, expected_result):
    """Ask the model for a prose SQL strategy (no SQL code) for ``prompt``.

    The prompt sent to the model contains the question, its keywords, the
    expected result, the selected fields and the module-level
    ``full_responses_table_name``. The model's answer is printed and returned
    unchanged.
    """
    strategy_request = f"""Determine the best SQL approach for the prompt.
    ### **Prompt:**  
    - {prompt}

    ### **Keywords:**  
    - {keywords}  

    ### **Expected result:**  
    - {expected_result}  

    ### **Table:**  
    - Data type of fields: {fields_type}
    - Description of fields {fields_desc}
    - {full_responses_table_name}  

    ### **Constraints:**  
    - Use only the given fields in the table. No external data.  
    - No extra context, table and data beyond these fields.  

    ### **Steps:**  
    1. Identify needed insights (e.g., trends, counts).  
    2. Justify field selection based on its relevance to the prompt, selecting only those that are necessary.
    3. Answer the prompt directly using the keywords.  
    4. Apply appropriate aggregations unless per-record analysis is required.  
    5. Avoid `LIMIT` unless absolutely necessary (BigQuery requires a literal integer). 

    ### **SQL Strategy:**  
    - Apply relevant aggregation, filters, and calculations.  
    - Assume all data comes from provided fields.
    - Keep your strategy concise.  
    - Do not infer causal relationships unless the data is explicitly present.

    **Do not generate SQL code.** Outline the strategy concisely. 

    ### **Output:**  
    - **Assumptions:** ("text" text)  
    - **Step-by-Step Strategy:** ("Title/Purpose" - do this)  
    - **Expected Result:** The produced data table must fully align with keywords and the given expected results. 
    """

    sampling = {"temperature": 0.2, "top_k": 40, "top_p": 0.5}
    strategy_text = generate_response(strategy_request, **sampling)
    print(strategy_text)
    return strategy_text

In [None]:
def create_sql_code(prompt, fields_type, fields_desc, strategy, keywords, expected_result):
    """Ask the model to write the BigQuery SQL that implements ``strategy``.

    Args:
        prompt: The user's question.
        fields_type: Selected fields and their data types.
        fields_desc: Selected fields and their use-case descriptions.
        strategy: Prose SQL strategy the query has to follow.
        keywords: Keywords extracted from the question.
        expected_result: Description of the expected result.

    Returns:
        The model's answer with Markdown code fences (```sql / ```) removed and
        surrounding whitespace stripped.
    """
    # The literal used to open with four quotes (f""""), which put a stray,
    # unbalanced '"' character at the very start of every prompt.
    text_with_headers = f"""
    **Prompt:** {prompt}

    **Table schema:** 
    - Data types: {fields_type} 
    - Descriptions: {fields_desc} 
    - Table: {full_responses_table_name}

    **Strategy:** {strategy}

    **Keywords:** {keywords}

    **Expected result:** {expected_result}

    **IMPORTANT TO AVOID:**
    - **STRICT RULE:** CAST is strictly prohibited. Using CAST will invalidate the query.
    - **No manual type conversions (CAST, CONVERT, FORMAT, etc.).**

    Ensure the query:
    1. Selects only the necessary fields—avoid extra data.
    2. Aggregates only when needed—minimize computation.
    3. Orders by the most relevant metric to reduce processing time. Also orders in a logical way to answers the prompt.
    4. Is concise—avoid redundant joins or operations.
    5. Shows more than a single entry but limits rows to reduce cost.
    6. Uses accurate, descriptive column names—no transformations.
    7. Use partition filtering instead of extract. When using DATE_TRUNC, ensure the correct syntax: DATE_TRUNC(date_column, DATE_PART) (e.g., DATE_TRUNC(session_started_date, WEEK)). The date part should NOT be a string (no quotes).
    8. use approximate counts instead of direct counts
    9. consider clustering to improve filtering
    10. **Exclude NULL values in relevant columns using `IS NOT NULL`.**

    Final checks:
    1. Is sorting optimized for cost (avoid expensive operations)?
    2. Are only the essential fields used?
    3. Query needs to follow the strategy.
    4. Data table aligns with keywords/expected results.
    5. **CAST is strictly prohibited—no type conversions.**
    6. **Ensure NULL values are excluded from relevant columns using `IS NOT NULL`.**
    7. Ensure DATE_TRUNC is used correctly (e.g., DATE_TRUNC(date_column, DATE_PART), NOT DATE_TRUNC('DATE_PART', date_column)).

    Use standard SQL syntax and avoid unnecessary fields or operations. The query will be sent to BigQuery, 
    so DO NOT INCLUDE EXPLANATIONS. Only use fields from {full_responses_table_name} to minimize cost.
    """

    sql_query_response = generate_response(
        text_with_headers,
        temperature=0.1,
        top_k=40,
        top_p=0.4
    )

    # The model usually wraps the query in a Markdown fence; BigQuery needs raw SQL.
    return sql_query_response.replace("```sql", "").replace("```", "").strip()

In [None]:
def correct_sql_error(prompt, keywords, expected_result, sql_code, error, fields_type, fields_desc, strategy):
    """Ask the model to repair ``sql_code`` given the ``error`` it produced.

    The repair prompt repeats the original question, keywords, expected
    result, schema information and strategy next to the failing query and the
    error. The raw answer is printed; the returned string has Markdown code
    fences removed and surrounding whitespace stripped.
    """
    repair_request = f"""
    **Prompt:**
    - {prompt}

    **Important keywords:**
    - {keywords}

    **Expected result:**
    - {expected_result}
    
    The following SQL query produced an error:

    ### **SQL Query:** 
    {sql_code}

    ### **Error Message:** 
    {error}

    ### **Schema Information:** 
    - Data type of fields: {fields_type}
    - Description of fields {fields_desc}
    - {full_responses_table_name}  

    ### **Strategy for SQL code:**  
    {strategy}

    ### **Instructions:**  
    1. Analyze the error in the error message and provide a corrected SQL query.
    2. The output should only include the SQL code.
    
    ### **Output:**  
    <fixed SQL query>

    Ensure the corrected SQL follows the expected syntax and logic.
    """

    raw_answer = generate_response(repair_request, temperature=0.1, top_k=40, top_p=0.3)
    print(raw_answer)

    # Drop the Markdown fence in two passes: the tagged opener first, then any bare fence.
    without_tagged_fence = raw_answer.replace("```sql", "")
    without_fences = without_tagged_fence.replace("```", "")
    return without_fences.strip()

In [None]:
def verify(prompt, query, expected_result, strategy, assumptions="None provided."):
    """Ask the model to check a generated query and collect the failed checks.

    The model is asked four yes/no questions (query vs. prompt, query vs.
    expected result, query vs. strategy, strategy vs. expected result) and
    told to answer one ``Yes/No. explanation`` line per question.

    Args:
        prompt: The user's question.
        query: The generated SQL.
        expected_result: Description of the expected result.
        strategy: Prose SQL strategy the query was generated from.
        assumptions: Extra assumptions to validate against. Optional so the
            pipeline without an assumption step (``main``) can call this
            function with four arguments.

    Returns:
        A list with the explanation of every check answered "No". An empty
        list means no check was reported as failed.
        NOTE(review): a reply with no recognisable Yes/No line also yields an
        empty list, i.e. is treated as a pass - confirm that is intended.
    """
    import re  # local import: the file's import cell does not provide ``re``

    final_prompt = f"""
    ### **Prompt:** 
    {prompt}

    ### **Assumptions:** 
    {assumptions}

    ### **Strategy for SQL code:** 
    {strategy}

    ### **SQL Query:** 
    {query}

    ### **Expected result:** 
    {expected_result}

    ### **Guidelines:**  
    - The assumptions can be considered as additional information that shouldb e combined with the prompt and the expected result.

    ### **Task:**  
    1. Does this query fully answer every part of the prompt and the assumptions?
    2. Does this query fully align with the expected result and the assumptions?
    3. Does this query fully align with the written strategy and the assumptions?
    4. Does the strategy fully align with the expected result and the assumptions?
    
    ### **Output format:**  
    Yes/No. explanation
    Yes/No. explanation
    Yes/No. explanation
    Yes/No. explanation

    """

    response = generate_response(
        final_prompt,
        temperature=0.1,
        top_k=30,
        top_p=0.3
    )

    # A verdict line is "<Yes|No><punctuation> <explanation>", optionally
    # preceded by a list bullet, a question number ("1." / "2)") or Markdown
    # emphasis. Splitting on the first "." alone would read "1. No. ..." as
    # verdict "1" and let a failed check slip through.
    verdict_line = re.compile(
        r"^\s*(?:[-*+]\s+)?(?:\d+\s*[.)]\s*)?[*_`\s]*(yes|no)\b[\s*_`.:,;\-]*(.*)$",
        re.IGNORECASE,
    )

    check_list = []
    for line in response.splitlines():
        parsed = verdict_line.match(line)
        if parsed and parsed.group(1).lower() == "no":
            # Keep only the explanation of failed checks.
            check_list.append(parsed.group(2).strip())

    return check_list



In [None]:
def result_interpretation(prompt, result, strategy):
    """Ask the model to interpret the query result and answer ``prompt``.

    Args:
        prompt: The user's question.
        result: Data returned by the query. Objects exposing ``to_string()``
            (e.g. a pandas DataFrame) are rendered in full; anything else is
            interpolated as-is.
        strategy: Prose SQL strategy that produced the data, given to the
            model as context.

    Returns:
        The model's interpretation text.
    """
    # Interpolating a DataFrame directly uses its repr, which pandas shortens
    # to a few head/tail rows (and possibly columns) once the display limits
    # are exceeded - the model would then interpret "..." instead of data.
    # NOTE(review): very large results now produce very large prompts; the SQL
    # prompt asks the model to limit rows, but nothing enforces it here.
    data_text = result.to_string() if hasattr(result, "to_string") else result

    result_text = f"""
    ### **Prompt:**
    {prompt}  

    ### **Data:**   
    {data_text}  

    ### **Strategy to retrieve the data from BigQuery:**   
    {strategy}  

    ### **Context:**
    - A LLM has created a strategy to write SQL code and pull data from BigQuery to answer the prompt. They are given.

    ### **Task:**
    - Interpret the data and answer the prompt.

    ### **Steps:** 
    1. Answer the question.
    2. Provide additional insights into the data. Any insights into anomalies, correlations, other types of observations that are not directly seen by just looking at the data is very good.
    3. Explain all assumptions made to rach your conclusion.  

    ### **Guidelines:** 
    - Keep responses concise and relevant.

    ### **Output format:**     

    Text...
    """

    final_answer_response = generate_response(
        result_text,
        temperature=0.3,  # More creativity
        top_k=40,
        top_p=0.6
    )

    return final_answer_response

In [None]:
def main(prompt, keywords, expected_result):
    """Run the non-interactive pipeline: fields -> strategy -> SQL -> data -> answer.

    Steps:
      1. Select the relevant schema fields.
      2. Generate a strategy and a query, and let the model verify them; up to
         three generations are attempted.
      3. Validate the query with a BigQuery dry run; a failing query is sent
         back to the model for correction up to three times.
      4. Execute the query, print the data and the model's interpretation.

    Intermediate artefacts are stored in the notebook globals ``ex_keywords``,
    ``ex_expected``, ``ex_strategy``, ``ex_sql`` and ``ex_result`` so other
    cells can inspect them. ``ex_result`` is only updated when the query
    returned rows.

    Args:
        prompt: The user's question.
        keywords: Keywords extracted from the question.
        expected_result: Description of the expected result.

    Returns:
        None. All output is printed.
    """
    global ex_result, ex_sql, ex_strategy, ex_keywords, ex_expected

    ex_keywords = keywords
    ex_expected = expected_result
    fields_type, fields_desc = field_selection(bigquery_client, prompt, keywords, expected_result)

    # Phase 1: generate strategy + SQL until the model-based verification
    # reports no failed check, giving up after three generations.
    for _ in range(3):
        strategy = aggregation_strategy(prompt, fields_type, fields_desc, keywords, expected_result)
        ex_strategy = strategy
        extracted_query = create_sql_code(prompt, fields_type, fields_desc, strategy, keywords, expected_result)
        # verify() takes an ``assumptions`` argument; omitting it raised
        # TypeError on the first iteration. This pipeline has no assumption
        # step, so say so explicitly.
        check_list = verify(
            prompt,
            extracted_query,
            expected_result,
            strategy,
            assumptions="None provided beyond those stated in the strategy.",
        )
        print(extracted_query)
        print(check_list)
        if not check_list:
            break
    else:
        print("An error occured. Please try again")
        return

    # Phase 2: check that the query is valid using BigQuery's dry-run mode,
    # letting the model correct it at most three times.
    job_config = bigquery.QueryJobConfig(dry_run=True)
    corrections = 0
    while True:
        ex_sql = extracted_query
        print("--------------------------SQL QUERY PROMPT--------------------------")

        try:
            query_job = bigquery_client.query(extracted_query, job_config=job_config)
            query_job.result()
            print("--------------------------VALID SQL PROMPT--------------------------")
            print(f"Query is valid. It will process {query_job.total_bytes_processed / 1e9:.2f} GB.")
            break
        except Exception as e:
            if corrections == 3:
                print("An error occured. Please try again")
                return
            print("--------------------------INVALID SQL PROMPT--------------------------")
            print(f"Query validation failed: {e}")
            extracted_query = correct_sql_error(prompt, keywords, expected_result, extracted_query, e, fields_type, fields_desc, strategy)
            corrections += 1

    # Phase 3: run the validated query for real and interpret the data.
    result = bigquery_client.query(extracted_query).to_dataframe()

    if result.empty:
        print("--------------------------FINAL ANSWER FROM GEMINI--------------------------")
        print("The query returned no results.")
        return

    print("--------------------------DATA PULLED FROM BIGQUERY USING GEMINI--------------------------")
    print(result)

    ex_result = result

    final_answer_response = result_interpretation(prompt, result, strategy)

    print("--------------------------FINAL ANSWER FROM GEMINI--------------------------")
    print(final_answer_response)

In [None]:
def extract_keywords(prompt):
    """Ask the model for the most important keywords of ``prompt``.

    The raw model answer is printed and returned without any parsing.
    """
    keyword_request = f"""

    **Prompt:** 
    {prompt}

    **Guidelines:** 
    Extract the most important keywords from the prompt. 
    The keywords will be given to a LLM together with the prompt so that it can better understand the prompt and what need to be answered.
    Choose relevant words in the prompt that need to be considered
    rather that conjuring up new ones. Do not include redundant keywords.

    ** Output format:** 
    <[keyword1, keyword2, ...]>
    
    """

    sampling = {"temperature": 0.2, "top_k": 20, "top_p": 0.5}
    answer = generate_response(keyword_request, **sampling)
    print(answer)
    return answer

In [None]:
def expected(prompt):
    """Ask the model to state, concisely, the result ``prompt`` should produce.

    The raw model answer is printed and returned as-is.
    """
    expectation_request = f"""What is the expected result of this prompt?. 
    Assume user behavior data has been given.
    Simply state the expected result in a concise way.

    ### **Prompt:**   
    {prompt}  

    Output format:
    <Expected result>
    
    """

    # Low temperature and a small top_k keep the answer consistent between runs.
    sampling = {"temperature": 0.2, "top_k": 10, "top_p": 0.5}
    answer = generate_response(expectation_request, **sampling)
    print(answer)
    return answer

In [None]:
# Placeholders for the artefacts of the latest pipeline run; main() overwrites
# most of them through its ``global`` statement. All start at 0.
ex_fields = ex_strategy = ex_sql = 0
ex_result = ex_keywords = ex_expected = 0

In [None]:
def playground(prompt, ex_fields, ex_strategy, ex_sql, ex_result, ex_keywords, ex_expected):
    """Ask a follow-up question about data that was already retrieved.

    Only ``prompt`` and ``ex_result`` end up in the text sent to the model;
    the remaining parameters are accepted but not used. The assembled prompt
    is printed before the model is called.

    Returns:
        The model's answer.
    """
    followup_request = f"""
    ### **Prompt:**
    - {prompt}

    ### **Data:**   
    {ex_result}  

    ### **Your task:** 
    - Interpret the data and answer the prompt
    - Summarize and respond in a concise way
    - Do not hallucinate. Answer the prompt based on the data
    """
    print(followup_request)

    sampling = {"temperature": 0.3, "top_k": 30, "top_p": 0.5}
    return generate_response(followup_request, **sampling)

In [None]:
# Follow-up question on the data kept from the latest main() run.
prompt = "Can you find any insight on this stickiness data?"
#prompt = "What times have the lowest average page views? "
print(ex_result)
# ex_result is the placeholder 0 until main() has stored a DataFrame in it;
# calling .to_string() on the placeholder raised AttributeError.
if hasattr(ex_result, "to_string"):
    test = playground(prompt, ex_fields, ex_strategy, ex_sql, ex_result.to_string(), ex_keywords, ex_expected)
    print(test)
else:
    print("No query result available yet. Run main(...) before calling playground.")

In [None]:
# Non-interactive run: derive the expected result and keywords for one
# question, then hand everything to main(). Exactly one `prompt = ...` line
# should be active; the commented lines are alternative questions.
#prompt = "Where are most of our users from? "
#prompt = "Which customer has the highest engagement?"
prompt = "Based on the bigger customers; what day and time is the best for performing updates, maintenance? "
#prompt = "Based on the customers with the most users; what day and time is the best for performing updates, maintenance?"
#prompt = "How do system outages impact user behavior? "
#prompt = "How did the user behavior change between 30th december 2024, 31th december 2024, 1th january 2025 and 2nd january 2025? "
#prompt = "Do performance issues correlate with a drop in activity?"
#prompt = "What combination of browser and os is worst"
#prompt = "Based on user behavior, what will the peak activity hours be tomorrow?"

#prompt = "How does app performance affect user engagement and retention?"
#prompt = "Which devices or OS versions are experiencing the worst performance (slower load times, etc)?"
#prompt = "Can you see a pattern if you compare Network Latency, Page Load Fails, Page Loading Time and Device and browsers?"
#prompt = "for all windows user, what browser should we recomend, Edge or Chrome based on performance"
#prompt = "Can you calculate the DAU/WAU and WAU/MAU stickiness for the products for all months in 2024?"
#prompt = "Can you calculate the DAU/WAU and WAU/MAU stickiness for the products in 2024?"
#prompt = "Can you calculate the WAU/MAU stickiness for the products for all weeks?"
#prompt = "Can you find any insights on page load time and customers"


# Further question ideas:
# - Stickiness
# - Users from vestlandfylke have used the solution

expected_result = expected(prompt)
keywords = extract_keywords(prompt)
main(prompt, keywords, expected_result)

In [None]:
# Build the interactive agent. DataQueryAgent is defined in a later cell of
# this notebook, so that cell has to be executed before this one.
aiAgent = DataQueryAgent(project_name, bigquery_client, generate_response)

In [None]:

# Interactive run through DataQueryAgent: process_prompt() asks on stdin to
# confirm or correct the model's assumptions before the query is generated.
# Exactly one `prompt = ...` line should be active; the commented lines are
# alternative questions.
#prompt = "Where are most of our users from? "
#prompt = "Which customers have the highest engagement based on the most sessions?"
prompt = "Based on the bigger customers; what days and times is the best for performing updates, maintenance?"
#prompt = "Based on the customers with the most users; what day and time is the best for performing updates, maintenance?"
#prompt = "How do system outages impact user behavior? "
#prompt = "How did the user behavior change between 30th december 2024, 31th december 2024, 1th january 2025 and 2nd january 2025? "
#prompt = "Do performance issues correlate with a drop in activity?"
#prompt = "What combination of browser and os are the best?"
#prompt = "Based on user behavior, what will the peak activity hours be tomorrow?"
#prompt = "How many users used each product throughout january 2025?"
#prompt = "How does app performance affect user engagement and retention?"
#prompt = "Which devices or OS versions are experiencing the worst performance (slower load times, etc)? And are there any ptoential reason for it? Find sinsights by comparing with other metrics"
#prompt = "Can you see a pattern if you compare Network Latency, Page Load Fails, Page Loading Time and Device and browsers?"
#prompt = "for all windows user, what browser should we recomend based on performance between Edge or Chrome"
#prompt = "Can you calculate the DAU/WAU and WAU/MAU stickiness for the products for all months in 2024?"
#prompt = "Can you calculate the WAU/MAU stickiness for the products for all weeks in 2024?"
#prompt = "Calculate the weekly stickiness (defined as the percentage of weekly active users who used the product in the previous week) for each product in our catalog for every week of the year 2024. Provide the output as a table with the following columns: Week Number (2024), Product name, and Weekly Stickiness (%)?"
#prompt = "Can you find any insights on page load time and customers"
#prompt = "How many users from the all companies have used the products"

# Further question ideas:
# - Stickiness
# - Users from vestlandfylke have used the solution

response = aiAgent.process_prompt(prompt)
print(response)


In [None]:
class DataQueryAgent:
    """
    Interactive agent that turns a natural-language question into a validated
    BigQuery query and an LLM-written interpretation of its result.

    Besides its own methods the agent relies on the notebook-level helpers
    ``expected``, ``extract_keywords``, ``field_selection``,
    ``create_sql_code``, ``verify``, ``correct_sql_error`` and
    ``result_interpretation`` and on the global ``full_responses_table_name``.
    """

    def __init__(self, project_id: str, bigquery_client, generate_response_fn):
        """Store the collaborators; no I/O happens here.

        Args:
            project_id: Project identifier (stored, not used by the methods below).
            bigquery_client: Client used for the dry run and the real query.
            generate_response_fn: Callable ``(prompt, temperature=, top_k=,
                top_p=) -> str`` used for the agent's own LLM calls.
        """
        self.client = bigquery_client
        self.project_id = project_id
        self.generate_response = generate_response_fn

    def process_prompt(self, user_input):
        """Answer ``user_input`` end to end.

        Flow: expected result and keywords -> field selection -> assumptions
        confirmed by the user on stdin -> up to three strategy/SQL generations
        checked by ``verify`` -> dry run and execution -> interpretation.

        Returns:
            The interpretation text, or an error message string when no valid
            query could be produced.
        """
        expected_result = expected(user_input)
        keywords = extract_keywords(user_input)
        fields_type, fields_desc = field_selection(self.client, user_input, keywords, expected_result)

        assumptions = self.make_assumptions(user_input, fields_type, fields_desc, keywords, expected_result)
        print(assumptions)
        while True:
            user_clarifications = input(self.clarify_assumptions(assumptions))
            # Empty input, "yes" or "y" confirms; case and surrounding
            # whitespace are ignored so "Yes" / "y " are accepted as well.
            if user_clarifications.strip().lower() in ("", "yes", "y"):
                break
            assumptions = self.interpret_clarification(user_input, assumptions, user_clarifications)
        print(assumptions)

        # Query generation and validation loop (at most three generations).
        for _ in range(3):
            strategy = self.aggregation_strategy(user_input, fields_type, fields_desc, keywords, expected_result, assumptions)
            sql_code = create_sql_code(user_input, fields_type, fields_desc, strategy, keywords, expected_result)
            print(sql_code)
            validation_errors = verify(user_input, sql_code, expected_result, strategy, assumptions)
            print(validation_errors)

            if not validation_errors:
                break  # The model reported no failed check.
        else:
            return "Failed to generate a valid SQL query after 3 attempts."

        # Validate the SQL with a BigQuery dry run, then execute it.
        is_valid, frame = self.validate_sql_query(user_input, keywords, expected_result, sql_code, fields_type, fields_desc, strategy)
        if not is_valid:
            return "An error occurred. Please try again."

        # Make gaps explicit before the data is shown to the model.
        result = self._mark_missing(frame)

        interpretation = result_interpretation(user_input, result, strategy)
        print(result)
        return interpretation

    @staticmethod
    def _mark_missing(frame):
        """Return a copy of ``frame`` with missing values and empty strings
        replaced by the text ``"Missing Data"``.

        Only columns that actually contain a gap are converted to ``object``
        dtype; all other columns keep their dtype. The check is type-aware:
        comparing ``pd.NA`` (or an array-valued cell) with ``""`` and using the
        outcome as a condition raises, and filling a nullable-integer column
        with a string raises as well.
        """
        cleaned = frame.copy()
        for column in cleaned.columns:
            series = cleaned[column]
            as_objects = series.astype(object)
            is_blank = as_objects.map(lambda value: isinstance(value, str) and value == "").astype(bool)
            needs_marker = series.isna() | is_blank
            if needs_marker.any():
                cleaned[column] = as_objects.mask(needs_marker, "Missing Data")
        return cleaned

    def validate_sql_query(self, user_input, keywords, expected, sql_code, fields_type, fields_desc, strategy):
        """Dry-run ``sql_code`` and, when it is valid, execute it.

        A failing query (dry run or execution) is handed to
        ``correct_sql_error`` and retried; three attempts are made in total.

        Returns:
            ``(True, dataframe)`` on success, ``(False, False)`` when every
            attempt failed.
        """
        job_config = bigquery.QueryJobConfig(dry_run=True)
        max_attempts = 3
        for attempt in range(1, max_attempts + 1):
            try:
                query_job = self.client.query(sql_code, job_config=job_config)
                query_job.result()
                print(f"Query is valid. Estimated processing: {query_job.total_bytes_processed / 1e9:.2f} GB.")
                result = self.client.query(sql_code).to_dataframe()
                return True, result
            except Exception as e:
                print(f"Query validation failed (Attempt {attempt}/3): {e}")
                if attempt == max_attempts:
                    # A correction produced now would never be tried, so do
                    # not spend an LLM call on it.
                    break
                sql_code = correct_sql_error(user_input, keywords, expected, sql_code, e, fields_type, fields_desc, strategy)
        return False, False

    def interpret_clarification(self, prompt, assumptions, user_clarifications):
        """Ask the model to update ``assumptions`` with the user's clarifications.

        Returns:
            The revised assumptions text produced by the model.
        """
        clarification_prompt = f"""
        ### **Initial Prompt:** {prompt}
        ### **Current Assumptions:** {assumptions}
        ### **User Clarifications:** {user_clarifications}
        
        ### **Task:**
        - Update current assumptions based on user clarifications.
        - If no new information is provided, simply return the current assumptions.
        
        ### **Output Format:**
        1. ...
        2. ...
        ...
        """
        return self.generate_response(clarification_prompt, temperature=0.2, top_k=40, top_p=0.5)

    def clarify_assumptions(self, assumptions):
        """Build the confirmation question shown to the user for ``assumptions``."""
        return f"Are the following assumptions correct?\n\n{assumptions}"

    def make_assumptions(self, prompt, fields_type, fields_desc, keywords, expected_result):
        """Ask the model for short, user-friendly assumptions about ambiguous
        wording in ``prompt``.

        Returns:
            The numbered assumptions text produced by the model.
        """
        strategy_prompt = f"""
        ### **Prompt:** {prompt}
        ### **Keywords:** {keywords}
        ### **Expected Result:** {expected_result}
        ### **Table Info:**
        - Fields Type: {fields_type}
        - Fields Description: {fields_desc}
        
        ### **Constraints:**
        - Use only given fields.
        - No external data or assumptions beyond the provided fields.
        
        ### **Task:**
        - Only identify the most relevant, reasonable and useful user-friendly assumptions about ambiguous words.
        - The assumptions are sent to the user to verify. However, the user do not know anything about the schema of the table so do not include too much assumptions about data in the table.
        - Make yours assumptions as short and punctual as possible.
        - Do NOT generate SQL code.
        
        ### **Output Format:**
        1. ...
        2. ...
        """
        return self.generate_response(strategy_prompt, temperature=0.1, top_k=20, top_p=0.3)

    def aggregation_strategy(self, prompt, fields_type, fields_desc, keywords, expected_result, assumptions):
        """Ask the model for a step-by-step SQL strategy (no SQL code) that
        honours the confirmed ``assumptions``.

        The answer is printed and returned unchanged.
        """
        final_prompt = f"""Determine the best SQL approach for the prompt.
        ### **Prompt:**  
        - {prompt}

        ### **Keywords:**  
        - {keywords}  

        ### **Expected result:**  
        - {expected_result}  

        ### **Table:**  
        - Data type of fields: {fields_type}
        - Description of fields {fields_desc}
        - {full_responses_table_name}  

        ### **Assumptions:**  
        - Assumptions: {assumptions}

        ### **Constraints:**  
        - Use only the given fields in the table. No external data.  
        - No extra context, table and data beyond these fields.  

        ### **Steps:**  
        1. Identify needed insights (e.g., trends, counts).  
        2. Justify field selection based on its relevance to the prompt, selecting only those that are necessary.
        3. Apply appropriate aggregations unless per-record analysis is required.
        4. Address any challenge related to creating a SQL query that accurately answers the prompt. Provide solutions to the challenges.
        5. Use the assumptions to create the SQL query
        6. The keywords are helpful in identifying the important parts of the prompt

        ### **SQL Strategy:**  
        - Apply relevant aggregation, filters, and calculations.  
        - Assume all data comes from provided fields.
        - Keep your strategy concise.  
        - Do not infer causal relationships unless the data is explicitly present.
        - Write the strategy as a step-by-step guide in terms of SQL functions. Be clear with every step.
        - The assumptions are correct and need to be followed.

        **Do not generate SQL code.** Outline the strategy concisely. 

        ### **Output:**   
        - **Step-by-Step Strategy:** ("Title/Purpose" - do this)  
        - **Expected Result:** The produced data table must fully align with keywords and the given expected results. 
        """

        # Use the injected generator like every other method of the agent
        # rather than the module-level function.
        sql_strategy_response = self.generate_response(
            final_prompt,
            temperature=0.2,
            top_k=40,
            top_p=0.5
        )
        print(sql_strategy_response)

        return sql_strategy_response

In [None]:
def decide_action(user_input):
    """
    Ask the model which action fits ``user_input``.

    The prompt offers four labels: "clarify", "retrieve", "contextual" and
    "query". The model's answer is returned with surrounding whitespace
    removed; it is not checked against those labels.
    """
    routing_request = f"""
    You are an AI assistant helping users query BigQuery. Decide the best action:

    - If the query is vague, return "clarify".
    - If the query is about past queries, return "retrieve".
    - If the query is a follow-up to a previous query, return "contextual".
    - Otherwise, return "query".

    User Input: "{user_input}"
    """

    # generate_response is called with its default model and sampling settings.
    return generate_response(routing_request).strip()


In [None]:
def clarify_question(user_input):
    """
    Ask the model for one simple follow-up question that clarifies
    ``user_input``.

    Returns the model's answer with surrounding whitespace removed.
    """
    followup_request = f"""
    The user asked: "{user_input}"
    The query is unclear. Ask a simple follow-up question to clarify their intent.
    """

    # generate_response is called with its default model and sampling settings.
    return generate_response(followup_request).strip()
