In [64]:
import os
from dotenv import load_dotenv

In [None]:
from vanna.openai import OpenAI_Chat
from vanna.pgvector import PG_VectorStore

In [68]:
load_dotenv("cwyd.inrem.env", override=True)

True

In [69]:
MY_VANNA_API_KEY = os.environ["MY_VANNA_API_KEY"]
MY_VANNA_MODEL = os.environ["MY_VANNA_MODEL"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
PATH_TO_BQ_SERVICE_ACCOUNT_JSON = os.environ["PATH_TO_BQ_SERVICE_ACCOUNT_JSON"]
BQ_PROJECT_ID = os.environ["BQ_PROJECT_ID"]
BQ_DATASET = os.environ["BQ_DATASET"]

PGVECTOR_USER = os.environ["PGVECTOR_USER"]
PGVECTOR_PASSWORD = os.environ["PGVECTOR_PASSWORD"]
PGVECTOR_HOST = os.environ["PGVECTOR_HOST"]
PGVECTOR_PORT = os.environ["PGVECTOR_PORT"]
PGVECTOR_DB = os.environ["PGVECTOR_DB"]

In [71]:
class DalgoVannaClient(PG_VectorStore, OpenAI_Chat):
    def __init__(self, openai_config={}, pg_vector_config={}):
        PG_VectorStore.__init__(
            self,
            config={
                "connection_string": "postgresql+psycopg://{username}:{password}@{server}:{port}/{database}".format(
                    **{
                        "username": PGVECTOR_USER,
                        "password": PGVECTOR_PASSWORD,
                        "server": PGVECTOR_HOST,
                        "port": PGVECTOR_PORT,
                        "database": PGVECTOR_DB,
                    }
                ),
                **pg_vector_config,
            },
        )
        OpenAI_Chat.__init__(
            self,
            config={
                "api_key": OPENAI_API_KEY,
                "model": "gpt-4o-mini",
                **openai_config,
            },
        )

vn = DalgoVannaClient()

In [72]:
vn.connect_to_bigquery(
    project_id=BQ_PROJECT_ID,
    cred_file_path=PATH_TO_BQ_SERVICE_ACCOUNT_JSON,
)

Not using Google Colab.


In [73]:
exclude_schemas = ["airbyte_internal", "dbt_staging_elementary", "pg_catalog"]

In [74]:
print(f"""
SELECT * FROM `{BQ_DATASET}`.`INFORMATION_SCHEMA`.`COLUMNS` 
WHERE column_name NOT LIKE '_airbyte%'
AND table_schema not in (
    {','.join(["`" + schema + "`" for schema in exclude_schemas])}
)
""")


SELECT * FROM `intermediate_frappe`.`INFORMATION_SCHEMA`.`COLUMNS` 
WHERE column_name NOT LIKE '_airbyte%'
AND table_schema not in (
    `airbyte_internal`,`dbt_staging_elementary`,`pg_catalog`
)



In [75]:
df_information_schema = vn.run_sql(f"""
SELECT * FROM `{BQ_DATASET}`.`INFORMATION_SCHEMA`.`COLUMNS` 
WHERE column_name NOT LIKE '_airbyte%'
AND table_schema not in (
    {','.join(["'" + schema + "'" for schema in exclude_schemas])}
)
""")

In [76]:
df_information_schema

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,is_nullable,data_type,is_generated,generation_expression,is_stored,is_hidden,is_updatable,is_system_defined,is_partitioning_column,clustering_ordinal_position,collation_name,column_default,rounding_mode
0,dalgo-412710,intermediate_frappe,envquality_prod,villageentity_id,1,YES,STRING,NEVER,,,NO,,NO,NO,,,,
1,dalgo-412710,intermediate_frappe,envquality_prod,reporting_date,2,YES,STRING,NEVER,,,NO,,NO,NO,,,,
2,dalgo-412710,intermediate_frappe,envquality_prod,reporting_year,3,YES,INT64,NEVER,,,NO,,NO,NO,,,,
3,dalgo-412710,intermediate_frappe,envquality_prod,reporting_month,4,YES,INT64,NEVER,,,NO,,NO,NO,,,,
4,dalgo-412710,intermediate_frappe,envquality_prod,reporting_year_month,5,YES,STRING,NEVER,,,NO,,NO,NO,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,dalgo-412710,intermediate_frappe,behaviour_prod,villagename,10,YES,STRING,NEVER,,,NO,,NO,NO,,,,
162,dalgo-412710,intermediate_frappe,behaviour_prod,statename,11,YES,STRING,NEVER,,,NO,,NO,NO,,,,
163,dalgo-412710,intermediate_frappe,behaviour_prod,districtname,12,YES,STRING,NEVER,,,NO,,NO,NO,,,,
164,dalgo-412710,intermediate_frappe,behaviour_prod,blockname,13,YES,STRING,NEVER,,,NO,,NO,NO,,,,


In [77]:
plan = vn.get_training_plan_generic(df_information_schema)
vn.train(plan=plan)

In [78]:
vn.train(documentation="""
inrem runs water quality programs in villages around india
""")
vn.train(documentation="""
examples "village entities" are schools, water pumps, agricultural land, and households
""")


'bffad8e8-f2ea-44c1-b5a8-195ad6b5ff1d-doc'

In [79]:
questions = [
    "what types of contaminants are measured in water quality programs?",
    "which villages have the highest levels of iron?",
    "which village entities have the highest levels of iron?",
]

In [80]:
response = vn.generate_sql(question=questions[0], allow_llm_to_see_data=False)

SQL Prompt: [{'role': 'system', 'content': 'You are a BigQuery SQL expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Additional Context \n\n\ninrem runs water quality programs in villages around india\n\n\n\nexamples "village entities" are schools, water pumps, agricultural land, and households\n\n\nThe following columns are in the envqualityreporttable table in the dalgo-412710 database:\n\n|    | table_catalog   | table_schema        | table_name            | column_name                 | data_type   |\n|---:|:----------------|:--------------------|:----------------------|:----------------------------|:------------|\n| 74 | dalgo-412710    | intermediate_frappe | envqualityreporttable | envqualityreporttable_id    | STRING      |\n| 75 | dalgo-412710    | intermediate_frappe | envqualityreporttable | envqualityreport_id         | STRING      |\n| 

In [56]:
print(response)

SELECT DISTINCT environmental_parameter 
FROM `dalgo-412710.intermediate_frappe.envqualityreporttable`;


In [58]:
response = vn.generate_sql(question=questions[1], allow_llm_to_see_data=False)

SQL Prompt: [{'role': 'system', 'content': 'You are a BigQuery SQL expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Additional Context \n\n\ninrem runs water quality programs in villages around india\n\n\n\nexamples "village entities" are schools, water pumps, agricultural land, and households\n\n\nThe following columns are in the village table in the dalgo-412710 database:\n\n|     | table_catalog   | table_schema        | table_name   | column_name   | data_type   |\n|----:|:----------------|:--------------------|:-------------|:--------------|:------------|\n| 129 | dalgo-412710    | intermediate_frappe | village      | village_id    | STRING      |\n| 130 | dalgo-412710    | intermediate_frappe | village      | name          | STRING      |\n| 131 | dalgo-412710    | intermediate_frappe | village      | state         | STRING      |\n| 132 | da

In [60]:
response = vn.generate_sql(question=questions[2], allow_llm_to_see_data=False)

SQL Prompt: [{'role': 'system', 'content': 'You are a BigQuery SQL expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Additional Context \n\n\nexamples "village entities" are schools, water pumps, agricultural land, and households\n\n\n\ninrem runs water quality programs in villages around india\n\n\nThe following columns are in the village_to_village_entity table in the dalgo-412710 database:\n\n|     | table_catalog   | table_schema        | table_name                | column_name      | data_type   |\n|----:|:----------------|:--------------------|:--------------------------|:-----------------|:------------|\n| 126 | dalgo-412710    | intermediate_frappe | village_to_village_entity | villagetable_id  | STRING      |\n| 127 | dalgo-412710    | intermediate_frappe | village_to_village_entity | village_id       | STRING      |\n| 128 | dalgo-412710  