In [1]:
from vanna.openai import OpenAI_Chat
from vanna.pgvector import PG_VectorStore

from langchain_openai import OpenAIEmbeddings

## Sql Generation

In [4]:
import os
from dotenv import load_dotenv

load_dotenv()

class DalgoVannaClient(PG_VectorStore, OpenAI_Chat):
    def __init__(self, openai_config={}, pg_vector_config={}):
        PG_VectorStore.__init__(
            self,
            config={
                "connection_string": "postgresql+psycopg://{username}:{password}@{server}:{port}/{database}".format(
                    **{
                        "username": os.environ["PGVECTOR_USER"],
                        "password": os.environ["PGVECTOR_PASSWORD"],
                        "server": os.environ["PGVECTOR_HOST"],
                        "port": os.environ["PGVECTOR_PORT"],
                        "database": os.environ["PGVECTOR_DB"],
                    }
                ),
                **pg_vector_config,
            },
        )
        OpenAI_Chat.__init__(
            self,
            config={
                "api_key": os.environ["OPENAI_API_KEY"],
                "model": "gpt-4o-mini",
                **openai_config,
            },
        )


In [5]:
vn_client = DalgoVannaClient(
    openai_config={
        "initial_prompt": "Please qualify all table names with their schema names in the generated SQL"
    },
    pg_vector_config={"embedding_function": OpenAIEmbeddings()},
)

In [6]:
vn_client.connect_to_postgres(
    host=os.environ["WAREHOUSE_HOST"],
    dbname=os.environ["WAREHOUSE_DBNAME"],
    user=os.environ["WAREHOUSE_USER"],
    password=os.environ["WAREHOUSE_PASSWORD"],
    port=os.environ["WAREHOUSE_PORT"]
)

In [10]:


# traning plan
exclude_schemas = ["airbyte_internal", "dbt_staging_elementary"]

quote_schema = lambda schema: f"'{schema}'"

train_df = vn_client.run_sql(f"""SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE \
                             table_schema not in (\
                             {','.join([quote_schema(schema) for schema in exclude_schemas])}\
                            ) """)

In [11]:
train_df

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,cityfinance,pg_catalog,pg_type,oid,1,,NO,oid,,,...,NO,,,,,,NO,NEVER,,YES
1,cityfinance,pg_catalog,pg_proc,proleakproof,12,,NO,boolean,,,...,NO,,,,,,NO,NEVER,,YES
2,cityfinance,pg_catalog,pg_proc,proisstrict,13,,NO,boolean,,,...,NO,,,,,,NO,NEVER,,YES
3,cityfinance,pg_catalog,pg_proc,proretset,14,,NO,boolean,,,...,NO,,,,,,NO,NEVER,,YES
4,cityfinance,pg_catalog,pg_proc,provolatile,15,,NO,"""char""",,,...,NO,,,,,,NO,NEVER,,YES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1811,cityfinance,information_schema,routine_sequence_usage,routine_name,6,,YES,name,,,...,NO,,,,,,NO,NEVER,,NO
1812,cityfinance,information_schema,routine_sequence_usage,sequence_catalog,7,,YES,name,,,...,NO,,,,,,NO,NEVER,,NO
1813,cityfinance,pg_catalog,pg_available_extension_versions,name,1,,YES,name,,,...,NO,,,,,,NO,NEVER,,NO
1814,cityfinance,information_schema,routine_sequence_usage,sequence_schema,8,,YES,name,,,...,NO,,,,,,NO,NEVER,,NO


In [12]:
training_plan = vn_client.get_training_plan_generic(df=train_df)

In [14]:
vn_client.train(plan=training_plan)

### Janaagraha

In [None]:
chart = [{"chart_name": "City Eligibility", "schema": "mongo_staging", "table": "grants_available"}]

user_question = "Give me the % eligibility across panchayats, municipalities and area councils"

vanna_query = f"""
The user is looking at a chart with the following details:
- Chart Name: {chart[0]['chart_name']}
- Schema: {chart[0]['schema']}
- Table: {chart[0]['table']}

The user is asking the following question: 
- {user_question}

Answer maybe or not be available in the context present
"""


vn_client.generate_sql(question=vanna_query, allow_llm_to_see_data=False)

SQL Prompt: [{'role': 'system', 'content': "Please qualify all table names with their schema names in the generated SQL\n===Additional Context \n\nThe following columns are in the schemata table in the cityfinance database:\n\n|      | table_catalog   | table_schema       | table_name   | column_name                   | data_type         |\n|-----:|:----------------|:-------------------|:-------------|:------------------------------|:------------------|\n| 1702 | cityfinance     | information_schema | schemata     | catalog_name                  | name              |\n| 1703 | cityfinance     | information_schema | schemata     | schema_name                   | name              |\n| 1704 | cityfinance     | information_schema | schemata     | schema_owner                  | name              |\n| 1705 | cityfinance     | information_schema | schemata     | default_character_set_catalog | name              |\n| 1708 | cityfinance     | information_schema | schemata     | default_charac

'The provided context does not contain any information about the `mongo_staging.grants_available` table or its columns. Therefore, I cannot generate a SQL query to determine the percentage of eligibility across panchayats, municipalities, and area councils. Please provide additional context or information about the structure of the `grants_available` table in the `mongo_staging` schema.'