In [71]:
import os
from dotenv import load_dotenv

In [86]:
from vanna.openai import OpenAI_Chat
from vanna.pgvector import PG_VectorStore

In [89]:
load_dotenv("cwyd.antarang.env", override=True)

True

In [90]:
MY_VANNA_API_KEY = os.environ["MY_VANNA_API_KEY"]
MY_VANNA_MODEL = os.environ["MY_VANNA_MODEL"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
PATH_TO_BQ_SERVICE_ACCOUNT_JSON = os.environ["PATH_TO_BQ_SERVICE_ACCOUNT_JSON"]
BQ_PROJECT_ID = os.environ["BQ_PROJECT_ID"]
BQ_DATASET = os.environ["BQ_DATASET"]

PGVECTOR_USER = os.environ["PGVECTOR_USER"]
PGVECTOR_PASSWORD = os.environ["PGVECTOR_PASSWORD"]
PGVECTOR_HOST = os.environ["PGVECTOR_HOST"]
PGVECTOR_PORT = os.environ["PGVECTOR_PORT"]
PGVECTOR_DB = os.environ["PGVECTOR_DB"]

In [91]:
class DalgoVannaClient(PG_VectorStore, OpenAI_Chat):
    def __init__(self, openai_config={}, pg_vector_config={}):
        PG_VectorStore.__init__(
            self,
            config={
                "connection_string": "postgresql+psycopg://{username}:{password}@{server}:{port}/{database}".format(
                    **{
                        "username": PGVECTOR_USER,
                        "password": PGVECTOR_PASSWORD,
                        "server": PGVECTOR_HOST,
                        "port": PGVECTOR_PORT,
                        "database": PGVECTOR_DB,
                    }
                ),
                **pg_vector_config,
            },
        )
        OpenAI_Chat.__init__(
            self,
            config={
                "api_key": OPENAI_API_KEY,
                "model": "gpt-4o-mini",
                **openai_config,
            },
        )

vn = DalgoVannaClient()

In [92]:
vn.connect_to_bigquery(
    project_id=BQ_PROJECT_ID,
    cred_file_path=PATH_TO_BQ_SERVICE_ACCOUNT_JSON,
)

Not using Google Colab.


In [93]:
exclude_schemas = ["airbyte_internal", "dbt_staging_elementary", "pg_catalog"]

In [94]:
print(f"""
SELECT * FROM `{BQ_DATASET}`.`INFORMATION_SCHEMA`.`COLUMNS` 
WHERE column_name NOT LIKE '_airbyte%'
AND table_schema not in (
    {','.join(["`" + schema + "`" for schema in exclude_schemas])}
)
""")


SELECT * FROM `dalgo_DBT_Antarang_Foundation`.`INFORMATION_SCHEMA`.`COLUMNS` 
WHERE column_name NOT LIKE '_airbyte%'
AND table_schema not in (
    `airbyte_internal`,`dbt_staging_elementary`,`pg_catalog`
)



In [95]:
df_information_schema = vn.run_sql(f"""
SELECT * FROM `{BQ_DATASET}`.`INFORMATION_SCHEMA`.`COLUMNS` 
WHERE column_name NOT LIKE '_airbyte%'
AND table_schema not in (
    {','.join(["'" + schema + "'" for schema in exclude_schemas])}
)
""")

In [96]:
df_information_schema

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,is_nullable,data_type,is_generated,generation_expression,is_stored,is_hidden,is_updatable,is_system_defined,is_partitioning_column,clustering_ordinal_position,collation_name,column_default,rounding_mode
0,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,batch_no,1,YES,STRING,NEVER,,,NO,,NO,NO,,,,
1,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,batch_academic_year,2,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,
2,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,batch_language,3,YES,STRING,NEVER,,,NO,,NO,NO,,,,
3,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,batch_donor,4,YES,STRING,NEVER,,,NO,,NO,NO,,,,
4,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,facilitator_name,5,YES,STRING,NEVER,,,NO,,NO,NO,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7948,antarang-dashboard,dalgo_DBT_Antarang_Foundation,ygMUM10,fp_correct,25,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,
7949,antarang-dashboard,dalgo_DBT_Antarang_Foundation,ygMUM10,saf_Received,26,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,
7950,antarang-dashboard,dalgo_DBT_Antarang_Foundation,ygMUM10,saf_raw,27,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,
7951,antarang-dashboard,dalgo_DBT_Antarang_Foundation,ygMUM10,saf_correct,28,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,


In [97]:
plan = vn.get_training_plan_generic(df_information_schema)
vn.train(plan=plan)

In [98]:
vn.train(documentation="""
antarang runs after-school programs for disadvantaged youth in indian cities
""")
vn.train(documentation="""
antarang does skills development and career counseling for their students
""")


'136b0177-bae8-4fd4-b35a-25a3b2ca3b09-doc'

In [99]:
questions = [
    "what are the most popular programs which children enroll in?",
    "what are the different aspects on which children are mentored and graded?",
    "how many schools does antarang operate in?",
]

In [100]:
response = vn.generate_sql(question=questions[2], allow_llm_to_see_data=False)

SQL Prompt: [{'role': 'system', 'content': "You are a BigQuery SQL expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Additional Context \n\n\nantarang runs after-school programs for disadvantaged youth in indian cities\n\n\n\nantarang does skills development and career counseling for their students\n\n\nThe following columns are in the ys table in the antarang-dashboard database:\n\n|      | table_catalog      | table_schema                  | table_name   | column_name   | data_type   |\n|-----:|:-------------------|:------------------------------|:-------------|:--------------|:------------|\n| 7091 | antarang-dashboard | dalgo_DBT_Antarang_Foundation | ys           | Studentcount  | INT64       |\n\nThe following columns are in the seed_district table in the antarang-dashboard database:\n\n|     | table_catalog      | table_schema                

In [101]:
print(response)

SELECT COUNT(DISTINCT school_id) AS total_schools
FROM stg_school;


In [102]:
questions[1]

'what are the different aspects on which children are mentored and graded?'

In [103]:
response = vn.generate_sql(question=questions[1], allow_llm_to_see_data=True)

SQL Prompt: [{'role': 'system', 'content': "You are a BigQuery SQL expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Additional Context \n\n\nantarang does skills development and career counseling for their students\n\n\n\nantarang runs after-school programs for disadvantaged youth in indian cities\n\n\nThe following columns are in the student_global_assessment_status_metadata table in the antarang-dashboard database:\n\n|      | table_catalog      | table_schema                  | table_name                                | column_name     | data_type   |\n|-----:|:-------------------|:------------------------------|:------------------------------------------|:----------------|:------------|\n| 7281 | antarang-dashboard | dalgo_DBT_Antarang_Foundation | student_global_assessment_status_metadata | table_name      | STRING      |\n| 7282 | antarang-d

In [104]:
print(response)

Error running intermediate SQL: 400 Unrecognized name: column_name at [3:7]

Location: asia-south1
Job ID: 6886cf00-9e27-45c3-a9f9-64fe44618124

