In [50]:
import os
from dotenv import load_dotenv

In [10]:
from vanna.openai import OpenAI_Chat
from vanna.vannadb import VannaDB_VectorStore

In [66]:
load_dotenv("cwyd.antarang.env")

True

In [67]:
MY_VANNA_API_KEY = os.environ["MY_VANNA_API_KEY"]
MY_VANNA_MODEL = os.environ["MY_VANNA_MODEL"]
OPENAI_API_Key = os.environ["OPENAI_API_KEY"]
PATH_TO_BQ_SERVICE_ACCOUNT_JSON = os.environ["PATH_TO_BQ_SERVICE_ACCOUNT_JSON"]
BQ_PROJECT_ID = os.environ["BQ_PROJECT_ID"]
BQ_DATASET = os.environ["BQ_DATASET"]

In [61]:
class MyVanna(VannaDB_VectorStore, OpenAI_Chat):
    def __init__(self, config=None):
        VannaDB_VectorStore.__init__(self, vanna_model=MY_VANNA_MODEL, vanna_api_key=MY_VANNA_API_KEY, config=config)
        OpenAI_Chat.__init__(self, config=config)

vn = MyVanna(config={'api_key': OPENAI_API_Key, 'model': 'gpt-4o'})

In [65]:
vn.connect_to_bigquery(
    project_id=BQ_PROJECT_ID,
    cred_file_path=PATH_TO_BQ_SERVICE_ACCOUNT_JSON,
)

Not using Google Colab.


In [31]:
exclude_schemas = ["airbyte_internal", "dbt_staging_elementary", "pg_catalog"]

In [68]:
print(f"""
SELECT * FROM `{BQ_DATASET}`.`INFORMATION_SCHEMA`.`COLUMNS` 
WHERE column_name NOT LIKE '_airbyte%'
AND table_schema not in (
    {','.join(["`" + schema + "`" for schema in exclude_schemas])}
)
""")


SELECT * FROM `dalgo_DBT_Antarang_Foundation`.`INFORMATION_SCHEMA`.`COLUMNS` 
WHERE column_name NOT LIKE '_airbyte%'
AND table_schema not in (
    `airbyte_internal`,`dbt_staging_elementary`,`pg_catalog`
)



In [69]:
df_information_schema = vn.run_sql(f"""
SELECT * FROM `{BQ_DATASET}`.`INFORMATION_SCHEMA`.`COLUMNS` 
WHERE column_name NOT LIKE '_airbyte%'
AND table_schema not in (
    {','.join(["'" + schema + "'" for schema in exclude_schemas])}
)
""")

In [70]:
df_information_schema

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,is_nullable,data_type,is_generated,generation_expression,is_stored,is_hidden,is_updatable,is_system_defined,is_partitioning_column,clustering_ordinal_position,collation_name,column_default,rounding_mode
0,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,batch_no,1,YES,STRING,NEVER,,,NO,,NO,NO,,,,
1,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,batch_academic_year,2,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,
2,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,batch_language,3,YES,STRING,NEVER,,,NO,,NO,NO,,,,
3,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,batch_donor,4,YES,STRING,NEVER,,,NO,,NO,NO,,,,
4,antarang-dashboard,dalgo_DBT_Antarang_Foundation,int_global_session_attendance_copy,facilitator_name,5,YES,STRING,NEVER,,,NO,,NO,NO,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7949,antarang-dashboard,dalgo_DBT_Antarang_Foundation,ygMUM10,fp_correct,25,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,
7950,antarang-dashboard,dalgo_DBT_Antarang_Foundation,ygMUM10,saf_Received,26,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,
7951,antarang-dashboard,dalgo_DBT_Antarang_Foundation,ygMUM10,saf_raw,27,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,
7952,antarang-dashboard,dalgo_DBT_Antarang_Foundation,ygMUM10,saf_correct,28,YES,NUMERIC,NEVER,,,NO,,NO,NO,,,,


In [35]:
plan = vn.get_training_plan_generic(df_information_schema)
vn.train(plan=plan)

In [36]:
vn.train(documentation="""
antarang runs after-school programs for disadvantaged youth in indian cities
""")
vn.train(documentation="""
antarang does skills development and career counseling for their students
""")


Adding documentation....
Adding documentation....


'3099115-doc'

In [42]:
questions = [
    "what are the most popular programs which children enroll in?",
    "what are the different aspects on which children are mentored and graded?",
]

In [43]:
response = vn.generate_sql(question=questions[1], allow_llm_to_see_data=False)

SQL Prompt: [{'role': 'system', 'content': "You are a BigQuery SQL expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Additional Context \n\n\nantarang does skills development and career counseling for their students\n\n\n\nantarang does skills development and career counseling for their students\n\n\n\nantarang runs after-school programs for disadvantaged youth in indian cities\n\n\n\nantarang runs after-school programs for disadvantaged youth in indian cities\n\n\nThe following columns are in the stg_assessment_marks table in the antarang-dashboard database:\n\n|     | table_catalog      | table_schema                  | table_name           | column_name                 | data_type   |\n|----:|:-------------------|:------------------------------|:---------------------|:----------------------------|:------------|\n| 955 | antarang-dashboard | dalgo

In [44]:
print(response)

The LLM is not allowed to see the data in your database. Your question requires database introspection to generate the necessary SQL. Please set allow_llm_to_see_data=True to enable this.


In [45]:
questions[1]

'what are the different aspects on which children are mentored and graded?'

In [46]:
response = vn.generate_sql(question=questions[1], allow_llm_to_see_data=True)

SQL Prompt: [{'role': 'system', 'content': "You are a BigQuery SQL expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Additional Context \n\n\nantarang does skills development and career counseling for their students\n\n\n\nantarang does skills development and career counseling for their students\n\n\n\nantarang runs after-school programs for disadvantaged youth in indian cities\n\n\n\nantarang runs after-school programs for disadvantaged youth in indian cities\n\n\nThe following columns are in the stg_assessment_marks table in the antarang-dashboard database:\n\n|     | table_catalog      | table_schema                  | table_name           | column_name                 | data_type   |\n|----:|:-------------------|:------------------------------|:---------------------|:----------------------------|:------------|\n| 955 | antarang-dashboard | dalgo

In [48]:
print(response)

SELECT DISTINCT assessment_record_type 
FROM `antarang-dashboard.dalgo_DBT_Antarang_Foundation.stg_assessment_marks`;
