In [5]:
import setup
import os

# from chartgpt.agents.agent_toolkits.bigquery.utils import get_tables_summary
from app.config.production import datasets

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [6]:
from google.oauth2 import service_account
import json
from google.cloud import bigquery


credentials = service_account.Credentials.from_service_account_info(json.loads(os.environ["GCP_SERVICE_ACCOUNT"], strict=False))
client = bigquery.Client(credentials=credentials)

In [7]:
from typing import Dict, List, Tuple, Union

from app.config import Dataset


def get_tables_summary(
        client: bigquery.Client,
        datasets: List[Dataset],
        include_types = False
) -> Dict[str, List[Dict[str, List[Union[Tuple[str, str], str]]]]]:
    # Generate tables_summary for all tables in datasets
    tables_summary = {}
    for dataset in datasets:
        dataset_id = dataset.id
        tables_summary[dataset_id] = {}
        for table_id in dataset.tables:
            table_ref = client.dataset(dataset_id).table(table_id)
            table = client.get_table(table_ref)
            tables_summary[dataset_id][table_id] = [
                (schema_field.name, schema_field.field_type) if include_types else schema_field.name
                for schema_field in table.schema
            ]
    return tables_summary

In [8]:
tables_summary = get_tables_summary(client=client, datasets=datasets, include_types=True)
str(tables_summary)

"{'metaquants_nft_finance_aggregator': {'p2p_and_p2pool_loan_data_borrow': [('transaction_hash', 'STRING'), ('block_timestamp', 'TIMESTAMP'), ('loan_id', 'STRING'), ('to_address', 'STRING'), ('from_address', 'STRING'), ('principal_amount', 'FLOAT'), ('repayment_amount', 'FLOAT'), ('erc20_address', 'STRING'), ('erc20_name', 'STRING'), ('due_date', 'TIMESTAMP'), ('duration_in_days', 'FLOAT'), ('apr', 'FLOAT'), ('token_id', 'STRING'), ('collection_address', 'STRING'), ('protocol', 'STRING'), ('amt_in_usd', 'FLOAT'), ('roll_over', 'BOOLEAN'), ('block_number', 'STRING'), ('p2p_p2pool', 'STRING')]}}"

In [9]:
from chartgpt.guardrails.applications.text2sql import Text2Sql
# from chartgpt.guardrails.validators import BugFreeBigQuerySQL

EXAMPLES = "chartgpt/guardrails/applications/examples.json"

with open(EXAMPLES, "r") as f:
    examples = json.load(f)

app = Text2Sql(
    client=client,
    sql_schema=str(tables_summary),
    examples=examples,
)

['output_schema', 'previous_response']


In [10]:
query = app("What is the average APR for the ***REMOVED*** protocol in the past 6 months")
query

SELECT AVG(apr) as average_apr 
FROM `metaquants_nft_finance_aggregator.p2p_and_p2pool_loan_data_borrow` 
WHERE LOWER(protocol) = LOWER('***REMOVED***') 
AND block_timestamp >= TIMESTAMP(DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH))
None


AttributeError: 'NoneType' object has no attribute 'get'

In [None]:
app.guard.base_prompt

"\nHere's schema about the database that you can use to generate the GoogleSQL query.\nTry to avoid using joins if the data can be retrieved from the same table.\n\n{db_info}\n\nI will give you a list of examples. Write a GoogleSQL query similar to the examples below:\n\n{examples}\n\nINSTRUCTIONS:\n---------\n{nl_instruction}\n\nQUERY:\n---------\n"

In [8]:
query_job = client.query(query)

for row in query_job:
    print(row)

BadRequest: 400 Syntax error: Expected end of input but got "{" at [1:1]

Location: US
Job ID: 6c5cc2b3-66b5-4cba-a6a3-97c5c3ad351d


In [3]:
import openai
import os

from pydantic import BaseModel
from rich import print
from typing import List

import guardrails as gd


class Analytics(BaseModel):
    """
    Analytics GoogleSQL query and Python code to execute it
    
    """

    sql_query: str
    python_code: str

guard = gd.Guard.from_pydantic(Analytics, prompt="What is the average APR for the ***REMOVED*** protocol in the past 6 months")

raw_llm_output, validated_output = guard(
    openai.ChatCompletion.create,
    model="gpt-4-0613",
    max_tokens=1024,
    temperature=0.0,
)

In [4]:
validated_output

{'sql_query': 'SELECT AVG(APR) as average_APR FROM ***REMOVED***_protocol WHERE date >= DATE_SUB(CURRENT_DATE, INTERVAL 6 MONTH)',
 'python_code': 'import pandas as pd\nimport mysql.connector\n\n# Establish a connection to the database\nmydb = mysql.connector.connect(host=\'localhost\', user=\'root\', password=\'password\', database=\'database\')\n\n# Create a cursor object\nmycursor = mydb.cursor()\n\n# Execute the SQL query\nmycursor.execute("SELECT AVG(APR) as average_APR FROM ***REMOVED***_protocol WHERE date >= DATE_SUB(CURRENT_DATE, INTERVAL 6 MONTH)")\n\n# Fetch all the rows\nrows = mycursor.fetchall()\n\n# Convert the result into a pandas dataframe\ndf = pd.DataFrame(rows, columns=[\'average_APR\'])\n\n# Get the average APR\naverage_APR = df[\'average_APR\'].mean()\n\n# Close the database connection\nmydb.close()\n\nreturn average_APR'}

In [None]:
rail_str = """
<rail version="0.1">

<output>
    <pythoncode
        name="python_code"
        format="bug-free-python"
        on-fail-bug-free-python="reask"
    />
</output>


<prompt>
Given a data analysis query, write a short Python code snippet that answers the query using typical Python data analysis libraries.

The Python code snippet should return a Pandas DataFrame or show an appropriate Plotly chart.

You have access to an authenticated BigQuery client object named `client`.

BigQuery Tables Summary:
{{tables_summary}}

Data Analysis Query:
{{data_analysis_query}}

@complete_json_suffix</prompt>

</rail>
"""

In [None]:
import guardrails as gd

from rich import print

guard = gd.Guard.from_rail_string(rail_str)

In [None]:
print(guard.base_prompt)

In [None]:
import openai

data_analysis_query = """
Create sample data for a Lorenz Attractor system and plot the results.
"""

raw_llm_response, validated_response = guard(
    openai.Completion.create,
    prompt_params={
        "tables_summary": tables_summary,
        "data_analysis_query": data_analysis_query
    },
    engine="text-davinci-003",
    max_tokens=2048,
    temperature=0.5,
)

In [None]:
print(raw_llm_response)

SyntaxError: unterminated string literal (detected at line 1) (1394353013.py, line 1)

In [None]:
print(validated_response)

In [None]:
print(validated_response["python_code"])

TypeError: 'NoneType' object is not subscriptable

In [None]:
try:
    exec(validated_response["python_code"])
    print("Success!")
except Exception as e:
    print("Failed!")

In [None]:
from langchain import LLMMathChain, OpenAI, SerpAPIWrapper, SQLDatabase, SQLDatabaseChain
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.chat_models import ChatOpenAI

In [None]:
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613", streaming=True, callbacks=[StreamingStdOutCallbackHandler()],)
# search = SerpAPIWrapper()
# llm_math_chain = LLMMathChain.from_llm(llm=llm, verbose=True)
# db = SQLDatabase.from_uri("sqlite:///../../../../../notebooks/Chinook.db")
# db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)
tools = [
    # Tool(
    #     name = "Search",
    #     func=search.run,
    #     description="useful for when you need to answer questions about current events. You should ask targeted questions"
    # ),
    Tool(
        name="Calculator",
        func=llm_math_chain.run,
        description="useful for when you need to answer questions about math"
    ),
    # Tool(
    #     name="FooBar-DB",
    #     func=db_chain.run,
    #     description="useful for when you need to answer questions about FooBar. Input should be in the form of a question containing full context"
    # )
]

In [None]:
agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)

In [None]:
agent.run("Who is Leo DiCaprio's girlfriend? What is her current age raised to the 0.43 power?")



[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `Calculator` with `28^0.43`


[0m

[1m> Entering new  chain...[0m
28^0.43[32;1m[1;3m```text
28**0.43
```
...numexpr.evaluate("28**0.43")...
[0m
Answer: [33;1m[1;3m4.1906168361987195[0m
[1m> Finished chain.[0m
[36;1m[1;3mAnswer: 4.1906168361987195[0m[32;1m[1;3mLeo DiCaprio's girlfriend is Camila Morrone. Her current age raised to the 0.43 power is approximately 4.19.[0m

[1m> Finished chain.[0m


"Leo DiCaprio's girlfriend is Camila Morrone. Her current age raised to the 0.43 power is approximately 4.19."

In [None]:
import inspect
import re

from chartgpt.agents.agent_toolkits.bigquery.utils import get_example_query
from chartgpt.tools.python.tool import PythonAstREPLTool

credentials = service_account.Credentials.from_service_account_info(json.loads(os.environ["GCP_SERVICE_ACCOUNT"], strict=False))
bigquery_client = bigquery.Client(credentials=credentials)

tables_summary = get_tables_summary(client=bigquery_client, datasets=datasets)
example_query = get_example_query(datasets=datasets)

example_query = get_example_query(datasets=datasets)

python_tool_description = inspect.cleandoc("""                                           
You are a data science and GoogleSQL expert. Answer data and analytics questions or perform exploratory data analysis (EDA) without sharing the data source.

When unable to complete an analysis or find an answer, respond with "Analysis failed: <reason>".
After completing an analysis, respond with "Analysis complete: <final answer or insight>".

# Tools
Utilize ONLY these tools for analysis, following their expected formatting instructions.

A Python shell. Use this to execute python commands including: BigQuery queries, Pandas analytics, Plotly charts.
Input should be a valid python command.
When using this tool, sometimes output is abbreviated -
make sure it does not look abbreviated before using it in your answer.
                                           
# Datasets
Access these datasets, tables, and columns:
```
tables_summary = {tables_summary}
```

Validate column names using: tables_summary[dataset_id][table_id].

# Example SQL Query

```
{example_query}
```

# Python Libraries
The following Python libraries are available in the environment: [streamlit, plotly, pandas, numpy, sklearn, scipy, statsmodels]

The following Python modules have been imported already:
```
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
```

Do not try import or use other libraries.

# Instructions
- A BigQuery Client in Python, `bigquery_client`, has been initialized and authenticated.
- Use the Plotly library for creating charts and plots.
- Do NOT make DML statements (INSERT, UPDATE, DELETE, DROP, etc.).
- Check column names using: print(tables_summary[dataset_id][table_id])
- Always prefer performing complex queries using Pandas rather than SQL.
- Unless displaying Plotly charts and Pandas DataFrames, use `print()` to display output, for example on the last line of code.

# Data Analysis Guidelines
- If asked a geographical question, try use a Plotly map.
- Always check what unique values are in a column before querying it e.g. `SELECT DISTINCT column_name FROM table_name`.
- When performing EDA, always try check correlation and create statistical plots.
""")
                                           
python_tool = PythonAstREPLTool(
    description=python_tool_description,
    locals={"tables_summary": tables_summary, "bigquery_client": bigquery_client, "example_query": example_query},
)

def query_post_processing(query: str) -> str:
    query = query.replace("print(", "display(")
    imports = inspect.cleandoc("""
    # Add custom imports and config here for agent
    import streamlit as st
    import plotly.express as px
    import plotly.graph_objects as go
    import pandas as pd

    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', 5)

    def display(*args):
        import streamlit as st
        st.write(*args)
        return args
    """)
    query = imports + "\n" + query
    query = re.sub(".*client =.*\n?", "client = bigquery_client", query)
    query = re.sub(".*bigquery_client =.*\n?", "", query)
    return query

from langchain.tools.human.tool import HumanInputRun

python_tool.query_post_processing = query_post_processing
tools = [python_tool]

In [None]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo-0613",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()]
)

agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.OPENAI_MULTI_FUNCTIONS,
    verbose=True,
    max_iterations=5,
    early_stopping_method="generate",
    streaming=True,
    # callbacks=[StreamingStdOutCallbackHandler()],
)

In [None]:
agent.run("What is the APR on ***REMOVED***?")



[1m> Entering new  chain...[0m
I'm sorry, but as an AI assistant, I don't have real-time information. The APR (Annual Percentage Rate) on ***REMOVED*** may vary depending on various factors such as the specific loan terms, collateral, and market conditions. It's best to visit the ***REMOVED*** platform or contact their support team for the most up-to-date information on APR rates.[32;1m[1;3mI'm sorry, but as an AI assistant, I don't have real-time information. The APR (Annual Percentage Rate) on ***REMOVED*** may vary depending on various factors such as the specific loan terms, collateral, and market conditions. It's best to visit the ***REMOVED*** platform or contact their support team for the most up-to-date information on APR rates.[0m

[1m> Finished chain.[0m


"I'm sorry, but as an AI assistant, I don't have real-time information. The APR (Annual Percentage Rate) on ***REMOVED*** may vary depending on various factors such as the specific loan terms, collateral, and market conditions. It's best to visit the ***REMOVED*** platform or contact their support team for the most up-to-date information on APR rates."

In [None]:
code = """
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)

def display(*args):
    import streamlit as st
    st.write(*args)
    return args

import numpy as np

# Create a range of x values
x = np.linspace(0, 2*np.pi, 100)

# Create a figure
fig = go.Figure(
    data=[go.Scatter(x=x, y=np.sin(x), mode='lines')],
    layout=go.Layout(
        title="Sine Wave Animation",
        updatemenus=[dict(
            type="buttons",
            buttons=[dict(label="Play",
                          method="animate",
                          args=[None])])]),
    frames=[go.Frame(
        data=[go.Scatter(
            x=x,
            y=np.sin(x + np.pi / 15 * (i % 50)),
            mode='lines')]
    ) for i in range(100)]
)

fig.show()
"""
import ast

tree = ast.parse(code)
module = ast.Module(tree.body[:-1], type_ignores=[])

_globals = _locals = {}

exec(ast.unparse(module), _globals, _locals)


NameError: name 'go' is not defined

In [None]:
go

<module 'plotly.graph_objects' from '/opt/homebrew/lib/python3.11/site-packages/plotly/graph_objects/__init__.py'>