In [None]:
# import dependencies

import logging
import os
import sys
from urllib.parse import quote

import clickhouse_connect
import openai
from llama_index.core import SQLDatabase, PromptTemplate
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.indices.struct_store import NLSQLTableQueryEngine
from llama_index.core.indices.vector_store import VectorIndexAutoRetriever
from llama_index.core.indices.vector_store.retrievers.auto_retriever.prompts import PREFIX, EXAMPLES
from llama_index.core.prompts import PromptType
from llama_index.core.query_engine import RetrieverQueryEngine, SQLAutoVectorQueryEngine
from llama_index.core.tools import QueryEngineTool
from llama_index.core.vector_stores.types import VectorStoreInfo, MetadataInfo
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.clickhouse import ClickHouseVectorStore
from sqlalchemy import create_engine

# Send INFO-level (and above) log records to stdout so they show up in the
# notebook output. basicConfig() already attaches a stdout StreamHandler to the
# root logger; attaching a second one would print every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [None]:
# set these according to your ClickHouse instance
username = "default"
# Secrets are taken from the environment when available so they do not have to
# be typed into (and saved with) the notebook; the fallbacks are the original
# placeholder values.
password = os.environ.get("CLICKHOUSE_PASSWORD", "")
host = "localhost"
secure = False
database = "default"
native_port = 9000
http_port = 8123
# see https://platform.openai.com/account/api-keys for API key
openai.api_key = os.environ.get("OPENAI_API_KEY", "<INSERT KEY>")

In [None]:
# define prompts
# Text-to-SQL prompt template. It has two placeholders, {schema} and
# {query_str}, and ends with "SQLQuery: " so the model continues with the query.
# The worked arrayJoin example references the `surveys` table, i.e. the same
# table name that is exposed to the SQL engine, so the model is not shown a
# table that does not appear in the schema.
CLICKHOUSE_TEXT_TO_SQL_TMPL = (
    "Given an input question, first create a syntactically correct ClickHouse SQL "
    "query to run, then look at the results of the query and return the answer. "
    "You can order the results by a relevant column to return the most "
    "interesting examples in the database.\n\n"
    "Never query for all the columns from a specific table, only ask for a "
    "few relevant columns given the question.\n\n"
    "Pay attention to use only the column names that you can see in the schema "
    "description. "
    "Be careful to not query for columns that do not exist. "
    "Pay attention to which column is in which table. "
    "Also, qualify column names with the table name when needed. \n"
    "If needing to group on Array Columns use the ClickHouse function arrayJoin e.g. arrayJoin(columnName) \n"
    "For example, the following query identifies the most popular database:\n"
    "SELECT d, count(*) AS count FROM surveys GROUP BY "
    "arrayJoin(database_want_to_work_with) AS d ORDER BY count DESC LIMIT 1\n "
    "Ensure if aggregating with `arrayJoin` you use an alias e.g. arrayJoin(database_want_to_work_with) AS d\n"
    "You are required to use the following format, each taking one line:\n\n"
    "Question: Question here\n"
    "SQLQuery: SQL Query to run\n"
    "SQLResult: Result of the SQLQuery\n"
    "Answer: Final answer here\n\n"
    "Only use tables listed below.\n"
    "{schema}\n\n"
    "Question: {query_str}\n"
    "SQLQuery: "
)

# Wrap the raw template string in a PromptTemplate tagged as a text-to-SQL prompt.
CLICKHOUSE_TEXT_TO_SQL_PROMPT = PromptTemplate(
    template=CLICKHOUSE_TEXT_TO_SQL_TMPL, prompt_type=PromptType.TEXT_TO_SQL
)

In [None]:
# create NL to SQL engine
# The username and password are percent-encoded before being placed in the
# connection URL: characters such as "@", "/", ":" or "%" are reserved in URLs
# and would otherwise change how the URL is parsed.
engine = create_engine(
    f'clickhouse+native://{quote(username, safe="")}:{quote(password, safe="")}@{host}:' +
    f'{native_port}/{database}?compression=lz4&secure={secure}'
)
# Only the "surveys" table is exposed to the language model.
sql_database = SQLDatabase(engine, include_tables=["surveys"], view_support=True)

# Natural-language -> SQL query engine over the surveys table, driven by the
# custom ClickHouse text-to-SQL prompt.
nl_sql_engine = NLSQLTableQueryEngine(
    sql_database=sql_database,
    tables=["surveys"],
    text_to_sql_prompt=CLICKHOUSE_TEXT_TO_SQL_PROMPT,
    llm=OpenAI(model="gpt-4"),
    verbose=True
)

In [None]:
# Define vector store with custom prompt
# Prompt suffix with two placeholders, {info_str} and {query_str}. Written as
# explicit per-line literals so the leading/trailing newlines and the trailing
# space on the first sentence are visible.
CLICKHOUSE_CUSTOM_SUFFIX = (
    "\n"
    "The following is the datasource schema to work with. \n"
    "IMPORTANT: Make sure that filters are only used as needed and only suggest filters for fields in the data source.\n"
    "\n"
    "Data Source:\n"
    "```json\n"
    "{info_str}\n"
    "```\n"
    "\n"
    "User Query:\n"
    "{query_str}\n"
    "\n"
    "Structured Request:\n"
)

# Full auto-retriever prompt: the library's PREFIX and EXAMPLES followed by the
# custom suffix defined above.
CLICKHOUSE_VECTOR_STORE_QUERY_PROMPT_TMPL = "".join(
    (PREFIX, EXAMPLES, CLICKHOUSE_CUSTOM_SUFFIX)
)
# Register the embedding model globally on Settings.
Settings.embed_model = FastEmbedEmbedding(
    cache_dir="./embeddings/",
    max_length=384,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
# HTTP client for the vector store. `secure` is forwarded so this connection
# follows the same TLS setting as the SQL engine instead of always using the
# client's default.
client = clickhouse_connect.get_client(
    host=host,
    port=http_port,
    username=username,
    password=password,
    secure=secure,
)
# Vector store over the "hackernews" table, and an index built on top of it.
vector_store = ClickHouseVectorStore(clickhouse_client=client, table="hackernews")
vector_index = VectorStoreIndex.from_vector_store(vector_store)
# Description of the vector store content and its filterable metadata fields,
# given as (name, type, description) rows.
vector_store_info = VectorStoreInfo(
    content_info="Social news posts and comments from users",
    metadata_info=[
        MetadataInfo(name=field_name, type=field_type, description=field_description)
        for field_name, field_type, field_description in (
            ("post_score", "int", "Score of the comment or post"),
            ("by", "str", "the author or person who posted the comment"),
            ("time", "date", "the time at which the post or comment was made"),
        )
    ],
)

# Auto-retriever: an LLM turns the user query into vector store query
# parameters, guided by the custom prompt and the store description.
vector_auto_retriever = VectorIndexAutoRetriever(
    vector_index,
    vector_store_info=vector_store_info,
    similarity_top_k=10,
    prompt_template_str=CLICKHOUSE_VECTOR_STORE_QUERY_PROMPT_TMPL,
    llm=OpenAI(model="gpt-4"),
    # extra condition handed to the vector store with every query
    vector_store_kwargs={"where": "length >= 20"},
)

In [None]:
# create SQLAutoVectorQueryEngine which combines our vector search and NL->SQL engines
retriever_query_engine = RetrieverQueryEngine.from_args(
    vector_auto_retriever,
    llm=OpenAI(model="gpt-4"),
)

# The tool descriptions tell SQLAutoVectorQueryEngine when to use each engine.
sql_tool = QueryEngineTool.from_defaults(
    query_engine=nl_sql_engine,
    description=(
        "Useful for translating a natural language query into a SQL query over"
        " a table: surveys, containing the survey responses on"
        " different types of technology users currently use and want to use"
    ),
)
vector_tool = QueryEngineTool.from_defaults(
    query_engine=retriever_query_engine,
    description="Useful for answering semantic questions abouts users comments and posts",
)

sql_auto_vector_engine = SQLAutoVectorQueryEngine(
    sql_tool,
    vector_tool,
    llm=OpenAI(model="gpt-4"),
)

In [None]:
# Run a sample question through the combined engine and print the answer text.
response = sql_auto_vector_engine.query(
    "What are people's opinions on the web technology that people "
    "at companies with less than 100 employees want to work with?"
)

print(response)