In [None]:
# import dependencies
import logging
import os
import sys

import clickhouse_connect
import openai
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.indices.vector_store import VectorIndexAutoRetriever
from llama_index.core.indices.vector_store.retrievers.auto_retriever.prompts import PREFIX, EXAMPLES
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.vector_stores.types import VectorStoreInfo, MetadataInfo
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.clickhouse import ClickHouseVectorStore

# Send INFO-level (and above) log records to stdout so they show up in the cell output.
# Exactly one root handler is installed: `force=True` first removes any handlers that are
# already attached to the root logger (e.g. from an earlier run of this cell), so log lines
# are not printed twice and re-running the cell does not multiply them.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [None]:
# Connection settings -- set these according to your ClickHouse instance.
username = "default"
password = ""
host = "localhost"
secure = False
database = "default"
http_port = 8123
# OpenAI credentials: the OPENAI_API_KEY environment variable is used when it is set (and
# non-empty), so the key never has to be saved inside the notebook. Otherwise replace the
# placeholder below. See https://platform.openai.com/account/api-keys for an API key.
openai.api_key = os.environ.get("OPENAI_API_KEY") or "<INSERT KEY>"

In [None]:
# Custom prompt suffix for the auto-retriever. It is written as one literal per output line
# so that every newline (and the trailing space after "work with.") is explicit.
# `{info_str}` and `{query_str}` are format placeholders that stay in the template.
CLICKHOUSE_CUSTOM_SUFFIX = (
    "\n"
    "The following is the datasource schema to work with. \n"
    "IMPORTANT: Make sure that filters are only used as needed and only suggest filters for fields in the data source.\n"
    "\n"
    "Data Source:\n"
    "```json\n"
    "{info_str}\n"
    "```\n"
    "\n"
    "User Query:\n"
    "{query_str}\n"
    "\n"
    "Structured Request:\n"
)

# Full template: LlamaIndex's stock PREFIX and EXAMPLES followed by the custom suffix above.
CLICKHOUSE_VECTOR_STORE_QUERY_PROMPT_TMPL = "".join(
    (PREFIX, EXAMPLES, CLICKHOUSE_CUSTOM_SUFFIX)
)

# Embedding model used globally by LlamaIndex (via Settings); model files are cached locally.
# NOTE(review): 384 matches this model's embedding dimension -- confirm it is really the
# intended `max_length` (token limit) and matches how the stored vectors were produced.
embedding_model = FastEmbedEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    max_length=384,
    cache_dir="./embeddings/",
)
Settings.embed_model = embedding_model

In [None]:
# ClickHouse client built from the settings in the configuration cell. `database` is passed
# explicitly so that changing it in the configuration cell actually takes effect.
client = clickhouse_connect.get_client(
    host=host,
    port=http_port,
    username=username,
    password=password,
    secure=secure,
    database=database,
)

# Vector store over the "hackernews" table in the configured database, wrapped in an index.
# Nothing is inserted here -- presumably the table is already populated with embedded rows.
vector_store = ClickHouseVectorStore(
    clickhouse_client=client,
    table="hackernews",
    database=database,
)
vector_index = VectorStoreIndex.from_vector_store(vector_store)

# Description of the stored content and of the metadata fields handed to the auto-retriever,
# which it can use when suggesting filters.
vector_store_info = VectorStoreInfo(
    content_info="Social news posts and comments from users",
    metadata_info=[
        MetadataInfo(
            name="post_score",
            type="int",
            description="Score of the comment or post",
        ),
        MetadataInfo(
            name="by",
            type="str",
            description="the author or person who posted the comment",
        ),
        MetadataInfo(
            name="time",
            type="date",
            description="the time at which the post or comment was made",
        ),
    ],
)

# One LLM instance shared by the retriever (query -> structured request) and the query
# engine (answer synthesis), instead of constructing two identical ones.
gpt4_llm = OpenAI(model="gpt-4")

# A retriever for vector store index that uses an LLM to automatically set vector store
# query parameters, driven by the custom ClickHouse prompt template.
vector_auto_retriever = VectorIndexAutoRetriever(
    vector_index,
    vector_store_info=vector_store_info,
    similarity_top_k=10,
    prompt_template_str=CLICKHOUSE_VECTOR_STORE_QUERY_PROMPT_TMPL,
    llm=gpt4_llm,
)

# Query engine that answers questions from the nodes returned by the auto-retriever.
retriever_query_engine = RetrieverQueryEngine.from_args(vector_auto_retriever, llm=gpt4_llm)

In [None]:
# Ask a natural-language question through the query engine and print the answer text.
question = "What is the user zX41ZdbW saying about ClickHouse?"
response = retriever_query_engine.query(question)

print(f"Answer: {response!s}")