In [1]:
from IPython.display import display, HTML, Markdown

In [2]:
import re
import faiss
import pickle
import tiktoken
from pydantic import BaseModel
from typing import Any, Dict, List
from langchain.chains import LLMChain
from langchain.vectorstores import FAISS
from langchain.schema import BaseRetriever
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

import logging

logger = logging.getLogger(__name__)
# Without an explicit level the logger inherits the root logger's level, and
# every logger.info(...) call in this notebook is discarded before it reaches
# the handler below.
logger.setLevel(logging.INFO)

# Formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Re-running this cell would otherwise attach one more handler each time and
# print every record several times; drop the handler a previous run installed.
for _previous_handler in [h for h in logger.handlers if h.name == "notebook_stream"]:
    logger.removeHandler(_previous_handler)

# stream handler
stream_handler = logging.StreamHandler()
stream_handler.set_name("notebook_stream")
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)


### Test data docs in retriever 

In [6]:
# Load the pickled data-feed documents used by the baseline retriever test below.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles produced by a trusted pipeline.
# NOTE(review): absolute, machine-specific path; this cell will not run on
# another machine without editing it.
with open("/home/marshath/play/chainlink/chainlink-assistant/data/datadocs_2023-08-16.pkl", "rb") as f:
    data_docs = pickle.load(f)
    

In [16]:
# Cut the data-feed documents into 1200-character pieces (100 characters of
# overlap) and embed them into an in-memory FAISS store.
# NOTE: `splitter` and `split_docs` are re-assigned with different settings by a
# later cell, so their values depend on which cell ran last.
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=100)
split_docs = splitter.split_documents(data_docs)
# The misspelt name `vectorstrore` is what the next cell reads; keep both in sync.
vectorstrore = FAISS.from_documents(split_docs, OpenAIEmbeddings())

In [25]:
# Baseline retriever: return the 4 nearest chunks from the data-docs store.
ret = vectorstrore.as_retriever(search_kwargs={"k":4})

# Baseline question-answering chain with temperature 0, using the "stuff" chain
# type and the retriever defined above.
# NOTE: `llm` is re-assigned by a later cell.
llm = ChatOpenAI(temperature=0.)
chain = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, chain_type="stuff", retriever=ret)

In [73]:
# Smoke-test question for the baseline chain.
ques = "what is the contract address for ATOM / USD on moonbeam network?"
# Keep the raw retrieval result so the retrieved chunks can be inspected separately.
r_docs = ret.get_relevant_documents(ques)
# Calling the chain runs its own retrieval as well; `r_docs` is not passed to it.
answer = chain(ques)
Markdown(answer["answer"])

The contract address for ATOM / USD on the Moonbeam network is "0x4f152d143c97b5e8d2293bc5b2380600f274a5dd".


In [69]:
# Show the text of the first document returned by the retriever for `ques`.
Markdown(r_docs[0].page_content)

The following is the details for the pair AAVE / USD which operates on the Metis Mainnet. This asset is named "Aave". and falls under the "Crypto" asset class. It has a tier status of "Verified". The deviation threshold for this asset is set at 0.5%. 15 / 15 oracles carries and support this asset. You can find its contract at the address "0x54389e89a5ec1d4312d5b5c48055d6e56a177bf9

### Three point router

In [3]:
class CustomeSplitter:
    """Split only the documents that are larger than a token threshold.

    Documents at or below ``chunk_threshold`` tokens (counted with the
    ``cl100k_base`` tiktoken encoding) pass through untouched. Larger ones are
    cut by a ``TokenTextSplitter``, and each piece is emitted as a new
    ``Document`` whose metadata holds a single ``source`` key of the form
    ``"<original source> chunk <n>"``.

    Args:
        chunk_threshold: Token count above which a document is split.
        chunk_size: ``chunk_size`` handed to ``TokenTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to ``TokenTextSplitter``.
    """

    def __init__(self, chunk_threshold=6000, chunk_size=6000, chunk_overlap=50):
        self.chunk_threshold = chunk_threshold
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.enc = tiktoken.get_encoding("cl100k_base")
        # NOTE(review): the splitter is built without an explicit encoding, so it
        # may count tokens differently from `self.enc` — confirm against the
        # TokenTextSplitter defaults.
        self.splitter = TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def token_counter(self, document):
        """Return the number of ``cl100k_base`` tokens in ``document.page_content``."""
        return len(self.enc.encode(document.page_content))

    def split(self, documents):
        """Return ``documents`` with every oversized document replaced by its chunks.

        A document that fails to split is kept whole (best effort) and the
        failure is printed; the method does not raise for a single bad document.

        Args:
            documents: Iterable of objects exposing ``page_content`` and ``metadata``.

        Returns:
            A new list of documents in the original order.
        """
        chunked_documents = []
        for doc_index, doc in enumerate(documents):
            try:
                if self.token_counter(doc) <= self.chunk_threshold:
                    chunked_documents.append(doc)
                    continue

                pieces = self.splitter.split_documents([doc])
                # Build the whole list before extending, so a failure half-way
                # through never leaves partial chunks next to the fallback doc.
                renamed_pieces = [
                    Document(
                        page_content=piece.page_content,
                        metadata={
                            "source": f"{piece.metadata['source']} chunk {chunk_index}"
                        },
                    )
                    for chunk_index, piece in enumerate(pieces)
                ]
                chunked_documents.extend(renamed_pieces)
            except Exception as e:
                chunked_documents.append(doc)
                print(f"Error on document {doc_index}")
                print(e)
                # The failure may itself be a missing "source" key, so the
                # handler must not index the metadata unconditionally.
                metadata = getattr(doc, "metadata", None) or {}
                print(metadata.get("source", "<unknown source>"))

        return chunked_documents


class CustomRetriever(BaseRetriever, BaseModel):
    """Retriever with three retrieval strategies selected by a ``workflow`` id.

    * ``workflow == 2``: query ``base_retriever_data`` and return its first
      ``k_final`` hits unchanged.
    * ``workflow == 1``: query ``base_retriever_all``, keep the first ``k_final``
      distinct sources, and return every entry of ``full_docs`` that belongs to
      one of those sources (all chunks of a chunked document included).
    * any other value: query ``base_retriever_all`` and return its first
      ``k_final`` hits.

    Attributes:
        full_docs: Documents returned by workflow 1. Their ``source`` may carry
            a ``" chunk <n>"`` suffix.
        base_retriever_all: Retriever over the full documentation store.
        base_retriever_data: Retriever over the data-feed store.
        k_initial: Number of hits requested from the underlying retrievers.
        k_final: Number of hits / distinct sources kept.
        logger: Optional logger; the module logger is used when it is ``None``.
    """

    full_docs: List[Document]
    base_retriever_all: BaseRetriever = None
    base_retriever_data: BaseRetriever = None
    k_initial: int = 10
    k_final: int = 4

    logger: Any = None

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    @classmethod
    def from_documents(
        cls,
        full_docs: List[Document],
        vectorstore_all: FAISS,
        vectorstore_data: FAISS,
        search_kwargs: Dict[str, Any] = None,
        k_initial: int = 10,
        k_final: int = 4,
        logger: Any = None,
        **kwargs: Any,
    ):
        """Build a retriever from two vector stores.

        Args:
            full_docs: Documents to return for workflow 1.
            vectorstore_all: Store searched for workflows other than 2.
            vectorstore_data: Store searched for workflow 2.
            search_kwargs: Extra search options forwarded to both stores. The
                ``"k"`` entry is always taken from ``k_initial``.
            k_initial: Number of hits requested from each store.
            k_final: Number of hits / distinct sources kept per query.
            logger: Optional logger used by ``get_relevant_documents``.
            **kwargs: Accepted for call-site compatibility and ignored.

        Returns:
            A configured ``CustomRetriever``.
        """
        # k_initial wins over any "k" in search_kwargs so existing callers keep
        # getting the hit count they asked for.
        store_kwargs = {**(search_kwargs or {}), "k": k_initial}

        return cls(
            full_docs=full_docs,
            base_retriever_all=vectorstore_all.as_retriever(search_kwargs=dict(store_kwargs)),
            base_retriever_data=vectorstore_data.as_retriever(search_kwargs=dict(store_kwargs)),
            # Forward both limits; otherwise the instance silently falls back
            # to the class defaults whatever the caller passed.
            k_initial=k_initial,
            k_final=k_final,
            logger=logger,
        )

    @staticmethod
    def _base_source(source: str) -> str:
        """Strip a trailing ``" chunk <n>"`` suffix from a source string."""
        # Only a trailing suffix is removed; a source that merely contains the
        # word "chunk" (for example inside a URL) is left intact.
        return re.sub(r"\s+chunk\s+\d+\s*$", "", source)

    def get_relevant_documents(self, query: str, workflow:int=1) -> List[Document]:
        """Return the documents for ``query`` using the strategy named by ``workflow``.

        Args:
            query: The user question.
            workflow: Strategy id, see the class docstring.

        Returns:
            A list of documents whose ``source`` has no ``" chunk <n>"`` suffix
            (workflow 2 results are returned as retrieved).
        """
        # `logger` defaults to None on the model; fall back so logging never crashes.
        log = self.logger if self.logger is not None else logging.getLogger(__name__)
        log.info(f"Worflow: {workflow}")

        if workflow == 2:
            results = self.base_retriever_data.get_relevant_documents(query=query)
            log.info(f"Retrieved {len(results)} documents")
            return results[:self.k_final]

        results = self.base_retriever_all.get_relevant_documents(query=query)
        log.info(f"Retrieved {len(results)} documents")

        if workflow != 1:
            return self.prepare_source(results[:self.k_final])

        # Distinct sources in retrieval order, limited to k_final.
        hit_sources = (self._base_source(doc.metadata["source"]) for doc in results)
        ranked_sources = list(dict.fromkeys(hit_sources))[:self.k_final]
        log.info(f"Retrieved {len(ranked_sources)} unique documents")

        # Compare on the suffix-free source: entries of full_docs that were
        # chunked carry "<source> chunk <n>" and would never equal a raw source.
        rank = {source: position for position, source in enumerate(ranked_sources)}
        matched = [
            doc for doc in self.full_docs
            if self._base_source(doc.metadata["source"]) in rank
        ]
        # Stable sort: best-ranked source first, chunks keep their own order.
        matched.sort(key=lambda doc: rank[self._base_source(doc.metadata["source"])])

        return self.prepare_source(matched)

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Async retrieval is not supported by this retriever."""
        raise NotImplementedError

    def prepare_source(self, documents: List[Document]) -> List[Document]:
        """Return ``documents`` with any ``" chunk <n>"`` suffix removed from ``source``.

        Documents that need a change are copied rather than edited in place, so
        the entries held in ``full_docs`` and in the vector stores keep their
        original metadata between queries.
        """
        prepared = []
        for doc in documents:
            source = doc.metadata["source"]
            clean_source = self._base_source(source)
            if clean_source == source:
                prepared.append(doc)
                continue
            prepared.append(
                Document(
                    page_content=doc.page_content,
                    metadata={**doc.metadata, "source": clean_source},
                )
            )

        return prepared

In [4]:
# Load the full documentation set and the data-feed documents.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): absolute, machine-specific paths.
with open("/home/marshath/play/chainlink/chainlink-assistant/data/documents.pkl", "rb") as f:
    documents = pickle.load(f)

with open("/home/marshath/play/chainlink/chainlink-assistant/data/datadocs_2023-08-18.pkl", "rb") as f:
    datadocs = pickle.load(f)

# Split documents into chunks for 16k model
# (only documents above the splitter's token threshold are actually split).
full_doc_splitter = CustomeSplitter()
chunked_full_documents = full_doc_splitter.split(documents)

# Small character-based chunks used for similarity search.
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=50)
split_docs = splitter.split_documents(documents)

# Create vectorstore for all documents
# (embeds every chunk through the OpenAI API; see the rate-limit retries in the output).
vectorstore_all = FAISS.from_documents(split_docs, embedding=OpenAIEmbeddings())

# Split documents into chunks using datadocs
split_docs_data = splitter.split_documents(datadocs)

# Create vectorstore for datadocs
vectorstore_data = FAISS.from_documents(split_docs_data, embedding=OpenAIEmbeddings())

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-apfNELnY4pAbHrx6LItJCss8 on tokens per min. Limit: 1000000 / min. Current: 849774 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-apfNELnY4pAbHrx6LItJCss8 on tokens per min. Limit: 1000000 / min. Current: 874278 / min. Contact us through our help center at help.openai.com if you continue to have issues..


In [5]:
def _save_faiss_store(store, index_path, pickle_path):
    """Persist a FAISS vector store as a raw index file plus a pickle.

    The index is written with ``faiss.write_index`` and detached from ``store``
    while the rest of the object is pickled (presumably because the index
    object itself cannot be pickled — confirm). It is re-attached afterwards
    so ``store`` stays usable and this cell can be re-run.

    Args:
        store: Vector store exposing an ``index`` attribute.
        index_path: Destination of the raw FAISS index.
        pickle_path: Destination of the pickled store (without its index).
    """
    index = store.index
    faiss.write_index(index, index_path)
    store.index = None
    try:
        with open(pickle_path, "wb") as f:
            pickle.dump(store, f)
    finally:
        # Always restore the index, even when pickling fails.
        store.index = index


# Save vectorstore_all
_save_faiss_store(vectorstore_all, "docs_all.index", "faiss_store_all.pkl")

# Save vectorstore_data
_save_faiss_store(vectorstore_data, "docs_data.index", "faiss_store_data.pkl")

In [11]:
# open vectorstore all
# Read the raw FAISS indexes back from disk.
# NOTE(review): these are absolute paths, while the saving cell writes relative
# file names; they only refer to the same files when the notebook runs from
# that `nbs` directory — confirm.
index_all = faiss.read_index('/home/marshath/play/chainlink/chainlink-assistant/nbs/docs_all.index')
index_data = faiss.read_index('/home/marshath/play/chainlink/chainlink-assistant/nbs/docs_data.index')

# Load the pickled stores (saved without their index).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/home/marshath/play/chainlink/chainlink-assistant/nbs/faiss_store_all.pkl', 'rb') as f:
    vectorstore_all = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/nbs/faiss_store_data.pkl', 'rb') as f:
    vectorstore_data = pickle.load(f)

# Re-attach each index to its store.
vectorstore_all.index = index_all
vectorstore_data.index = index_data

In [12]:


# Build the routing retriever used by process_documents (read there as a global).
# `chunked_full_documents` comes from the cell that builds the vector stores, so
# that cell must have run in this kernel even when the stores are reloaded from disk.
retriever = CustomRetriever.from_documents(
    chunked_full_documents, 
    vectorstore_all=vectorstore_all,
    vectorstore_data=vectorstore_data,
    k_initial=10, 
    k_final=4, 
    logger=logger
)

In [13]:
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# System instructions for answering directly from retrieved documents; the
# documents are expected to be delimited by 'Source:' tags.
final_answer_system_template = """
As an AI assistant helping answer a user's question about Chainlink, your task is to provide the answer to the user's question based on the collection of documents provided. Each document is demarcated by the 'Source:' tag. 

In most cases, the answer to the user's question can be found in one of the documents.

If the documents do not contain the required information to answer user's question, respond with 'I don't know'. In this case, you can provide a link to the Chainlink documentation.

Each point in your answer should be formatted with corresponding reference(s) using markdown. Conclude your response with a footnote that enumerates all the references involved. Please make sure to use only the references provided in the documents and not to use any external references. 

The footnote should be formatted as follows: 
```
References:
[^1^]: <reference 1> 
[^2^]: <reference 2> 
[^3^]: <reference 3>
```
Please avoid duplicating references. For example, if the same reference is used twice in the answer, please only include it once in the footnote.
"""

# Human turn: template variables are `question` and `document`.
final_answer_human_template = """
User's question: {question}

Document: {document}

Answer:
"""

# Chat prompt installed on the chain by call_llm_final_answer.
FINAL_ANSWER_PROMPT = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(final_answer_system_template),
        HumanMessagePromptTemplate.from_template(final_answer_human_template),
    ]
)

# System instructions for the merge step: the "document" here is the
# concatenation of answers produced by earlier LLM calls, not raw documents.
final_answer_2_system_template = """
As an AI assistant helping answer a user's question about Chainlink, your task is to provide the answer to the user's question based on the potential answers derived from previous LLM call(s). 
If the document doesn't contain the required information, respond with 'I don't know'.
Each point in your answer should be formatted with corresponding reference(s) using markdown. Conclude your response with a footnote that enumerates all the references involved. 

The footnote should be formatted as follows: 
```
References:
[^1^]: <reference 1> 
[^2^]: <reference 2> 
[^3^]: <reference 3>
```
Please avoid duplicating references. For example, if the same reference is used twice in the answer, please only include it once in the footnote.
"""

# Human turn: template variables are `question` and `document`.
final_answer_2_human_template = """
User's question: {question}

Document: {document}

Answer:
"""

# Chat prompt installed on the chain by call_llm_final_2_answer.
FINAL_ANSWER_2_PROMPT = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(final_answer_2_system_template),
        HumanMessagePromptTemplate.from_template(final_answer_2_human_template),
    ]
)

# Router instructions. The workflow ids used in this prompt must match the ids
# the rest of the notebook acts on: 0 = short-form, 1 = long-form (16k model,
# full documents), 2 = specialized price-feed retrieval. The list, the sample
# headings and the output rule all use the same 0-2 numbering, so a label such
# as "Workflow 1" can never be read as a different id than the output "1".
router_system_prompt = """
As an AI assistant helping answer a user's question about Chainlink, your first task is to route the question to the proper workflow. 
There are three workflows, numbered 0 to 2:
    0. short-form which is suitable for simple questions. It is bad at answering questions requiring code output.
    1. long-form which is suitable for complex questions. It is good at answering questions requiring code output.
    2. specialized which is a workflow for answering questions about Chainlink's price feeds. It is good at answering questions about Chainlink's price feeds.

Sample questions for each workflow:

Workflow 0: short-form
- What is Chainlink?
- What is a Chainlink node?
- What is a Chainlink oracle?

Workflow 1: long-form
- give me a sample solidity contract to use Chainlink price feeds?
- give me a sample solidity contract to use Chainlink VRF?
- give me a sample solidity contract to use NFT Floor Price Feeds?
- Give code examples to demonstrate how to deploy a consumer contract on-chain that reads a data feed and stores the value?

Workflow 2: specialized
- check if a feed is verified, ex: Is eth/usd a verified feed?
- is eth/usd feed backed by staking?
- under what asset class does eth/usd fall?
- what is the tier of the eth/usd feed on binance?
- what is the deviation threshold of eth/usd on binance?
- how many oracles carry eth/usd on binance?

Your output should be a single number between 0 and 2 and nothing else. 0 means the question should be routed to the short-form workflow. 1 means the question should be routed to the long-form workflow. 2 means the question should be routed to the specialized workflow.
"""

# Human turn: the only template variable is `question`.
router_human_prompt = """
Question: {question}
"""

# Chat prompt installed on the chain by process_documents before routing.
ROUTER_PROMPT = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(router_system_prompt),
        HumanMessagePromptTemplate.from_template(router_human_prompt),
    ]
)

In [14]:
def get_streaming_chain(chain, workflow):
    """Install a streaming chat model on ``chain`` and return that same chain.

    The chain is modified in place (its ``llm`` attribute is replaced); no copy
    is made, so every other reference to the chain sees the streaming model too.

    Args:
        chain: Chain object whose ``llm`` attribute is replaced.
        workflow: Workflow id. ``1`` selects ``gpt-3.5-turbo-16k``; any other
            value selects ``gpt-3.5-turbo``.

    Returns:
        The ``chain`` that was passed in.
    """
    is_long_form = workflow == 1
    model_name = "gpt-3.5-turbo-16k" if is_long_form else "gpt-3.5-turbo"
    log_line = "Using long-form workflow" if is_long_form else "Using short-form workflow"

    # Tokens are echoed to stdout as they arrive.
    chain.llm = ChatOpenAI(
        temperature=0.0,
        model=model_name,
        streaming=True,
        callbacks=[StreamingStdOutCallbackHandler()],
    )
    logger.info(log_line)

    return chain

# Shared chain used for routing and answering. Its `prompt` and `llm` attributes
# are re-assigned by the helper functions in this notebook, so its state depends
# on the last call made.
# NOTE: this re-binds the name `llm` used by an earlier cell.
llm = ChatOpenAI(temperature=0.)
base_chain = LLMChain(llm=llm, prompt=FINAL_ANSWER_PROMPT)

In [15]:

# Tokenizer used for batch sizing; `encoding` is read as a module-level global
# by concatenate_documents.
model = "gpt-3.5-turbo"
try:
    encoding = tiktoken.encoding_for_model(model)
except KeyError:
    # Fall back to the cl100k_base encoding when tiktoken does not know the model name.
    logger.info(f"Encoding for model {model} not found. Using default encoding.")
    encoding = tiktoken.get_encoding("cl100k_base")


def calculate_tokens(document, encoding):
    """Return the number of tokens ``encoding`` produces for the text ``document``.

    Args:
        document: The text to tokenize (a single string, not a list of documents).
        encoding: Object exposing ``encode(text)`` that returns a sized sequence.
    """
    token_ids = encoding.encode(document)
    return len(token_ids)


def concatenate_documents(documents, max_tokens):
    """Pack as many documents as fit into one prompt-sized text block.

    Documents are visited in order. Each one that still fits within
    ``max_tokens`` (counted on ``page_content`` only, using the module-level
    ``encoding``) is appended as ``"\\n\\n<content>\\nSource: <source>"``.
    Documents that do not fit are skipped and left for a later call.

    A document that is larger than ``max_tokens`` on its own could never fit in
    any batch. When it is the first candidate for an empty batch, its content is
    truncated to ``max_tokens`` tokens and used, so callers that loop until
    every document is consumed always make progress.

    Args:
        documents: Sequence of objects exposing ``page_content`` and ``metadata["source"]``.
        max_tokens: Token budget for the combined content.

    Returns:
        ``(combined_text, used_docs)``: the text block, and the original
        document objects that were consumed to build it.
    """
    budget = max(max_tokens, 0)
    parts = []
    token_count = 0
    used_docs = []

    for doc in documents:
        text = doc.page_content
        doc_tokens = calculate_tokens(text, encoding)

        if doc_tokens > budget and not used_docs:
            # Oversized on its own: keep the leading part that fits, so the
            # document is consumed instead of being retried forever.
            text = encoding.decode(encoding.encode(text)[:budget])
            doc_tokens = budget
        elif token_count + doc_tokens > budget:
            continue

        parts.append(f"\n\n{text}\nSource: {doc.metadata['source']}")
        token_count += doc_tokens
        used_docs.append(doc)

    return "".join(parts), used_docs


def call_llm_final_answer(question, document, chain, stream=False):
    """Ask the chain to answer ``question`` from ``document`` with the final-answer prompt.

    The chain's ``prompt`` attribute is overwritten with ``FINAL_ANSWER_PROMPT``.

    Args:
        question: The user question.
        document: Text block holding the source material.
        chain: Chain exposing ``prompt``, ``predict`` and ``apredict``.
        stream: When true, ``chain.apredict`` is used and its result (to be
            awaited by the caller) is returned; otherwise ``chain.predict`` is
            called and its result is returned directly.
    """
    chain.prompt = FINAL_ANSWER_PROMPT
    run_prediction = chain.apredict if stream else chain.predict
    return run_prediction(question=question, document=document)


def call_llm_final_2_answer(question, document, chain):
    """Start the merge call that turns earlier partial answers into one answer.

    The chain's ``prompt`` attribute is overwritten with ``FINAL_ANSWER_2_PROMPT``
    and ``chain.apredict`` is invoked; its result is returned for the caller to await.

    Args:
        question: The user question.
        document: Text passed as the ``document`` template variable.
        chain: Chain exposing ``prompt`` and ``apredict``.
    """
    chain.prompt = FINAL_ANSWER_2_PROMPT
    pending_answer = chain.apredict(question=question, document=document)
    return pending_answer


def process_documents(question, chain, max_tokens=14_000):
    """Route the question, retrieve documents and group them into prompt-sized batches.

    Steps:
      1. Install ``ROUTER_PROMPT`` on ``chain`` and ask it for a workflow id.
         The first standalone digit 0-2 in the reply is used, so replies such
         as ``"1."`` or ``"Workflow: 2"`` are understood. Any failure falls
         back to workflow 0.
      2. Fetch documents from the module-level ``retriever`` for that workflow.
      3. Call ``concatenate_documents`` repeatedly until every document has
         been placed in a batch.

    Args:
        question: The user question.
        chain: Chain exposing ``prompt`` and ``predict``; its ``prompt`` is overwritten.
        max_tokens: Token budget of a single batch.

    Returns:
        ``(batches, num_llm_calls, workflow)`` where ``batches`` is a list of
        text blocks, ``num_llm_calls`` is the number of batches built, and
        ``workflow`` is the id that was used.
    """
    # Use router to decide which workflow to use
    chain.prompt = ROUTER_PROMPT
    try:
        router_reply = chain.predict(question=question)
        found = re.search(r"(?<!\d)[0-2](?!\d)", str(router_reply))
        if found is None:
            raise ValueError(f"no workflow id in router reply {router_reply!r}")
        workflow = int(found.group())
    except Exception as e:
        logger.error(f"Error in router: {e}")
        workflow = 0

    logger.info(f"Using workflow {workflow}")
    print(f"Using workflow {workflow}")

    documents = retriever.get_relevant_documents(question, workflow=workflow)
    batches = []
    num_llm_calls = 0
    while documents:
        batch, used_docs = concatenate_documents(documents, max_tokens)
        if not used_docs:
            # Nothing fits in the budget. Without this guard the same list
            # would be retried forever; drop the first document and continue.
            skipped = documents[0]
            logger.warning(
                f"Skipping document that does not fit in {max_tokens} tokens: "
                f"{skipped.metadata.get('source')}"
            )
            documents = documents[1:]
            continue

        batches.append(batch)
        documents = [doc for doc in documents if doc not in used_docs]
        num_llm_calls += 1
        logger.info(
            f"LLM call {num_llm_calls} complete. {len(documents)} documents remaining."
        )

    return batches, num_llm_calls, workflow


async def get_answer(question,max_tokens=14_000):
    """Answer ``question`` end to end: route, retrieve, batch and query the LLM.

    When the retrieved documents fit in exactly one batch, that batch is
    answered directly with the streaming chain. Otherwise each batch is
    answered separately, the partial answers are joined with a space, and a
    final merge call produces the answer.

    Note: ``get_streaming_chain`` replaces the model on ``base_chain`` itself,
    so the per-batch calls made through ``base_chain`` run on that model too.

    Args:
        question: The user question.
        max_tokens: Token budget of a single batch.

    Returns:
        The answer text produced by the last LLM call.
    """
    batches, num_llm_calls, workflow = process_documents(
        question=question,
        max_tokens=max_tokens,
        chain=base_chain,
    )
    chain_stream = get_streaming_chain(chain=base_chain, workflow=workflow)

    if num_llm_calls != 1:
        # Several batches (or none): answer each one, then merge the answers.
        partial_answers = [
            call_llm_final_answer(question=question, document=batch, chain=base_chain)
            for batch in batches
        ]
        merged_answers = " ".join(partial_answers)

        logger.info(f"Final LLM call with {len(partial_answers)} results.")
        return await call_llm_final_2_answer(
            question=question,
            document=merged_answers,
            chain=chain_stream,
        )

    # Single batch: one streamed call is enough.
    return await call_llm_final_answer(
        question=question,
        document=batches[0],
        chain=chain_stream,
        stream=True,
    )


In [16]:
# End-to-end demo of the routed pipeline. Top-level `await` works here because
# the cell runs in a notebook; the answer is streamed to stdout and also returned.
await get_answer("Write code to show how to use web3.js to retrieve feed data from a price feed?")

Using workflow 1
To retrieve feed data from a price feed using web3.js, you can follow these steps:

1. Import the necessary dependencies:
```javascript
const Web3 = require('web3');
const contractABI = require('<path_to_contract_ABI>');
```

2. Create a new instance of the web3 object:
```javascript
const web3 = new Web3('<provider_url>');
```

3. Get the contract instance using the contract ABI and contract address:
```javascript
const contract = new web3.eth.Contract(contractABI, '<contract_address>');
```

4. Call the contract function to retrieve the feed data:
```javascript
contract.methods.getLatestPrice().call((error, result) => {
  if (error) {
    console.error(error);
  } else {
    console.log(result);
  }
});
```

Replace `<path_to_contract_ABI>`, `<provider_url>`, and `<contract_address>` with the actual values for your contract.

This code assumes that you have the contract ABI and the contract address. If you don't have them, you can refer to the Chainlink documentation

"To retrieve feed data from a price feed using web3.js, you can follow these steps:\n\n1. Import the necessary dependencies:\n```javascript\nconst Web3 = require('web3');\nconst contractABI = require('<path_to_contract_ABI>');\n```\n\n2. Create a new instance of the web3 object:\n```javascript\nconst web3 = new Web3('<provider_url>');\n```\n\n3. Get the contract instance using the contract ABI and contract address:\n```javascript\nconst contract = new web3.eth.Contract(contractABI, '<contract_address>');\n```\n\n4. Call the contract function to retrieve the feed data:\n```javascript\ncontract.methods.getLatestPrice().call((error, result) => {\n  if (error) {\n    console.error(error);\n  } else {\n    console.log(result);\n  }\n});\n```\n\nReplace `<path_to_contract_ABI>`, `<provider_url>`, and `<contract_address>` with the actual values for your contract.\n\nThis code assumes that you have the contract ABI and the contract address. If you don't have them, you can refer to the Chainlin