In [38]:
from IPython.display import display, HTML, Markdown

In [23]:
import pickle
import re
import tiktoken
from pydantic import BaseModel
from typing import Any, Dict, List
from langchain.chains import LLMChain
from langchain.vectorstores import FAISS
from langchain.schema import BaseRetriever
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
import logging

logger = logging.getLogger(__name__)

### Test data docs in retriever 

In [6]:
# Load the pickled `data_docs` collection from a hard-coded absolute local path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles you produced yourself. The absolute path also ties this
# cell to one machine; consider a configurable data directory.
with open("/home/marshath/play/chainlink/chainlink-assistant/data/datadocs_2023-08-16.pkl", "rb") as f:
    data_docs = pickle.load(f)
    

In [16]:
# Split `data_docs` into chunks (chunk_size=1200, chunk_overlap=100) and build
# a FAISS index over the chunks using OpenAIEmbeddings.
# NOTE(review): `vectorstrore` is misspelled; the retriever cell that follows
# reads the same name, so rename both together if you fix it.
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=100)
split_docs = splitter.split_documents(data_docs)
vectorstrore = FAISS.from_documents(split_docs, OpenAIEmbeddings())

In [25]:
# Retriever over the FAISS store, configured with search_kwargs k=4.
ret = vectorstrore.as_retriever(search_kwargs={"k":4})

# Chat model at temperature 0, wired into a "stuff"-type
# RetrievalQAWithSourcesChain that uses the retriever above.
llm = ChatOpenAI(temperature=0.)
chain = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, chain_type="stuff", retriever=ret)

In [73]:
# Ask one question two ways: fetch the retriever hits directly (kept in
# `r_docs` for inspection in a later cell) and run the full QA chain.
ques = "what is the contract address for ATOM / USD on moonbeam network?"
r_docs = ret.get_relevant_documents(ques)
answer = chain(ques)
# Render the chain's "answer" field as Markdown (last expression = cell output).
Markdown(answer["answer"])

The contract address for ATOM / USD on the Moonbeam network is "0x4f152d143c97b5e8d2293bc5b2380600f274a5dd".


In [69]:
# Render the text of the first retrieved document.
# NOTE(review): this cell's execution count (69) is lower than that of the
# cell that sets `ques`/`r_docs` (73), and the saved output describes
# AAVE / USD on Metis rather than ATOM / USD on Moonbeam — the output may be
# stale. Re-run top to bottom and confirm the answer is backed by the hits.
Markdown(r_docs[0].page_content)

The following is the details for the pair AAVE / USD which operates on the Metis Mainnet. This asset is named "Aave". and falls under the "Crypto" asset class. It has a tier status of "Verified". The deviation threshold for this asset is set at 0.5%. 15 / 15 oracles carries and support this asset. You can find its contract at the address "0x54389e89a5ec1d4312d5b5c48055d6e56a177bf9

# Retriever

In [2]:
class CustomeSplitter:
    """Split only the documents that exceed a token budget.

    A document whose ``page_content`` encodes to more than ``chunk_threshold``
    tokens (``cl100k_base`` encoding) is cut into pieces with a
    ``TokenTextSplitter``; every other document is passed through unchanged.
    """

    def __init__(self, chunk_threshold=6000, chunk_size=6000, chunk_overlap=50):
        """Store the limits and build the tokenizer and token splitter.

        Args:
            chunk_threshold: Token count above which a document is split.
            chunk_size: ``chunk_size`` handed to ``TokenTextSplitter``.
            chunk_overlap: ``chunk_overlap`` handed to ``TokenTextSplitter``.
        """
        self.chunk_threshold = chunk_threshold
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.enc = tiktoken.get_encoding("cl100k_base")
        self.splitter = TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def token_counter(self, document):
        """Return the number of tokens in ``document.page_content``."""
        return len(self.enc.encode(document.page_content))

    def split(self, documents):
        """Return ``documents`` with oversized entries replaced by their chunks.

        Each chunk is a new ``Document`` whose only metadata key is ``source``,
        set to ``"<original source> chunk <n>"`` where ``n`` is the chunk's
        position within its parent document.

        Splitting is best-effort: if anything fails for a document, the
        original document is kept as-is, the problem is printed, and
        processing continues with the next document.

        Args:
            documents: Iterable of objects exposing ``page_content`` and
                ``metadata``.

        Returns:
            A new list of documents, in input order.
        """
        chunked_documents = []
        for doc_index, doc in enumerate(documents):
            try:
                if self.token_counter(doc) > self.chunk_threshold:
                    pieces = self.splitter.split_documents([doc])
                    # Build the full list first so a failure part-way through
                    # never leaves a partial set of chunks in the result.
                    chunks = [
                        Document(
                            page_content=piece.page_content,
                            metadata={
                                "source": f"{piece.metadata['source']} chunk {chunk_index}"
                            },
                        )
                        for chunk_index, piece in enumerate(pieces)
                    ]
                    chunked_documents.extend(chunks)
                else:
                    chunked_documents.append(doc)
            except Exception as e:
                chunked_documents.append(doc)
                print(f"Error on document {doc_index}")
                print(e)
                # The failure may itself be a missing "source" key, so the
                # handler must not index the metadata unconditionally.
                metadata = getattr(doc, "metadata", None) or {}
                print(metadata.get("source", "<unknown source>"))

        return chunked_documents


class CustomRetriever(BaseRetriever, BaseModel):
    """Retriever that searches a vector store and can expand hits to full docs.

    Workflow 1 (default) returns the first ``k_final`` hits of the base
    retriever. Workflow 2 takes the distinct sources of those hits and returns
    the matching entries of ``full_docs`` instead. In both cases a trailing
    ``" chunk <n>"`` marker is removed from the ``source`` metadata of the
    returned documents.
    """

    # Documents returned by workflow 2; their "source" may carry a
    # " chunk <n>" suffix (the format CustomeSplitter.split produces).
    full_docs: List[Document]
    # Retriever queried first in both workflows.
    base_retriever: BaseRetriever = None
    # Workflow 1: max documents returned. Workflow 2: max distinct sources.
    k_final: int = 4

    # Optional logger; the module-level logger is used when this is None.
    logger: Any = None

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    @classmethod
    def from_documents(
        cls,
        full_docs: List[Document],
        vector_store: FAISS,
        search_kwargs: Dict[str, Any] = None,
        k_initial: int = 10,
        k_final: int = 4,
        logger: Any = None,
        **kwargs: Any,
    ):
        """Build a retriever on top of an existing vector store.

        Args:
            full_docs: Documents to return in workflow 2.
            vector_store: Store whose ``as_retriever`` provides the base
                retriever.
            search_kwargs: Extra search arguments for the base retriever.
                ``k`` is always taken from ``k_initial``.
            k_initial: Number of hits requested from the vector store.
            k_final: Number of documents / distinct sources to keep.
            logger: Optional logger for retrieval diagnostics.
            **kwargs: Accepted for call-site compatibility; unused.
        """
        # Copy so neither a shared default nor the caller's dict is mutated.
        retriever_kwargs = dict(search_kwargs or {})
        retriever_kwargs["k"] = k_initial

        return cls(
            full_docs=full_docs,
            base_retriever=vector_store.as_retriever(search_kwargs=retriever_kwargs),
            k_final=k_final,  # previously dropped, so the default always won
            logger=logger,
        )

    def _get_logger(self):
        """Return the configured logger, or the module logger when unset."""
        if self.logger is not None:
            return self.logger
        return logging.getLogger(__name__)

    @staticmethod
    def _base_source(source: str) -> str:
        """Strip a trailing ``" chunk <n>"`` marker from ``source``.

        Only the suffix is removed, so a source that merely contains the word
        "chunk" elsewhere (for example inside a URL) is left intact.
        """
        return re.sub(r"\s+chunk \d+\s*$", "", source)

    def get_relevant_documents(self, query: str, workflow: int = 1) -> List[Document]:
        """Return documents relevant to ``query``.

        Args:
            query: Search text passed to the base retriever.
            workflow: ``2`` expands hits to the matching ``full_docs``
                entries; any other value returns the hits themselves.

        Returns:
            Documents with the chunk marker removed from ``source``.
        """
        log = self._get_logger()

        results = self.base_retriever.get_relevant_documents(query=query)
        log.info(f"Retrieved {len(results)} documents")
        log.info(f"Workflow: {workflow}")

        if workflow != 2:
            return self.prepare_source(results[: self.k_final])

        # Distinct sources in order of first appearance, capped at k_final.
        ranked_sources = [self._base_source(doc.metadata["source"]) for doc in results]
        ranked_sources = list(dict.fromkeys(ranked_sources))[: self.k_final]
        log.info(f"Retrieved {len(ranked_sources)} unique documents")

        # Compare on the base source so that token-chunked full documents
        # ("<source> chunk <n>") still match the plain source of a hit.
        rank = {source: position for position, source in enumerate(ranked_sources)}
        full_retrieved_docs = [
            doc
            for doc in self.full_docs
            if self._base_source(doc.metadata["source"]) in rank
        ]
        # Stable sort: best-ranked source first, chunks keep their own order.
        # A chunked source contributes all of its chunks, so the result can
        # hold more than k_final documents.
        full_retrieved_docs.sort(
            key=lambda doc: rank[self._base_source(doc.metadata["source"])]
        )

        return self.prepare_source(full_retrieved_docs)

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Async retrieval is not supported."""
        raise NotImplementedError

    def prepare_source(self, documents: List[Document]) -> List[Document]:
        """Return ``documents`` with chunk markers removed from ``source``.

        Documents that need no change are returned as-is; the others are
        replaced by copies, so objects shared with ``full_docs`` or the
        vector store are never modified.
        """
        prepared = []
        for doc in documents:
            source = doc.metadata["source"]
            base_source = self._base_source(source)
            if base_source == source:
                prepared.append(doc)
            else:
                prepared.append(
                    Document(
                        page_content=doc.page_content,
                        metadata={**doc.metadata, "source": base_source},
                    )
                )

        return prepared

In [3]:
# Load the pickled `documents` collection from a hard-coded absolute local path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles you produced yourself.
with open("/home/marshath/play/chainlink/chainlink-assistant/data/documents_2023-08-14.pkl", "rb") as f:
    documents = pickle.load(f)

In [5]:
# Token-chunk only the oversized documents (CustomeSplitter defaults); the
# result is later passed to CustomRetriever as its full-document set.
full_doc_splitter = CustomeSplitter()
chunked_full_documents = full_doc_splitter.split(documents)

# Separately cut the same documents into chunks (chunk_size=1200,
# chunk_overlap=50) for embedding.
# NOTE(review): this rebinds `splitter` and `split_docs`, which an earlier
# cell also assigns with different settings.
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=50)
split_docs = splitter.split_documents(documents)

# Embed every chunk with OpenAIEmbeddings and index them in FAISS. The saved
# output below shows this call hitting the embeddings rate limit and retrying.
vectorstore = FAISS.from_documents(split_docs, embedding=OpenAIEmbeddings())

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-apfNELnY4pAbHrx6LItJCss8 on tokens per min. Limit: 1000000 / min. Current: 793893 / min. Contact us through our help center at help.openai.com if you continue to have issues..


In [6]:
# Persist the FAISS store with pickle so the embeddings need not be recomputed.
# NOTE(review): this path is under ".../chainlink/assistant/", while the data
# pickles are read from ".../chainlink/chainlink-assistant/" — confirm the
# different directory is intentional.
with open("/home/marshath/play/chainlink/assistant/data/vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)

In [7]:
# Reload the pickled FAISS store, replacing the in-memory `vectorstore`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles you produced yourself.
with open("/home/marshath/play/chainlink/assistant/data/vectorstore.pkl", "rb") as f:
    vectorstore = pickle.load(f)

In [8]:
# Build the custom retriever: the vector store is asked for k_initial=10 hits,
# and `chunked_full_documents` is the pool that workflow 2 returns from.
retriever = CustomRetriever.from_documents(
    chunked_full_documents, 
    vector_store=vectorstore,
    k_initial=10, 
    k_final=4, 
    logger=logger
)

In [51]:
# Query the retriever using workflow 2 (expand hits to full documents).
# NOTE(review): the saved TypeError names TFIDFRetriever, although
# CustomRetriever.get_relevant_documents accepts `workflow` — presumably
# `retriever` was bound to a different object when this cell last ran.
# Re-run from a fresh kernel to confirm.
r = retriever.get_relevant_documents("ora", workflow=2)

TypeError: TFIDFRetriever.get_relevant_documents() got an unexpected keyword argument 'workflow'

In [52]:
# Print the `source` metadata of every document in `r`.
for e in r:
    print(e.metadata['source'])

https://blog.chain.link/introducing-the-cross-chain-interoperability-protocol-ccip/
https://blog.chain.link/chainlink-enterprise-blockchain-middleware/
https://blog.chain.link/breaking-down-mixicles-and-its-potential-to-unlock-enterprise-demand-for-defi-applications-on-public-blockchains/


# Search

In [15]:
from langchain.retrievers import TFIDFRetriever

In [29]:
# Load the pickled blog documents from a hard-coded absolute local path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles you produced yourself.
with open("/home/marshath/play/chainlink/chainlink-assistant/data/blog_2023-08-14.pkl", "rb") as f:
    blogs = pickle.load(f)

In [30]:
# Load the pickled technical documents from a hard-coded absolute local path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles you produced yourself.
with open("/home/marshath/play/chainlink/chainlink-assistant/data/techdocs_2023-08-14.pkl", "rb") as f:
    techdocs = pickle.load(f)

In [32]:
# Stand-alone TF-IDF retrievers (k=30) over the blog and technical documents.
# NOTE(review): no later cell visible here reads `blog_ret` or `techdocs_ret`;
# SearchRetriever.from_documents builds its own retrievers.
blog_ret = TFIDFRetriever.from_documents(blogs, k=30)
techdocs_ret = TFIDFRetriever.from_documents(techdocs, k=30)

In [48]:
class SearchRetriever(BaseRetriever, BaseModel):
    """Keyword search over blog posts and technical documents.

    Wraps one retriever per collection and returns the *metadata* dicts of the
    best matches rather than the documents themselves.
    """

    # Retriever over the blog documents.
    blog_retriever: BaseRetriever = None
    # Retriever over the technical documents.
    tech_retriever: BaseRetriever = None
    # Maximum number of metadata entries returned per query.
    k_final: int = 4
    # Optional logger; stored but not used by this class.
    logger: Any = None

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    @classmethod
    def from_documents(
        cls,
        blog_docs: List[Document],
        tech_docs: List[Document],
        k_final: int = 4,
        logger: Any = None,
        k_initial: int = 30,
        **kwargs: Any,
    ):
        """Build TF-IDF retrievers for both collections.

        Args:
            blog_docs: Blog documents to index.
            tech_docs: Technical documents to index.
            k_final: Maximum number of results returned per query.
            logger: Optional logger stored on the instance.
            k_initial: Number of candidates each TF-IDF retriever is built
                with (``k``); raise it when ``k_final`` exceeds 30.
            **kwargs: Accepted for call-site compatibility; unused.
        """
        blog_ret = TFIDFRetriever.from_documents(blog_docs, k=k_initial)
        tech_ret = TFIDFRetriever.from_documents(tech_docs, k=k_initial)

        return cls(
            blog_retriever=blog_ret,
            tech_retriever=tech_ret,
            k_final=k_final,
            logger=logger,
        )

    def _top_metadata(self, retriever: BaseRetriever, query: str) -> List[Dict[str, Any]]:
        """Return the metadata of the first ``k_final`` hits of ``retriever``."""
        hits = retriever.get_relevant_documents(query)
        return [doc.metadata for doc in hits][: self.k_final]

    def get_relevant_documents(self, query: str, type_: str = "all") -> List[Dict[str, Any]]:
        """Return the metadata of the documents most relevant to ``query``.

        Args:
            query: The text to search for.
            type_: Which collection to search: ``'blog'``,
                ``'technical_document'``, or ``'all'`` (both, interleaved
                blog-first).

        Returns:
            At most ``k_final`` metadata dicts, best matches first.

        Raises:
            ValueError: If ``type_`` is not one of the supported values.
        """
        if type_ == "blog":
            return self._top_metadata(self.blog_retriever, query)

        if type_ == "technical_document":
            return self._top_metadata(self.tech_retriever, query)

        if type_ == "all":
            blog_hits = self.blog_retriever.get_relevant_documents(query)
            tech_hits = self.tech_retriever.get_relevant_documents(query)

            # Interleave blog/tech hits rank by rank. Unlike zip(), this keeps
            # the tail of the longer list when one collection returns fewer
            # (or no) hits.
            merged = []
            for position in range(max(len(blog_hits), len(tech_hits))):
                if position < len(blog_hits):
                    merged.append(blog_hits[position].metadata)
                if position < len(tech_hits):
                    merged.append(tech_hits[position].metadata)

            return merged[: self.k_final]

        raise ValueError("type_ must be one of 'blog', 'technical_document', or 'all'")

    async def aget_relevant_documents(self, query: str) -> List[Dict[str, Any]]:
        """Async retrieval is not supported."""
        raise NotImplementedError("This method is not implemented yet.")

In [49]:
# Build the combined blog/technical-document search retriever, returning up to
# 10 results per query.
# NOTE(review): this rebinds `ret`, which an earlier cell assigns to the FAISS
# retriever used by `chain`; a distinct name would avoid confusion on re-runs.
ret = SearchRetriever.from_documents(blog_docs=blogs, tech_docs=techdocs, k_final=10, logger=logger)

In [59]:
# Search only the technical documents; the result is a list of metadata dicts.
ret.get_relevant_documents("oracle", type_="technical_document")

[{'source': 'https://docs.chain.link/architecture-overview/architecture-request-model/',
  'type': 'technical_document'},
 {'source': 'https://docs.chain.link/getting-started/advanced-tutorial/',
  'type': 'technical_document'},
 {'source': 'https://docs.chain.link/any-api/find-oracle/',
  'type': 'technical_document'},
 {'source': 'https://docs.chain.link/any-api/api-reference/',
  'type': 'technical_document'},
 {'source': 'https://docs.chain.link/any-api/get-request/examples/large-responses/',
  'type': 'technical_document'},
 {'source': 'https://docs.chain.link/architecture-overview/architecture-decentralized-model/',
  'type': 'technical_document'},
 {'source': 'https://docs.chain.link/any-api/get-request/examples/existing-job-request/',
  'type': 'technical_document'},
 {'source': 'https://docs.chain.link/chainlink-functions/api-reference/functions-client/',
  'type': 'technical_document'},
 {'source': 'https://docs.chain.link/chainlink-nodes/oracle-jobs/jobs/',
  'type': 'techni