In [22]:
import pickle
import re
import tiktoken
from pydantic import BaseModel
from typing import Any, Dict, List
from langchain.chains import LLMChain
from langchain.vectorstores import FAISS
from langchain.schema import BaseRetriever
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter

import logging

# Module-level logger; a later cell hands it to CustomRetriever.from_documents.
logger = logging.getLogger(__name__)

In [67]:
class CustomeSplitter:
    """Split only the documents that exceed a token threshold.

    Documents whose ``page_content`` encodes to more than ``chunk_threshold``
    tokens (``cl100k_base`` encoding) are split with a ``TokenTextSplitter``;
    all other documents are passed through untouched.

    Args:
        chunk_threshold: Token count above which a document is split.
        chunk_size: Chunk size handed to ``TokenTextSplitter``.
        chunk_overlap: Chunk overlap handed to ``TokenTextSplitter``.
    """

    def __init__(self, chunk_threshold=6000, chunk_size=6000, chunk_overlap=50):
        self.chunk_threshold = chunk_threshold
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.enc = tiktoken.get_encoding("cl100k_base")
        self.splitter = TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def token_counter(self, document):
        """Return the number of tokens ``self.enc`` produces for ``document.page_content``."""
        return len(self.enc.encode(document.page_content))

    def split(self, documents):
        """Return ``documents`` with every oversized document replaced by its chunks.

        Each chunk is a new ``Document`` whose metadata holds only a
        ``"source"`` key of the form ``"<original source> chunk <n>"`` where
        ``n`` is the chunk's position within its parent document.

        Splitting is best-effort: if counting tokens or splitting a document
        raises, the document is kept unsplit and the problem is printed.

        Args:
            documents: Iterable of objects exposing ``page_content`` and
                ``metadata``.

        Returns:
            A new list of documents/chunks in input order.
        """
        chunked_documents = []
        for doc_index, doc in enumerate(documents):
            try:
                if self.token_counter(doc) > self.chunk_threshold:
                    pieces = self.splitter.split_documents([doc])
                    # Build the full replacement list before extending so a
                    # failure half-way through cannot leave partial chunks
                    # next to the unsplit fallback appended below.
                    relabelled = [
                        Document(
                            page_content=piece.page_content,
                            metadata={
                                "source": f"{piece.metadata['source']} chunk {chunk_index}"
                            },
                        )
                        for chunk_index, piece in enumerate(pieces)
                    ]
                    chunked_documents.extend(relabelled)
                else:
                    chunked_documents.append(doc)
            except Exception as e:
                chunked_documents.append(doc)
                print(f"Error on document {doc_index}")
                print(e)
                # The failure may be precisely that "source" (or metadata) is
                # missing, so the handler must not index it unconditionally.
                doc_metadata = getattr(doc, "metadata", None) or {}
                print(doc_metadata.get("source", "<unknown source>"))

        return chunked_documents


def _strip_chunk_suffix(source):
    """Remove a trailing ``" chunk <n>"`` marker from a source string, if present."""
    if not isinstance(source, str):
        return source
    return re.sub(r"\s+chunk \d+$", "", source)


class CustomRetriever(BaseRetriever, BaseModel):
    """Retriever that maps vector-store hits back to stored full documents.

    Attributes are pydantic fields:
        full_docs: Documents searched by ``metadata["source"]`` in workflow 2.
        base_retriever: Retriever queried first for candidate documents.
        k_final: Maximum number of hits (workflow 1) or of distinct sources
            (workflow 2) that are kept.
        logger: Optional logger; a module logger is used when it is ``None``.
    """

    full_docs: List[Document]
    base_retriever: BaseRetriever = None
    k_final: int = 4

    logger: Any = None

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    @classmethod
    def from_documents(
        cls,
        full_docs: List[Document],
        vector_store: FAISS,
        search_kwargs: Dict[str, Any] = None,
        k_initial: int = 10,
        k_final: int = 4,
        logger: Any = None,
        **kwargs: Any,
    ):
        """Build a retriever on top of an existing vector store.

        Args:
            full_docs: Documents returned by workflow 2 lookups.
            vector_store: Store whose ``as_retriever`` provides the candidates.
            search_kwargs: Extra search options forwarded to ``as_retriever``;
                ``None`` means no extras. ``k_initial`` always sets ``"k"``.
            k_initial: Number of candidates requested from the vector store.
            k_final: Stored on the instance as ``k_final``.
            logger: Stored on the instance as ``logger``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            A configured ``CustomRetriever``.
        """
        # Copy into a fresh dict so neither the caller's dict nor a shared
        # default is ever mutated; k_initial keeps precedence for "k".
        retriever_kwargs = {**(search_kwargs or {}), "k": k_initial}

        return cls(
            full_docs=full_docs,
            base_retriever=vector_store.as_retriever(search_kwargs=retriever_kwargs),
            k_final=k_final,  # previously dropped, so the default was always used
            logger=logger,
        )

    def get_relevant_documents(self, query: str, workflow: int = 1) -> List[Document]:
        """Retrieve documents for ``query``.

        Workflow 1 (default) returns the first ``k_final`` hits of the base
        retriever. Workflow 2 takes the first ``k_final`` distinct sources
        among the hits (in hit order) and returns every entry of
        ``full_docs`` belonging to one of those sources, in ``full_docs``
        order; this can be more than ``k_final`` documents when a source has
        several chunks.

        Sources are compared with any trailing ``" chunk <n>"`` marker
        removed, and returned documents carry the un-suffixed source.

        Raises:
            KeyError: If a document involved has no ``"source"`` metadata.
        """
        # `logger` is optional on the model, so never call methods on None.
        log = self.logger if self.logger is not None else logging.getLogger(__name__)

        results = self.base_retriever.get_relevant_documents(query=query)
        log.info(f"Retrieved {len(results)} documents")
        log.info(f"Worflow: {workflow}")

        if workflow == 2:
            hit_sources = [_strip_chunk_suffix(doc.metadata["source"]) for doc in results]

            # De-duplicate while keeping hit order, then cap at k_final sources.
            doc_ids = list(dict.fromkeys(hit_sources))[: self.k_final]

            log.info(f"Retrieved {len(doc_ids)} unique documents")

            # Compare on the un-suffixed source so chunked full documents
            # match without relying on an earlier call having rewritten them.
            wanted = set(doc_ids)
            full_retrieved_docs = [
                d
                for d in self.full_docs
                if _strip_chunk_suffix(d.metadata["source"]) in wanted
            ]

            return self.prepare_source(full_retrieved_docs)

        return self.prepare_source(results[: self.k_final])

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Async retrieval is not supported."""
        raise NotImplementedError

    def prepare_source(self, documents: List[Document]) -> List[Document]:
        """Return documents whose ``"source"`` has no ``" chunk <n>"`` suffix.

        Documents that need no change are returned as-is; the others are
        replaced by copies so the stored documents are never modified.

        Raises:
            KeyError: If a document has no ``"source"`` metadata.
        """
        prepared = []
        for doc in documents:
            source = doc.metadata["source"]
            base_source = _strip_chunk_suffix(source)
            if base_source == source:
                prepared.append(doc)
            else:
                prepared.append(
                    Document(
                        page_content=doc.page_content,
                        metadata={**doc.metadata, "source": base_source},
                    )
                )

        return prepared

In [68]:
# Load the pickled StackOverflow documents (a mapping; only its values are kept).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("/home/marshath/play/chainlink/assistant/data/stackoverflow_documents.pkl", "rb") as f:
    so_docs_by_key = pickle.load(f)

so_docs = list(so_docs_by_key.values())

# NOTE(review): `documents` is not defined in any visible cell - presumably it
# comes from an earlier cell or kernel session; confirm before Restart & Run All.
# Keep only genuine Document instances, then append the StackOverflow documents.
final_docs = [item for item in documents if isinstance(item, Document)] + so_docs

In [69]:
# Full-document view: with the default settings only documents above 6000
# tokens are split, everything else is kept whole.
full_doc_splitter = CustomeSplitter()
chunked_full_documents = full_doc_splitter.split(final_docs)

# Embedding view: character-based chunks (size 1200, overlap 50) of the same
# source documents, used to build the vector index.
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=50)
split_docs = splitter.split_documents(final_docs)

# Embeds every chunk through OpenAIEmbeddings and indexes the vectors in FAISS.
vectorstore = FAISS.from_documents(split_docs, embedding=OpenAIEmbeddings())

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-apfNELnY4pAbHrx6LItJCss8 on tokens per min. Limit: 1000000 / min. Current: 808816 / min. Contact us through our help center at help.openai.com if you continue to have issues..


In [58]:
# Persist the FAISS vector store to disk with pickle so it can be reloaded
# without re-embedding.
with open("/home/marshath/play/chainlink/assistant/data/vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)

In [59]:
# Reload the pickled vector store, rebinding `vectorstore`.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("/home/marshath/play/chainlink/assistant/data/vectorstore.pkl", "rb") as f:
    vectorstore = pickle.load(f)

In [60]:
# Build the retriever: the vector store supplies k_initial candidate chunks,
# and `chunked_full_documents` is the pool of full documents looked up by source.
retriever = CustomRetriever.from_documents(
    chunked_full_documents, 
    vector_store=vectorstore,
    k_initial=10, 
    k_final=4, 
    logger=logger
)

In [65]:
# workflow=2 returns entries from the retriever's full_docs (matched by source)
# instead of the raw vector-store hits.
r = retriever.get_relevant_documents("what is chainlink", workflow=2)

In [66]:
# Show the source of each returned document, one per line.
for e in r:
    print(e.metadata['source'])

https://blog.chain.link/chainlink-enterprise-blockchain-middleware/
https://blog.chain.link/introducing-the-cross-chain-interoperability-protocol-ccip/
https://blog.chain.link/smart-contract-use-cases/
https://blog.chain.link/smart-contract-use-cases/
https://blog.chain.link/smart-contract-use-cases/
https://blog.chain.link/smart-contract-use-cases/
https://blog.chain.link/breaking-down-mixicles-and-its-potential-to-unlock-enterprise-demand-for-defi-applications-on-public-blockchains/
