In [1]:
import pickle
import re
import tiktoken
import numpy as np
from tqdm import tqdm
from pydantic import BaseModel
from typing import Any, Dict, List
from langchain.chains import LLMChain
from langchain.vectorstores import FAISS
from langchain.schema import BaseRetriever
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers import BM25Retriever, TFIDFRetriever
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter

import logging

# Notebook-wide logger that writes INFO-and-above records to a stream handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
# getLogger returns the same object every time this cell runs, so attaching a
# handler unconditionally would make each re-run duplicate every log line.
if not logger.handlers:
    logger.addHandler(ch)


In [2]:
# Read key/value pairs from a local .env file into the process environment
# (python-dotenv). Presumably this supplies API credentials for the langchain
# clients imported above -- confirm which variables are expected.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Every snapshot lives in the same directory and carries the same date suffix;
# change these two values to point the notebook at another checkout or scrape.
DATA_DIR = "/Users/arshath/play/chainlink-assistant/data"
SNAPSHOT_DATE = "2023-08-18"


def load_snapshot(name):
    """Unpickle ``<DATA_DIR>/<name>_<SNAPSHOT_DATE>.pkl`` and return its contents."""
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so only
    # point this at snapshots produced by a trusted pipeline.
    with open(f"{DATA_DIR}/{name}_{SNAPSHOT_DATE}.pkl", "rb") as f:
        return pickle.load(f)


blog_docs = load_snapshot("blog")
chainlink_docs = load_snapshot("chain_link_main_docs")
chainlink_youtube_docs = load_snapshot("chain_link_you_tube_docs")
data_docs = load_snapshot("datadocs")
tech_docs = load_snapshot("techdocs")

In [4]:
# Drop repeated chainlink_docs entries, keeping the first document seen for
# each distinct page_content and preserving the original order.
set_text = set()
filtered_chainlink_docs = []
for doc in chainlink_docs:
    if doc.page_content in set_text:
        continue
    set_text.add(doc.page_content)
    filtered_chainlink_docs.append(doc)

In [28]:
class SearchRetriever(BaseRetriever, BaseModel):
    """Combine TF-IDF retrievers over several document collections.

    One ``TFIDFRetriever`` is held per collection (blog, technical docs, data
    docs, main docs, YouTube docs) plus one built over a combined corpus.
    ``get_relevant_documents`` merges their hits and returns the documents'
    ``metadata`` dicts, de-duplicated on the ``'source'`` key.
    """

    blog_retriever: TFIDFRetriever
    tech_retriever: TFIDFRetriever
    data_retriever: TFIDFRetriever
    chain_link_retriever: TFIDFRetriever
    chain_link_youtube_retriever: TFIDFRetriever
    all_docs_retriever: TFIDFRetriever
    # Names searched for (as lowercase substrings) in the query and in matched
    # documents when ordering pair matches.
    networks: List[str] = [
        'ethereum', 'polygon', 'optimism', 'fantom', 'harmony',
        'moonriver', 'metis', 'bnb', 'arbitrum', 'avalanche',
        'gnosis', 'base', 'moonbeam'
    ]
    # Upper bound on the number of entries returned by get_relevant_documents.
    k_final: int = 20
    # Declared so the ``logger`` that from_documents forwards to the
    # constructor is stored instead of being dropped or rejected as an
    # undeclared field.
    logger: Any = None

    class Config:
        arbitrary_types_allowed = True

    @classmethod
    def from_documents(
        cls,
        blog_docs: List[Document],
        tech_docs: List[Document],
        data_docs: List[Document],
        chain_link_docs: List[Document],
        chain_link_youtube_docs: List[Document],
        k_final: int = 20,
        logger: Any = None,
        **kwargs: Any
    ):
        """Build every per-collection retriever and return a ``SearchRetriever``.

        Args:
            blog_docs: Documents indexed by ``blog_retriever``.
            tech_docs: Documents indexed by ``tech_retriever``.
            data_docs: Documents indexed by ``data_retriever``.
            chain_link_docs: Documents indexed by ``chain_link_retriever``
                after de-duplication on ``page_content``.
            chain_link_youtube_docs: Documents indexed by
                ``chain_link_youtube_retriever``.
            k_final: Maximum number of entries a query returns.
            logger: Optional logger stored on the instance.
            **kwargs: Accepted for call-site compatibility; not used.
        """
        # Each underlying retriever returns up to this many candidates.
        per_retriever_k = 30

        # Remove duplicates from chain_link_docs. For repeated page_content the
        # last document is kept, at the position of its first occurrence.
        unique_texts = {doc.page_content: doc for doc in chain_link_docs}
        filtered_chainlink_docs = list(unique_texts.values())

        blog_ret = TFIDFRetriever.from_documents(blog_docs, k=per_retriever_k)
        tech_ret = TFIDFRetriever.from_documents(tech_docs, k=per_retriever_k)
        data_ret = TFIDFRetriever.from_documents(data_docs, k=per_retriever_k)
        chain_link_ret = TFIDFRetriever.from_documents(filtered_chainlink_docs, k=per_retriever_k)
        chain_link_youtube_ret = TFIDFRetriever.from_documents(chain_link_youtube_docs, k=per_retriever_k)

        # NOTE(review): data_docs is not part of the combined corpus; data
        # documents are only reached through the pair match in
        # get_relevant_documents -- confirm this is intended.
        all_docs = blog_docs + tech_docs + filtered_chainlink_docs + chain_link_youtube_docs
        all_docs_ret = TFIDFRetriever.from_documents(all_docs, k=per_retriever_k)

        return cls(
            blog_retriever=blog_ret,
            tech_retriever=tech_ret,
            data_retriever=data_ret,
            chain_link_retriever=chain_link_ret,
            chain_link_youtube_retriever=chain_link_youtube_ret,
            all_docs_retriever=all_docs_ret,
            k_final=k_final,
            logger=logger
        )

    def get_relevant_documents(self, query: str, type_: str = 'all') -> List[Dict[str, Any]]:
        """Return metadata dicts of the documents relevant to ``query``.

        Args:
            query: Free-text query.
            type_: ``'all'`` merges every collection, ``'blog'`` uses only the
                blog retriever and ``'technical_document'`` only the technical
                docs retriever.

        Returns:
            At most ``k_final`` ``metadata`` dicts (not ``Document`` objects),
            unique on their ``'source'`` key.

        Raises:
            ValueError: If ``type_`` is not one of the three accepted values.
        """
        r_docs = []

        if type_ == "all":
            # Data documents mentioning the "AAA/BBB" pair found in the query
            # go first, at most two of them.
            matching_docs_for_pair = self.find_texts_for_pair(query, self.data_retriever.docs)

            if matching_docs_for_pair:
                ordered_texts = self.reorder_matched_texts_by_network(query, matching_docs_for_pair)
                r_docs.extend([doc.metadata for doc in ordered_texts[:2]])

            # Top 5 hits from the combined corpus; any overlap with the entries
            # above is removed by the 'source' de-duplication below.
            r_docs.extend([doc.metadata for doc in self.all_docs_retriever.get_relevant_documents(query)[:5]])

            retrievers = [
                self.tech_retriever,
                self.blog_retriever,
                self.chain_link_retriever,
                self.chain_link_youtube_retriever
            ]

            for ret in retrievers:
                # Stop consulting further retrievers once the cap is reached;
                # breaking out of the inner loop alone would let every
                # remaining retriever append one more entry.
                if len(r_docs) >= self.k_final:
                    break
                for doc in ret.get_relevant_documents(query):
                    if doc.metadata not in r_docs:
                        r_docs.append(doc.metadata)
                    if len(r_docs) >= self.k_final:
                        break

        elif type_ in ["blog", "technical_document"]:
            retriever = self.blog_retriever if type_ == "blog" else self.tech_retriever
            r_docs.extend([doc.metadata for doc in retriever.get_relevant_documents(query)[:self.k_final]])
        else:
            raise ValueError("type_ must be one of 'blog', 'technical_document', or 'all'")

        # Make sure no duplicates; use 'source' as unique identifier
        r_docs = list({doc['source']: doc for doc in r_docs}.values())
        # Enforce the cap even when k_final is smaller than the entries seeded
        # before the per-collection loop.
        return r_docs[:self.k_final]

    def extract_pair(self, query):
        """Return the first ``(base, quote)`` pair written as ``AAA/BBB`` in ``query``.

        Each side is 3-6 ASCII letters, matched case-insensitively, with at
        most one whitespace character on either side of the slash. Returns
        ``None`` when the query contains no such pair.
        """
        # NOTE: the pattern has no word boundaries, so a longer token
        # contributes only the up-to-6 letters adjacent to the slash.
        pattern = r'(?i)([a-z]{3,6})\s?/\s?([a-z]{3,6})'
        matches = re.findall(pattern, query)
        return matches[0] if matches else None

    def find_texts_for_pair(self, query, docs):
        """Return the ``docs`` whose text contains the pair named in ``query``.

        Case and spaces are ignored on both sides of the comparison. Returns
        an empty list when ``query`` names no pair.
        """
        pair = self.extract_pair(query)

        if not pair:
            return []

        normalized_pair = ('/'.join(pair)).lower().replace(" ", "")
        matching_docs = [doc for doc in docs if normalized_pair in doc.page_content.lower().replace(" ", "")]

        return matching_docs

    def reorder_matched_texts_by_network(self, query, matched_docs):
        """Move documents mentioning a network named in ``query`` to the front.

        Relative order inside each group is preserved. If the query names no
        known network, the documents come back in their original order.
        """
        query_lower = query.lower()
        matched_networks = [net for net in self.networks if net in query_lower]

        # Single pass partition; avoids re-scanning network_docs with an
        # equality test for every document.
        network_docs = []
        non_network_docs = []
        for doc in matched_docs:
            text = doc.page_content.lower()
            if any(net in text for net in matched_networks):
                network_docs.append(doc)
            else:
                non_network_docs.append(doc)

        return network_docs + non_network_docs

In [29]:
# Build the combined retriever from the five loaded collections; the
# k_final and logger arguments keep their defaults.
ret = SearchRetriever.from_documents(
    blog_docs=blog_docs,
    tech_docs=tech_docs,
    data_docs=data_docs,
    chain_link_docs=chainlink_docs,
    chain_link_youtube_docs=chainlink_youtube_docs,
)

In [30]:
# Sample query naming a price pair; the bare name on the last line makes the
# cell display the result.
r = ret.get_relevant_documents("aave/usd")
r

[{'title': 'AAVE / USD on Ethereum Mainnet',
  'description': 'Details for AAVE / USD on Ethereum Mainnet',
  'source_type': 'data',
  'source': 'https://data.chain.link/ethereum/mainnet/crypto-usd/aave-usd'},
 {'title': 'AAVE / USD on Polygon Mainnet',
  'description': 'Details for AAVE / USD on Polygon Mainnet',
  'source_type': 'data',
  'source': 'https://data.chain.link/polygon/mainnet/crypto-eth/aave-usd'},
 {'source': 'https://blog.chain.link/introducing-the-chainlink-on-chain-data-directory/',
  'source_type': 'blog',
  'title': 'Introducing the Chainlink On-Chain Data Directory: Data.eth',
  'description': 'Chainlink introduces the Chainlink On-Chain Data Directory, Data.eth, to enhance security features for DeFi applications.'},
 {'source': 'https://blog.chain.link/craft-whiskey-crypto-payments-with-chainlink-oracles/',
  'source_type': 'blog',
  'title': 'Craft Whiskey Crypto Payments With Chainlink Oracles',
  'description': 'Whiskey MarketMaker, a DeFi platform, uses Chain