In [1]:
import logging
import os
import pickle
import re
from typing import Any, Dict, List

import tiktoken
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import BaseRetriever
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from pydantic import BaseModel
from tqdm import tqdm

# Module-level logger used by the enrichment loops and handed to SearchRetriever.
# NOTE(review): no handler or level is configured in the visible cells, so the
# logger.info(...) calls below may not be displayed — confirm.
logger = logging.getLogger(__name__)

In [2]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# Presumably this supplies the credentials ChatOpenAI needs — TODO confirm.
load_dotenv()

True

In [3]:
from langchain.retrievers import TFIDFRetriever
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

In [4]:
# System message: asks the model for a one-sentence description of the article.
system_template = """
Please summarize the context below in one sentence (no more than 15 words). This will be used as the description of the article in the search results.

Response should be NO MORE THAN 15 words.
"""

# Human message: the article text is passed through as-is via {context}.
human_template = "{context}"

system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template(human_template)
PROMPT = ChatPromptTemplate.from_messages([system_message, human_message])

# Chat model with temperature set to 0, wrapped in a chain that fills PROMPT.
llm = ChatOpenAI(temperature=0.0)
chain = LLMChain(prompt=PROMPT, llm=llm)

In [5]:
# Load the scraped blog documents.
# The path is relative to the notebook (same ../data directory the other cells
# use) instead of an absolute path inside one user's home directory.
# NOTE: pickle.load can execute arbitrary code — only load files you produced
# yourself or otherwise trust.
with open("../data/blog_2023-08-14.pkl", "rb") as f:
    blogs = pickle.load(f)

In [6]:
# Attach search-result metadata (title, description, source, type) to each blog post.
blog_docs = []
for blog in tqdm(blogs, total=len(blogs)):
    # Everything before the first blank line is the title, with '#' characters removed.
    title = blog.page_content.partition("\n\n")[0].replace("#", "").strip()

    # The description is generated from the first 500 characters only.
    description = chain.predict(context=blog.page_content[:500])

    metadata = dict(
        title=title,
        description=description,
        source=blog.metadata["source"],
        source_type="blog",
    )
    logger.info(f"Title: {title}")
    logger.info(f"Description: {description}")

    # The previous metadata is replaced wholesale; only "source" is carried over.
    blog.metadata = metadata
    blog_docs.append(blog)

  0%|          | 0/500 [00:00<?, ?it/s]

 21%|██        | 103/500 [01:46<07:36,  1.15s/it]

In [None]:
# Persist the enriched blog documents.
with open("../data/search_blogs.pkl", "wb") as out_file:
    pickle.dump(blog_docs, out_file)

In [None]:
# Load the scraped technical documents.
# The path is relative to the notebook (same ../data directory the other cells
# use) instead of an absolute path inside one user's home directory.
# NOTE: pickle.load can execute arbitrary code — only load files you produced
# yourself or otherwise trust.
with open("../data/techdocs_2023-08-14.pkl", "rb") as f:
    techdocs = pickle.load(f)

In [None]:
# Attach search-result metadata (title, description, source, type) to each technical document.
tech_docs = []
for doc in tqdm(techdocs, total=len(techdocs)):
    # Everything before the first blank line is the title, with '#' characters removed.
    title = doc.page_content.partition("\n\n")[0].replace("#", "").strip()

    # The description is generated from the first 1500 characters only.
    description = chain.predict(context=doc.page_content[:1500])

    metadata = dict(
        title=title,
        description=description,
        source=doc.metadata["source"],
        source_type="technical_document",
    )
    logger.info(f"Title: {title}")
    logger.info(f"Description: {description}")

    # The previous metadata is replaced wholesale; only "source" is carried over.
    doc.metadata = metadata
    tech_docs.append(doc)

  0%|          | 0/172 [00:00<?, ?it/s]

100%|██████████| 172/172 [04:46<00:00,  1.66s/it]


In [None]:
# Persist the enriched technical documents.
with open("../data/search_docs.pkl", "wb") as out_file:
    pickle.dump(tech_docs, out_file)

In [11]:
# Reload the enriched technical documents from disk.
with open("../data/search_docs.pkl", "rb") as docs_file:
    docs = pickle.load(docs_file)

# Reload the enriched blog posts from disk.
with open("../data/search_blogs.pkl", "rb") as blogs_file:
    blogs = pickle.load(blogs_file)

In [12]:
# One TF-IDF retriever per corpus, each returning up to 30 candidates.
# Both are built from the lists reloaded from ../data (`blogs`, `docs`), so this
# cell no longer depends on the `techdocs` variable left over from an earlier cell.
blog_ret = TFIDFRetriever.from_documents(blogs, k=30)
techdocs_ret = TFIDFRetriever.from_documents(docs, k=30)

In [13]:
class SearchRetriever(BaseRetriever, BaseModel):
    """Search over blog posts and technical documents, returning result metadata.

    Holds one retriever per corpus. The lookup method returns the ``metadata``
    dict of each hit rather than the ``Document`` object itself.
    """

    # Retriever over the blog corpus.
    blog_retriever: BaseRetriever = None
    # Retriever over the technical-document corpus.
    tech_retriever: BaseRetriever = None
    # Maximum number of results returned by a query.
    k_final: int = 4
    # Optional logger; stored on the instance but not used by the methods below.
    logger: Any = None

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    @classmethod
    def from_documents(
        cls,
        blog_docs: List[Document],
        tech_docs: List[Document],
        k_final: int = 4,
        logger: Any = None,
        **kwargs: Any,
    ):
        """
        Build a SearchRetriever from two document collections.

        param blog_docs: Blog documents to index.
        param tech_docs: Technical documents to index.
        param k_final: Maximum number of results a query returns.
        param logger: Optional logger stored on the instance.
        param kwargs: Accepted for call compatibility; currently ignored.
        """
        # Each TF-IDF retriever fetches up to 30 candidates; k_final trims the
        # list that is finally returned to the caller.
        blog_ret = TFIDFRetriever.from_documents(blog_docs, k=30)
        tech_ret = TFIDFRetriever.from_documents(tech_docs, k=30)

        return cls(
            blog_retriever=blog_ret,
            tech_retriever=tech_ret,
            k_final=k_final,
            logger=logger,
        )

    def get_relevant_documents(self, query: str, type_: str = "all") -> List[Dict[str, Any]]:
        """
        Get the metadata of the documents most relevant to a query.

        param query: The query to search for.
        param type_: The type of documents to search for. Can be 'blog',
            'technical_document', or 'all'.
        return: At most k_final metadata dicts. For 'all', blog and technical
            results are interleaved, starting with a blog result.
        raises ValueError: If type_ is not one of the accepted values.
        """
        if type_ == "blog":
            hits = self.blog_retriever.get_relevant_documents(query)
            return [doc.metadata for doc in hits][: self.k_final]

        if type_ == "technical_document":
            hits = self.tech_retriever.get_relevant_documents(query)
            return [doc.metadata for doc in hits][: self.k_final]

        if type_ == "all":
            blog_hits = self.blog_retriever.get_relevant_documents(query)
            tech_hits = self.tech_retriever.get_relevant_documents(query)

            # Interleave the two rankings. Iterating up to the longer list
            # (rather than zip-ing) keeps the remaining hits of one corpus
            # when the other returns fewer results or none at all.
            merged = []
            for rank in range(max(len(blog_hits), len(tech_hits))):
                if rank < len(blog_hits):
                    merged.append(blog_hits[rank].metadata)
                if rank < len(tech_hits):
                    merged.append(tech_hits[rank].metadata)

            return merged[: self.k_final]

        raise ValueError("type_ must be one of 'blog', 'technical_document', or 'all'")

    async def aget_relevant_documents(self, query: str, type_: str = "all") -> List[Dict[str, Any]]:
        """
        Asynchronous counterpart of get_relevant_documents (not implemented).

        Accepts the same arguments as the synchronous method so that async
        callers receive NotImplementedError instead of a TypeError.
        """
        raise NotImplementedError("This method is not implemented yet.")

In [14]:
# Build the combined retriever from the enriched lists reloaded from ../data.
# `docs` (loaded from search_docs.pkl) is used for the technical corpus so the
# cell does not rely on the `techdocs` variable left over from an earlier cell.
ret = SearchRetriever.from_documents(blog_docs=blogs, tech_docs=docs, k_final=10, logger=logger)

In [17]:
# Smoke test: search both corpora (default type_) with a sample question.
ret.get_relevant_documents(
    "What is the deviation threshold for the pair ETH/USD on the mainnet?"
)

[{'title': 'Introducing the Chainlink On-Chain Data Directory: Data.eth',
  'description': 'Chainlink has launched the Chainlink On-Chain Data Directory using the Ethereum Name Service (ENS), creating an on-chain index of Chainlink Price Feed addresses to provide users with additional assurances and decentralization when relying on or sending funds to the correct on-chain address.',
  'source': 'https://blog.chain.link/introducing-the-chainlink-on-chain-data-directory/',
  'source_type': 'blog'},
 {'title': 'Call an API with HTTP Query Parameters',
  'description': 'This tutorial explains how to call an API using HTTP query parameters in a decentralized oracle network. It provides step-by-step instructions and code examples.',
  'source': 'https://docs.chain.link/chainlink-functions/tutorials/api-query-parameters/',
  'source_type': 'technical_document'},
 {'title': 'Analyze Decentralized Oracles in Real-Time With Chainlink’s Price Feed Visualizations',
  'description': "Chainlink's Pr

In [5]:
import requests

In [7]:
# Query the deployed search endpoint for technical documents.
# The API key is read from the environment (for example from a .env file) so it
# is never stored in the notebook; a missing variable fails fast with KeyError.
# NOTE(review): the key that used to be hard-coded in this cell has been
# exposed and should be rotated.
api_key = os.environ["CHAINLINK_SEARCH_API_KEY"]

res = requests.post(
    url="https://api.algoverai.link/chainlink/search",
    json={
        "query": "chainlink",
        "type_": "technical_document",
    },
    headers={"X-API-Key": api_key},
    # requests has no default timeout; without one a stalled server would
    # block the kernel indefinitely.
    timeout=30,
)
res.raise_for_status()
res.json()

{'results': [{'title': 'Chainlink Functions Resources',
   'description': 'A collection of resources for understanding and using Chainlink functions.',
   'source': 'https://docs.chain.link/chainlink-functions/resources/',
   'source_type': 'technical_document'},
  {'title': 'Performing System Maintenance',
   'description': 'This article provides an overview of performing system maintenance and includes examples of maintenance and image updates as well as failover node scenarios.',
   'source': 'https://docs.chain.link/chainlink-nodes/resources/performing-system-maintenance/',
   'source_type': 'technical_document'},
  {'title': 'What is Chainlink Functions?',
   'description': 'Chainlink Functions are a feature that allows developers to create custom computations and data transformations for smart contracts on the Chainlink network.',
   'source': 'https://docs.chain.link/chainlink-functions/',
   'source_type': 'technical_document'},
  {'title': 'Running a Chainlink Node',
   'descr