In [1]:
import pickle
import re
import tiktoken
from tqdm import tqdm
from pydantic import BaseModel
from typing import Any, Dict, List
from langchain.chains import LLMChain
from langchain.vectorstores import FAISS
from langchain.schema import BaseRetriever
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter

import logging

logger = logging.getLogger(__name__)

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from langchain.retrievers import TFIDFRetriever
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

In [4]:
system_template = """
Please summarize the context below in one sentence (no more than 15 words). This will be used as the description of the article in the search results.

Response should be NO MORE THAN 15 words.
"""

human_template = """{context}"""

PROMPT = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template(human_template),
    ]
)

llm = ChatOpenAI(temperature=0.)
chain = LLMChain(llm=llm, prompt=PROMPT)

In [5]:
with open("/home/marshath/play/chainlink/chainlink-assistant/data/blog_2023-08-14.pkl", "rb") as f:
    blogs = pickle.load(f)

In [6]:
blog_docs = []
for blog in tqdm(blogs, total=len(blogs)):
    title = blog.page_content.split("\n\n")[0].replace("#", "").strip()
    description = chain.predict(context=blog.page_content[:500])
    metadata = {
        "title": title,
        "description": description,
        "source": blog.metadata["source"],
        "source_type": "blog",
    }
    logger.info(f"Title: {title}")
    logger.info(f"Description: {description}")

    blog.metadata = metadata

    blog_docs.append(blog)

  0%|          | 0/500 [00:00<?, ?it/s]

 21%|██        | 103/500 [01:46<07:36,  1.15s/it]

In [None]:
with open("../data/search_blogs.pkl", 'wb') as f:
    pickle.dump(blog_docs, f)

In [None]:
with open("/home/marshath/play/chainlink/chainlink-assistant/data/techdocs_2023-08-14.pkl", "rb") as f:
    techdocs = pickle.load(f)

In [None]:
tech_docs = []
for doc in tqdm(techdocs, total=len(techdocs)):
    title = doc.page_content.split("\n\n")[0].replace("#", "").strip()
    description = chain.predict(context=doc.page_content[:1500])
    metadata = {
        "title": title,
        "description": description,
        "source": doc.metadata["source"],
        "source_type": "technical_document",
    }
    logger.info(f"Title: {title}")
    logger.info(f"Description: {description}")

    doc.metadata = metadata

    tech_docs.append(doc)

  0%|          | 0/172 [00:00<?, ?it/s]

100%|██████████| 172/172 [04:46<00:00,  1.66s/it]


In [None]:
with open("../data/search_docs.pkl", "wb") as f:
    pickle.dump(tech_docs, f)

In [11]:
with open("../data/search_docs.pkl", "rb") as f:
    docs = pickle.load(f)

with open("../data/search_blogs.pkl", "rb") as f:
    blogs = pickle.load(f)

In [5]:
with open('/home/marshath/play/chainlink/chainlink-assistant/data/blog_2023-08-18.pkl', 'rb') as f:
    blog_docs = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/data/techdocs_2023-08-18.pkl', 'rb') as f:
    tech_docs = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/data/datadocs_2023-08-18.pkl', 'rb') as f:
    data_docs = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/data/chain_link_main_docs_2023-08-18.pkl', 'rb') as f:
    chain_link_docs = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/data/chain_link_you_tube_docs_2023-08-18.pkl', 'rb') as f:
    chain_link_youtube_docs = pickle.load(f)



In [12]:
class SearchRetriever(BaseRetriever, BaseModel):
    blog_retriever: BaseRetriever = None
    tech_retriever: BaseRetriever = None
    data_retriever: BaseRetriever = None
    chain_link_retriever: BaseRetriever = None
    chain_link_youtube_retriever: BaseRetriever = None
    k_final: int = 4
    logger: Any = None

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    @classmethod
    def from_documents(
        cls,
        blog_docs: List[Document],
        tech_docs: List[Document],
        data_docs: List[Document],
        chain_link_docs: List[Document],
        chain_link_youtube_docs: List[Document],
        k_final: int = 4,
        logger: Any = None,
        **kwargs: Any,
    ):
        blog_ret = TFIDFRetriever.from_documents(blog_docs, k=30)
        tech_ret = TFIDFRetriever.from_documents(tech_docs, k=30)
        data_ret = TFIDFRetriever.from_documents(data_docs, k=30)
        chain_link_ret = TFIDFRetriever.from_documents(chain_link_docs, k=30)
        chain_link_youtube_ret = TFIDFRetriever.from_documents(chain_link_youtube_docs, k=30)

        return cls(
            blog_retriever=blog_ret,
            tech_retriever=tech_ret,
            data_retriever=data_ret,
            chain_link_retriever=chain_link_ret,
            chain_link_youtube_retriever=chain_link_youtube_ret,
            k_final=k_final,
            logger=logger,
        )

    def get_relevant_documents(self, query: str, type_:str='all') -> List[Document]:
        """
        Get relevant documents for a given query.

        param query: The query to search for.
        param type_: The type of documents to search for. Can be 'blog', 'tech', or 'all'.
        """

        if type_ == "blog":
            r_docs = self.blog_retriever.get_relevant_documents(query)

            # Get only the metadata from the original documents
            r_docs = [doc.metadata for doc in r_docs][:self.k_final]

            return r_docs

        if type_ == "technical_document":
            r_docs = self.tech_retriever.get_relevant_documents(query)

            # Get only the metadata from the original documents
            r_docs = [doc.metadata for doc in r_docs][:self.k_final]

            return r_docs

        if type_ == "all":
            r_docs_1 = self.blog_retriever.get_relevant_documents(query)
            r_docs_2 = self.tech_retriever.get_relevant_documents(query)
            r_docs_3 = self.data_retriever.get_relevant_documents(query)
            r_docs_4 = self.chain_link_retriever.get_relevant_documents(query)
            r_docs_5 = self.chain_link_youtube_retriever.get_relevant_documents(query)

            # Merge the two lists; one object per document
            r_docs = []
            for d1, d2, d3, d4, d5 in zip(r_docs_1, r_docs_2, r_docs_3, r_docs_4, r_docs_5):
                r_docs.append(d1)
                r_docs.append(d2)
                r_docs.append(d3)
                r_docs.append(d4)
                r_docs.append(d5)

                if len(r_docs) >= self.k_final:
                    break

            return r_docs[:self.k_final]

        raise ValueError("type_ must be one of 'blog', 'technical_document', or 'all'")
    
    def aget_relevant_documents(self):
        raise NotImplementedError("This method is not implemented yet.")

In [13]:
ret = SearchRetriever.from_documents(
    blog_docs=blog_docs, 
    tech_docs=tech_docs,
    data_docs=data_docs,
    chain_link_docs=chain_link_docs,
    chain_link_youtube_docs=chain_link_youtube_docs, 
    k_final=10, logger=logger)

In [15]:
rs = ret.get_relevant_documents("What is the deviation threshold for the pair ETH/USD on the mainnet?")

In [16]:
for r in rs:
    print(r.metadata)

{'source': 'https://blog.chain.link/introducing-the-chainlink-on-chain-data-directory/', 'source_type': 'blog', 'title': 'Introducing the Chainlink On-Chain Data Directory: Data.eth', 'description': 'Chainlink introduces the Chainlink On-Chain Data Directory, Data.eth, to enhance security features for DeFi applications.'}
{'source': 'https://docs.chain.link/chainlink-functions/tutorials/api-query-parameters/', 'source_type': 'technical_document', 'title': 'Call an API with HTTP Query Parameters', 'description': 'This is a tutorial on how to call an API using HTTP query parameters.'}
{'title': 'WBTC / USD on Harmony Mainnet', 'description': 'Details for WBTC / USD on Harmony Mainnet', 'source_type': 'data', 'source': 'https://data.chain.link/harmony/mainnet/crypto-usd/wbtc-usd'}
{'source': 'https://chain.link/education-hub/defi-ecosystem/', 'title': 'The DeFi Ecosystem', 'description': 'October 15, 2021', 'source_type': 'main'}
{'source': 'https://youtu.be/VH2Kj1mPCcQ', 'source_type': '

In [17]:
import requests

In [20]:
res = requests.post(
    url="https://api.algoverai.link/chainlink/search",
    json={
        "query": "chainlink",
        "type_": "all",
    },
    headers={ "X-API-Key": "7a08c35c471a0c4e9a7e3485acc601639a07694588fb2e78a67d48c3926d1837"}
)
res.raise_for_status()
res.json()

{'results': [{'source': 'https://blog.chain.link/three-years-on-mainnet/',
   'source_type': 'blog',
   'title': 'Three Years on Mainnet',
   'description': 'Chainlink has become a critical oracle infrastructure for Web3 since launching on Ethereum mainnet three years ago.'},
  {'source': 'https://docs.chain.link/chainlink-functions/resources/',
   'source_type': 'technical_document',
   'title': 'Chainlink Functions Resources',
   'description': 'A collection of resources for understanding and using Chainlink functions.'},
  {'title': 'LINK / USD on Polygon Mainnet',
   'description': 'Details for LINK / USD on Polygon Mainnet',
   'source_type': 'data',
   'source': 'https://data.chain.link/polygon/mainnet/crypto-usd/link-usd'},
  {'source': 'https://chain.link/ecosystem/',
   'title': 'Open-source development and a growing ecosystem of users',
   'description': 'Chainlink has a thriving open-source community and ecosystem of users.',
   'source_type': 'main'},
  {'source': 'https://

In [1]:
### add title and description chain.link

In [16]:
with open("/home/marshath/play/chainlink/chainlink-assistant/data/chain_link_main_docs_2023-08-18.pkl", 'rb') as f:
    chain_link_docs = pickle.load(f)

with open("/home/marshath/play/chainlink/chainlink-assistant/data/chain_link_you_tube_docs_2023-08-18.pkl", 'rb') as f:
    chain_link_youtube_docs = pickle.load(f)

In [17]:
for doc in chain_link_docs:
    if 'faq' in doc.metadata['source']:
        print(doc)

page_content="[](/)\n\nMenu\n\n[Start building](https://docs.chain.link/docs)\n\n# Integrate Chainlink  \ntrust-minimized services\n\n## Get started:\n\nOutline your use case and product requirements.\n\nFind the right solutions for your company\n\nReceive guidance and technical expertise integrating Chainlink\n\nNew to Chainlink and smart contracts?\n\n[Read the docs](https://docs.chain.link/)\n\n### Talk to an expert\n\nEmail address*\n\nWhat’s the website of your project/company?*\n\nWhat brought you to Chainlink?*Select one...Integrate Market and Data\nFeedsIntegrate FunctionsIntegrate AutomationIntegrate Verifiable Random\nFunction (VRF)Integrate Proof of Reserve (PoR)Build Cross-Chain\nApplicationsBecome a Node OperatorBecome a Data ProviderOther/I don't know\n\nWhat blockchain(s) are you building on?* Select one...ArbitrumAvalancheBNB\nChainEthereumFantomOptimismPolygonSolanaMulti-ChainOther\n\nOther\n\nEstimated launch time frame*Select one...Within 1 month1-3 months3-6 months6

In [20]:
title_system_template = """
Please summarize the context below in one sentence (no more than 5 words) as the title for the context

Response should be NO MORE THAN 5 words.
"""

title_human_template = """{context}"""

TITLE_PROMPT = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template(human_template),
    ]
)

description_system_template = """
Please summarize the context below in one sentence (no more than 15 words). This will be used as the description of the article in the search results.

Response should be NO MORE THAN 15 words.
"""

description_human_template = """{context}"""

DESCRIPTION_PROMPT = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template(human_template),
    ]
)

llm = ChatOpenAI(temperature=0.)
chain_title = LLMChain(llm=llm, prompt=TITLE_PROMPT)
chain_description = LLMChain(llm=llm, prompt=DESCRIPTION_PROMPT)

In [21]:
def extract_first_n_paragraphs(content, num_para=2):

    # Split by two newline characters to denote paragraphs
    paragraphs = content.split('\n\n')
    
    # Return the first num_para paragraphs or whatever is available
    return '\n\n'.join(paragraphs[:num_para])

In [25]:
chain_link_docs_2 = []
for doc in tqdm(chain_link_docs, total=len(chain_link_docs)):
    para = extract_first_n_paragraphs(doc.page_content, num_para=2)
    title = chain_title.predict(context=para)
    description = chain_description.predict(context=para)
    metadata = {
        "title": title,
        "description": description,
        "source": doc.metadata["source"],
        "source_type": "main_website",
    }
    logger.info(f"Title: {title}")
    logger.info(f"Description: {description}")

    doc.metadata = metadata

    chain_link_docs_2.append(doc)

100%|██████████| 155/155 [03:35<00:00,  1.39s/it]


In [33]:
chain_link_youtube_docs_2 = []
for doc in tqdm(chain_link_youtube_docs, total=len(chain_link_youtube_docs)):
    title = chain_title.predict(context=doc.page_content[:500])
    description = chain_description.predict(context=doc.page_content[:500])
    metadata = {
        "title": title,
        "description": description,
        "source": doc.metadata["source"],
        "source_type": "videos",
    }
    logger.info(f"Title: {title}")
    logger.info(f"Description: {description}")

    doc.metadata = metadata

    chain_link_youtube_docs_2.append(doc)

100%|██████████| 19/19 [00:44<00:00,  2.36s/it]


In [34]:
for d in chain_link_youtube_docs_2:
    print(d.metadata)

{'title': 'The article discusses the concept of composability in the blockchain ecosystem and its impact on decentralized finance.', 'description': 'The article discusses the concept of composability in the blockchain ecosystem and its impact on decentralized finance.', 'source': 'https://www.youtube.com/watch?v=kI4iYM3rAeI', 'source_type': 'videos'}
{'title': 'The discussion in the Chainlink Tech Talk 15 is about tokenized gold, with a focus on Cash and their Cash Gold token, featuring a deep dive by Diaz London, the CTO of Cash, on how they work with Chainlink Proof of Reserve.', 'description': 'The discussion in the Chainlink Tech Talk 15 is about tokenized gold, with a focus on Cash and their Cash Gold token, featuring a deep dive by Diaz London, the CTO of Cash, on how they work with Chainlink Proof of Reserve.', 'source': 'https://www.youtube.com/watch?v=tPURU6Sq2yo', 'source_type': 'videos'}
{'title': 'Foreign music communities use charging stations to rent batteries, buy water 