In [1]:
from IPython.display import display, HTML, Markdown

In [10]:
import re
import bs4
import time
import json
import pickle
import requests
import html2text
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import concurrent.futures
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List, Optional, Set, Dict, Tuple, Union, Any
from requests.exceptions import RequestException

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from youtube_transcript_api import (
    NoTranscriptFound,
    TranscriptsDisabled,
    YouTubeTranscriptApi,
)

from langchain.docstore.document import Document
from langchain.document_loaders import YoutubeLoader
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# Settings for requests
MAX_THREADS = 10
REQUEST_DELAY = 0.1
SESSION = requests.Session()

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Set up the webdriver
s=Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s, options=chrome_options)

In [11]:
import time
import logging

logger = logging.getLogger(__name__)

### Data

In [54]:
def filter_links(soup, filter_str='/polygon/mainnet/'):
    # Get all links
    links = soup.find_all('a')
    
    # Filter links to only those that are for polygon mainnet
    hrefs = [link.get('href') for link in links]
    filtered_hrefs = [href for href in hrefs if href is not None and filter_str in href and href.count('/') == 4]

    return filtered_hrefs

In [55]:
def get_links(url):
    """
    Get all links from a given url
    """

    filter_sub_url = url.split("link")[1]

    all_links = []
    for i in range(10):
        if i == 0:
            driver.get(url)
            driver.implicitly_wait(7)
            time.sleep(7)

        else:
            driver.find_element(by="xpath", value="/html/body/div[1]/main/section[2]/div/div[2]/button[2]").click()
            driver.implicitly_wait(7)
            time.sleep(7)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        hrefs = filter_links(soup, filter_sub_url)
        all_links.extend(hrefs)
        logger.info(f"Page {i+1} scraped")
    
    # Add base url
    all_links = [f"https://data.chain.link{link}" for link in all_links]
    
    # Remove duplicates
    all_links = list(set(all_links))

    return all_links


In [71]:
def get_details(url):
    driver.get(u)
    driver.implicitly_wait(3)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    details = {}

    # Get the title
    details["pair"] = soup.find("h1").text

    # Get the details
    infos = soup.findAll("p")

    # Match pattern
    match = r'Minimum of (\d+)'

    prev_word = ""
    for info in infos:
        if prev_word == "Asset Name":
            details["asset_name"] = info.text
        elif prev_word == "Asset Class":
            details["asset_class"] = info.text
        elif prev_word == "Tier":
            details["tier"] = info.text
        elif prev_word == "Network":
            details["network"] = info.text
        elif prev_word == "Deviation threshold":
            details["deviation"] = info.text
        if re.search(match, prev_word):
            details["num_oracles"] = info.text
        prev_word = info.text

    try:
        for each in soup.find("div", class_="sc-d6e7e954-0 sc-e3a5e58-0 teTjm"):
            if each.name != "div":
                details["contract_address"] = each.a.text
    except:
        pass

    try:
        for each in soup.find("div", class_="sc-d6e7e954-0 sc-3ba96657-0 sc-b8182c9f-1 hRpMsk iSLEhf"):
                details["ens_address"] = each.div.next_sibling.text
    except:
        pass

    return details

In [97]:
all_base_urls = [
    "https://data.chain.link/ethereum/mainnet",
    "https://data.chain.link/polygon/mainnet",
    "https://data.chain.link/optimism/mainnet",
    "https://data.chain.link/fantom/mainnet",
    "https://data.chain.link/moonriver/mainnet",
    "https://data.chain.link/metis/mainnet",
    "https://data.chain.link/bsc/mainnet",
    "https://data.chain.link/arbitrum/mainnet",
    "https://data.chain.link/avalanche/mainnet",
    "https://data.chain.link/harmony/mainnet",
    "https://data.chain.link/moonbeam/mainnet",
]

eth_urls = get_links(all_base_urls[6])

In [98]:
eth_details = []

for u in tqdm(eth_urls[:10], total=len(eth_urls[:10])):
    try:
        eth_details.append(get_details(u))
    except Exception as e:
        logger.error(f'Failed to get details for {u}')
        logger.error(e)

100%|██████████| 10/10 [00:46<00:00,  4.65s/it]


In [99]:
def make_sentence(details):
    """Make a sentence from the details"""

    first_sentence = """The following is the details for the pair {pair} which operates on the {network}."""
    second_sentence = """This asset is named "{asset_name}".""" 
    third_sentence = """and falls under the "{asset_class}" asset class."""
    fourth_sentence = """It has a tier status of "{tier}".""" 
    fifth_sentence = """The deviation threshold for this asset is set at {deviation}.""" 
    sixth_sentence = """{num_oracles} oracles carries and support this asset.""" 
    seventh_sentence = """You can find its contract at the address "{contract_address}"""
    eigth_sentence = """, and its ENS address is "{ens_address}"."""

    sentence = first_sentence.format(pair=details["pair"], network=details["network"])

    if "asset_name" in details.keys():
        sentence += f" {second_sentence.format(asset_name=details['asset_name'])}"

    if "asset_class" in details.keys():
        sentence += f" {third_sentence.format(asset_class=details['asset_class'])}"

    if "tier" in details.keys():
        sentence += f" {fourth_sentence.format(tier=details['tier'])}"

    if "deviation" in details.keys():
        sentence += f" {fifth_sentence.format(deviation=details['deviation'])}"

    if "num_oracles" in details.keys():
        sentence += f" {sixth_sentence.format(num_oracles=details['num_oracles'])}"

    if "contract_address" in details.keys():
        sentence += f" {seventh_sentence.format(contract_address=details['contract_address'])}"

    if "ens_address" in details.keys():
        sentence += f" {eigth_sentence.format(ens_address=details['ens_address'])}"

    return sentence

In [100]:
sentences = []

for det in eth_details:
    sentences.append(make_sentence(det))

In [101]:
sentences

['The following is the details for the pair FXS / USD which operates on the BNB Chain Mainnet. This asset is named "Frax Share". and falls under the "Crypto" asset class. It has a tier status of "Verified". The deviation threshold for this asset is set at 0.5%. 15 / 15 oracles carries and support this asset. You can find its contract at the address "0x0e9d55932893fb1308882c7857285b2b0bcc4f4a',
 'The following is the details for the pair BAND / BNB which operates on the BNB Chain Mainnet. This asset is named "Band Protocol". and falls under the "Crypto" asset class. It has a tier status of "Verified". The deviation threshold for this asset is set at 0.5%. 15 / 15 oracles carries and support this asset. You can find its contract at the address "0x3334bf7ec892ca03d1378b51769b7782eaf318c4',
 'The following is the details for the pair MASK / USD which operates on the BNB Chain Mainnet. This asset is named "Mask Network". and falls under the "Crypto" asset class. It has a tier status of "Ver

### chain.links

In [4]:
def filter_urls_by_base_url(urls:List, base_url:str):
    """
    Filters a list of URLs and returns only those that include the base_url.

    :param urls: List of URLs to filter.
    :param base_url: Base URL to filter by.
    :return: List of URLs that include the base_url.
    """
    return [url for url in urls if base_url in url]

def normalize_url(url:str):
    """
    Normalize a URL by ensuring it ends with '/'.

    :param url: URL to normalize.
    :return: Normalized URL.
    """
    return url if url.endswith('/') else url + '/'

def fetch_url_request(url:str):
    """
    Fetches the content of a URL using requests library and returns the response.
    In case of any exception during fetching, logs the error and returns None.

    :param url: URL to fetch.
    :return: Response object on successful fetch, None otherwise.
    """
    try:
        response = SESSION.get(url)
        response.raise_for_status()
        return response
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def fetch_url_selenium(url:str):
    """
    Fetches the content of a URL using Selenium and returns the source HTML of the page.
    In case of any exception during fetching, logs the error and returns None.

    :param url: URL to fetch.
    :return: HTML source as a string on successful fetch, None otherwise.
    """
    try:
        driver.get(url)
        driver.implicitly_wait(7)
        time.sleep(7)
        return driver.page_source
    
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def process_url(response:requests.Response, visited:Set, base_url:str):
    """
    Process a URL response. Extract all absolute URLs from the response that 
    haven't been visited yet and belong to the same base_url.

    :param response: Response object from a URL fetch.
    :param visited: Set of URLs already visited.
    :param base_url: Base URL to filter by.
    :return: Set of new URLs to visit.
    """
    urls = set()
    if response:
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href is not None and '#' not in href:
                absolute_url = normalize_url(urljoin(response.url, href))
                if absolute_url not in visited and base_url in absolute_url:
                    visited.add(absolute_url)
                    urls.add(absolute_url)
    return urls

def get_all_suburls(url:str, visited:Optional[Set]=None):
    """
    Get all sub-URLs of a given URL that belong to the same domain.

    :param url: Base URL to start the search.
    :param visited: Set of URLs already visited.
    :return: Set of all sub-URLs.
    """
    if visited is None:
        visited = set()

    if not url.startswith("http"):
        url = "https://" + url

    base_url = url.split("//")[1].split("/")[0]
    urls = set()

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        future_responses = [executor.submit(fetch_url_request, url)]
        while future_responses:
            for future in concurrent.futures.as_completed(future_responses):
                future_responses.remove(future)
                response = future.result()
                new_urls = process_url(response, visited, base_url)
                urls.update(new_urls)
                if len(future_responses) < MAX_THREADS:
                    for new_url in new_urls:
                        future_responses.append(executor.submit(fetch_url_request, new_url))

    urls = filter_urls_by_base_url(urls, base_url)
    return urls

In [38]:
def is_there_video(soup:BeautifulSoup) -> List[str]:
    """Check if there is a video in the soup
    params:
        soup: BeautifulSoup object
    returns:
        video_links: List of video links
    """
    iframes = soup.find_all('iframe')
    video_links = []
    for iframe in iframes:
        src = iframe.get('src')
        if 'youtube' in src:
            video_links.append(src)

    return video_links

def get_youtube_docs(video_tags:List[str], chain_description) -> List[Document]:
    """Get youtube docs from the video tags
    params:
        video_tags: List of video tags
    returns:
        u_tube_docs: List of youtube docs
    """
    if video_tags:
        u_tube_docs = []
        for v_tag in video_tags:
            try:
                u_tube = json.loads(v_tag.script.string)["items"][0]["url"]
                u_tube_id = YoutubeLoader.extract_video_id(u_tube)
                u_tube_doc = YoutubeLoader(u_tube_id, add_video_info=True).load()[0]

                # Get description
                description = chain_description.predict(context=u_tube_doc.page_content[:1500])

                # Make sure its Chainlink video
                assert u_tube_doc.metadata["author"].lower() == "chainlink"
                metadata = {
                    "source":u_tube, 
                    "source_type":"video", 
                    "title":u_tube_doc.metadata["title"],
                    "description":description}

                # Update the metadata
                u_tube_doc.metadata = metadata

                # Append to the list
                u_tube_docs.append(u_tube_doc)

            except Exception as e:
                print(e)
                u_tube_doc = []
    else:
        u_tube_docs = []
    return u_tube_docs

In [39]:
def remove_prefix_text(markdown):
    # Split the content at the first title
    parts = re.split(r'^(#\s.+)$', markdown, maxsplit=1, flags=re.MULTILINE)

    # If a split occurred, then take the content from the first title onward
    new_text = parts[-2] + parts[-1] if len(parts) > 1 else markdown

    return new_text


def extract_first_n_paragraphs(content, num_para=2):

    # Split by two newline characters to denote paragraphs
    paragraphs = content.split('\n\n')
    
    # Return the first num_para paragraphs or whatever is available
    return '\n\n'.join(paragraphs[:num_para])


def scrap_url(url: str,  chain_description:LLMChain, driver: webdriver.Chrome=driver) -> Document:
    """Process a URL and return a list of words
    param url: URL to process
    param driver: Selenium driver
    return: Document object
    """
    driver.get(url)
    driver.implicitly_wait(2)
    time.sleep(2)

    # Get the page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Get the Markdown content
    # Remove images, videos, SVGs, and other media elements; also nav
    for media_tag in soup.find_all(['img', 'video', 'svg', 'audio', 'source', 'track', 'picture', 'nav']):
        media_tag.decompose()

    # Remove the footer (assuming it's in a <footer> tag or has a class/id like 'footer')
    for footer_tag in soup.find_all(['footer', {'class': 'footer'}, {'id': 'footer'}]):
        footer_tag.decompose()

    # Remove sections with class="section-page-alert"
    for page_alert in soup.find_all('div', class_='section-page-alert'):
        page_alert.decompose()

    # Remove sections with class="cta-subscribe"
    for cta_subscribe in soup.find_all(class_='cta-subscribe'):
        cta_subscribe.decompose()
        

    html_content = str(soup)
    h = html2text.HTML2Text()
    markdown_content = h.handle(html_content)

    # Remove the prefix
    markdown_content = remove_prefix_text(markdown_content)

    # Get the title
    titles = re.findall(r'^#\s(.+)$', markdown_content, re.MULTILINE)
    title = titles[0].strip() 

    # Get description
    para = extract_first_n_paragraphs(markdown_content, num_para=2)
    description = chain_description.predict(context=para)
    
    # Put the markdown content into a Document object
    doc = Document(page_content=markdown_content, metadata={
        "source": url, 
        "title": title, 
        "description": description, 
        "source_type": "main"})

    # Get YouTube docs
    video_tags = soup.find_all('a', href=True, class_="techtalk-video-lightbox")
    u_tube_docs = get_youtube_docs(video_tags, chain_description)    

    return doc, u_tube_docs

In [40]:
def get_description_chain():
    system_template = """
    Please summarize the context below in one sentence (no more than 15 words). This will be used as the description of the article in the search results.

    Response should be NO MORE THAN 15 words.
    """

    human_template = """{context}"""

    PROMPT = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(system_template),
            HumanMessagePromptTemplate.from_template(human_template),
        ]
    )

    llm = ChatOpenAI(temperature=0.)
    chain = LLMChain(llm=llm, prompt=PROMPT)

    return chain

In [41]:
def scrap_chain_link() -> Tuple[List[Dict], List[Dict]]:
    """
    Scrap all the urls from https://chain.link/ and save the main docs and you tube docs to disk
    return: Tuple[List[Dict], List[Dict]]
    """

    raw_urls = get_all_suburls("https://chain.link/")
    
    # Only keep urls that start with https://chain.link
    raw_urls = [url for url in raw_urls if url.startswith("https://chain.link")]

    # add chain.link/faqs if not in raw_urls
    if "https://chain.link/faqs" not in raw_urls:
        raw_urls.append("https://chain.link/faqs")

    # Remove duplicates
    raw_urls = list(set(raw_urls))

    # Process urls
    all_main_docs = []
    all_you_tube_docs = []

    # Get description chain
    chain_description = get_description_chain()

    for url in tqdm(raw_urls, total=len(raw_urls)):
        logger.info(f"Processing {url}")
        main_doc, you_tube_docs = scrap_url(url, chain_description)
        all_main_docs.append(main_doc)
        all_you_tube_docs.extend(you_tube_docs)
        logger.info(f"Processed {url}")

    # Save to disk as pickle
    with open(f"chain_link_main_docs_{datetime.now().strftime('%Y-%m-%d')}.pkl", "wb") as f:
        pickle.dump(all_main_docs, f)

    with open(f"chain_link_you_tube_docs_{datetime.now().strftime('%Y-%m-%d')}.pkl", "wb") as f:
        pickle.dump(all_you_tube_docs, f)

    logger.info("Done")

    return all_main_docs, all_you_tube_docs
    

In [42]:
all_main_docs, all_you_tube_docs = scrap_chain_link()

Error fetching mailto:custom@chain.link/: No connection adapters were found for 'mailto:custom@chain.link/'
Error fetching mailto:support@chain.link/: No connection adapters were found for 'mailto:support@chain.link/'
Error fetching https://blog.chain.link/build-deploy-and-sell-your-own-dynamic-nft/: 403 Client Error: Forbidden for url: https://blog.chain.link/build-deploy-and-sell-your-own-dynamic-nft/
Error fetching https://blog.chain.link/smart-contract-use-cases/: 403 Client Error: Forbidden for url: https://blog.chain.link/smart-contract-use-cases/
Error fetching https://blog.chain.link/: 403 Client Error: Forbidden for url: https://blog.chain.link/
Error fetching https://blog.chain.link/apis-smart-contracts-and-how-to-connect-them/: 403 Client Error: Forbidden for url: https://blog.chain.link/apis-smart-contracts-and-how-to-connect-them/
Error fetching https://blog.chain.link/chainlink-chinese-communities/: 403 Client Error: Forbidden for url: https://blog.chain.link/chainlink-ch

[]


  1%|▏         | 2/159 [00:08<10:50,  4.14s/it]

[]
[<a aria-haspopup="dialog" aria-label="open lightbox" class="techtalk-video-lightbox w-inline-block w-lightbox" href="#"><div class="btn btn-primary">Watch now</div><script class="w-json" type="application/json">{
  "items": [
    {
      "url": "https://www.youtube.com/watch?v=K3YmflNXbEc",
      "html": "<iframe class=\"embedly-embed\" src=\"//cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fwww.youtube.com%2Fembed%2FK3YmflNXbEc%3Ffeature%3Doembed&display_name=YouTube&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DK3YmflNXbEc&image=https%3A%2F%2Fi.ytimg.com%2Fvi%2FK3YmflNXbEc%2Fhqdefault.jpg&key=96f1f04c5f4143bcb0f2e68c87d65feb&type=text%2Fhtml&schema=youtube\" width=\"854\" height=\"480\" scrolling=\"no\" title=\"YouTube embed\" frameborder=\"0\" allow=\"autoplay; fullscreen\" allowfullscreen=\"true\"></iframe>",
      "height": 480,
      "width": 854,
      "thumbnailUrl": "https://i.ytimg.com/vi/K3YmflNXbEc/hqdefault.jpg",
      "type": "video"
    }
  ],
  "group": ""
}<

  3%|▎         | 4/159 [00:18<11:43,  4.54s/it]

[]


  3%|▎         | 5/159 [00:21<10:50,  4.22s/it]

[]


  4%|▍         | 6/159 [00:27<11:27,  4.50s/it]

[]


  4%|▍         | 7/159 [00:31<10:59,  4.34s/it]

[]


  5%|▌         | 8/159 [00:35<11:02,  4.39s/it]

### Check all docuemnts

In [85]:
with open('/home/marshath/play/chainlink/chainlink-assistant/data/chain_link_main_docs_2023-08-18.pkl', 'rb') as f:
    chain_link_docs = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/data/chain_link_you_tube_docs_2023-08-18.pkl', 'rb') as f:
    chain_link_youtube_docs = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/data/datadocs_2023-08-18.pkl', 'rb') as f:
    data_docs = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/data/blog_2023-08-18.pkl', 'rb') as f:
    blog_docs = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/data/techdocs_2023-08-18.pkl', 'rb') as f:
    tech_docs = pickle.load(f)

docs = chain_link_docs + chain_link_youtube_docs + data_docs + blog_docs + tech_docs

### Merge all docs

In [87]:
len(docs)

1509

In [90]:
with open('../data/documents.pkl', 'wb') as f:
    pickle.dump(docs, f)