In [52]:
import re
import bs4
import time
import pickle
import requests
import html2text
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import concurrent.futures
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List, Optional, Set
from requests.exceptions import RequestException

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

from langchain.docstore.document import Document

# Settings for requests
MAX_THREADS = 10
REQUEST_DELAY = 0.1
SESSION = requests.Session()

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Set up the webdriver
s=Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s, options=chrome_options)

In [53]:
import time
import logging

logger = logging.getLogger(__name__)

In [54]:
def filter_links(soup, filter_str='/polygon/mainnet/'):
    # Get all links
    links = soup.find_all('a')
    
    # Filter links to only those that are for polygon mainnet
    hrefs = [link.get('href') for link in links]
    filtered_hrefs = [href for href in hrefs if href is not None and filter_str in href and href.count('/') == 4]

    return filtered_hrefs

In [55]:
def get_links(url):
    """
    Get all links from a given url
    """

    filter_sub_url = url.split("link")[1]

    all_links = []
    for i in range(10):
        if i == 0:
            driver.get(url)
            driver.implicitly_wait(7)
            time.sleep(7)

        else:
            driver.find_element(by="xpath", value="/html/body/div[1]/main/section[2]/div/div[2]/button[2]").click()
            driver.implicitly_wait(7)
            time.sleep(7)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        hrefs = filter_links(soup, filter_sub_url)
        all_links.extend(hrefs)
        logger.info(f"Page {i+1} scraped")
    
    # Add base url
    all_links = [f"https://data.chain.link{link}" for link in all_links]
    
    # Remove duplicates
    all_links = list(set(all_links))

    return all_links


In [71]:
def get_details(url):
    driver.get(u)
    driver.implicitly_wait(3)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    details = {}

    # Get the title
    details["pair"] = soup.find("h1").text

    # Get the details
    infos = soup.findAll("p")

    # Match pattern
    match = r'Minimum of (\d+)'

    prev_word = ""
    for info in infos:
        if prev_word == "Asset Name":
            details["asset_name"] = info.text
        elif prev_word == "Asset Class":
            details["asset_class"] = info.text
        elif prev_word == "Tier":
            details["tier"] = info.text
        elif prev_word == "Network":
            details["network"] = info.text
        elif prev_word == "Deviation threshold":
            details["deviation"] = info.text
        if re.search(match, prev_word):
            details["num_oracles"] = info.text
        prev_word = info.text

    try:
        for each in soup.find("div", class_="sc-d6e7e954-0 sc-e3a5e58-0 teTjm"):
            if each.name != "div":
                details["contract_address"] = each.a.text
    except:
        pass

    try:
        for each in soup.find("div", class_="sc-d6e7e954-0 sc-3ba96657-0 sc-b8182c9f-1 hRpMsk iSLEhf"):
                details["ens_address"] = each.div.next_sibling.text
    except:
        pass

    return details

In [97]:
all_base_urls = [
    "https://data.chain.link/ethereum/mainnet",
    "https://data.chain.link/polygon/mainnet",
    "https://data.chain.link/optimism/mainnet",
    "https://data.chain.link/fantom/mainnet",
    "https://data.chain.link/moonriver/mainnet",
    "https://data.chain.link/metis/mainnet",
    "https://data.chain.link/bsc/mainnet",
    "https://data.chain.link/arbitrum/mainnet",
    "https://data.chain.link/avalanche/mainnet",
    "https://data.chain.link/harmony/mainnet",
    "https://data.chain.link/moonbeam/mainnet",
]

eth_urls = get_links(all_base_urls[6])

In [98]:
eth_details = []

for u in tqdm(eth_urls[:10], total=len(eth_urls[:10])):
    try:
        eth_details.append(get_details(u))
    except Exception as e:
        logger.error(f'Failed to get details for {u}')
        logger.error(e)

100%|██████████| 10/10 [00:46<00:00,  4.65s/it]


In [99]:
def make_sentence(details):
    """Make a sentence from the details"""

    first_sentence = """The following is the details for the pair {pair} which operates on the {network}."""
    second_sentence = """This asset is named "{asset_name}".""" 
    third_sentence = """and falls under the "{asset_class}" asset class."""
    fourth_sentence = """It has a tier status of "{tier}".""" 
    fifth_sentence = """The deviation threshold for this asset is set at {deviation}.""" 
    sixth_sentence = """{num_oracles} oracles carries and support this asset.""" 
    seventh_sentence = """You can find its contract at the address "{contract_address}"""
    eigth_sentence = """, and its ENS address is "{ens_address}"."""

    sentence = first_sentence.format(pair=details["pair"], network=details["network"])

    if "asset_name" in details.keys():
        sentence += f" {second_sentence.format(asset_name=details['asset_name'])}"

    if "asset_class" in details.keys():
        sentence += f" {third_sentence.format(asset_class=details['asset_class'])}"

    if "tier" in details.keys():
        sentence += f" {fourth_sentence.format(tier=details['tier'])}"

    if "deviation" in details.keys():
        sentence += f" {fifth_sentence.format(deviation=details['deviation'])}"

    if "num_oracles" in details.keys():
        sentence += f" {sixth_sentence.format(num_oracles=details['num_oracles'])}"

    if "contract_address" in details.keys():
        sentence += f" {seventh_sentence.format(contract_address=details['contract_address'])}"

    if "ens_address" in details.keys():
        sentence += f" {eigth_sentence.format(ens_address=details['ens_address'])}"

    return sentence

In [100]:
sentences = []

for det in eth_details:
    sentences.append(make_sentence(det))

In [101]:
sentences

['The following is the details for the pair FXS / USD which operates on the BNB Chain Mainnet. This asset is named "Frax Share". and falls under the "Crypto" asset class. It has a tier status of "Verified". The deviation threshold for this asset is set at 0.5%. 15 / 15 oracles carries and support this asset. You can find its contract at the address "0x0e9d55932893fb1308882c7857285b2b0bcc4f4a',
 'The following is the details for the pair BAND / BNB which operates on the BNB Chain Mainnet. This asset is named "Band Protocol". and falls under the "Crypto" asset class. It has a tier status of "Verified". The deviation threshold for this asset is set at 0.5%. 15 / 15 oracles carries and support this asset. You can find its contract at the address "0x3334bf7ec892ca03d1378b51769b7782eaf318c4',
 'The following is the details for the pair MASK / USD which operates on the BNB Chain Mainnet. This asset is named "Mask Network". and falls under the "Crypto" asset class. It has a tier status of "Ver