In [2]:
import pickle
import requests

import sys
sys.path.append("../")


In [3]:
from datetime import datetime
import time

In [4]:
# get todays data in this format 1609459200
int(datetime.now().timestamp())

1695096350

### Test ingest call

In [6]:
res = requests.post("http://localhost:8000/ingest", headers={"Authorization": "Bearer 1234"})

### Test search call

In [5]:
res = requests.post("http://localhost:8000/search", json={"query": "aave/usd"}, headers={"Authorization": "Bearer 1234"})

res.raise_for_status()

res.json()

{'results': [{'title': 'AAVE / USD on Ethereum Mainnet',
   'description': 'Details for AAVE / USD on Ethereum Mainnet',
   'source_type': 'data',
   'source': 'https://data.chain.link/ethereum/mainnet/crypto-usd/aave-usd'},
  {'title': 'AAVE / USD on Polygon Mainnet',
   'description': 'Details for AAVE / USD on Polygon Mainnet',
   'source_type': 'data',
   'source': 'https://data.chain.link/polygon/mainnet/crypto-eth/aave-usd'},
  {'source': 'https://docs.chain.link/data-feeds/price-feeds/addresses/',
   'source_type': 'technical_document',
   'title': 'Price Feed Contract Addresses',
   'description': 'Addresses for price feed contracts on Ethereum Mainnet, Sepolia Testnet, and Goerli Testnet.'},
  {'source': 'https://docs.chain.link/data-feeds/rates-feeds/addresses/',
   'source_type': 'technical_document',
   'title': 'Rate and Volatility Feed Addresses',
   'description': 'Addresses for accessing rate and volatility feed on the Sepolia Testnet.'},
  {'source': 'https://docs.chai

### Test refresh

In [3]:
res = requests.post("http://localhost:8000/refresh", headers={"Authorization": "Bearer 1234"})

res.raise_for_status()

res.json()

{'message': 'Refreshed.'}

### Other tests

In [26]:
from search.search import SearchRetriever
from config import ROOT_DIR, get_logger

logger = get_logger(__name__)

In [27]:
folder = f'{ROOT_DIR}/data'
# Open blogs document
with open(f"{folder}/blog_documents.pkl", "rb") as f:
    blog_documents = pickle.load(f)

# Open technical documents
with open(f"{folder}/tech_documents.pkl", "rb") as f:
    technical_documents = pickle.load(f)

# data documents
with open(f"{folder}/data_documents.pkl", "rb") as f:
    data_documents = pickle.load(f)

# chain.link documents
with open(f"{folder}/chain_link_main_documents.pkl", "rb") as f:
    chain_link_documents = pickle.load(f)

# chainlink youtube documents
with open(f"{folder}/chain_link_you_tube_documents.pkl", "rb") as f:
    chain_link_youtube_documents = pickle.load(f)


chainlink_search_retrevier = SearchRetriever.from_documents(
    blog_docs=blog_documents,
    tech_docs=technical_documents,
    data_docs=data_documents,
    chain_link_docs=chain_link_documents,
    chain_link_youtube_docs=chain_link_youtube_documents,
    k_final=20, 
    logger=logger
)

In [29]:
chainlink_search_retrevier.get_relevant_documents("eth/usd", type_='all')

2023-09-09 15:07:26,192 - search.search - INFO - Query: eth/usd
INFO:search.search:Query: eth/usd


[{'title': 'ETH / USD on Ethereum Mainnet',
  'description': 'Details for ETH / USD on Ethereum Mainnet',
  'source_type': 'data',
  'source': 'https://data.chain.link/ethereum/mainnet/crypto-usd/eth-usd'},
 {'title': 'stETH / USD on Ethereum Mainnet',
  'description': 'Details for stETH / USD on Ethereum Mainnet',
  'source_type': 'data',
  'source': 'https://data.chain.link/ethereum/mainnet/crypto-usd/steth-usd'},
 {'source': 'https://blog.chain.link/introducing-the-chainlink-on-chain-data-directory/',
  'source_type': 'blog',
  'title': 'Introducing the Chainlink On-Chain Data Directory: Data.eth',
  'description': 'Chainlink introduces the Chainlink On-Chain Data Directory, Data.eth, to enhance security features for DeFi applications.'},
 {'source': 'https://docs.chain.link/data-feeds/ens/',
  'source_type': 'technical_document',
  'title': 'Using ENS with Data Feeds',
  'description': 'This article provides an overview of using ENS (Ethereum Name Service) with data feeds.'},
 {'so

## Test Individually

### Test technical documents

In [2]:
from ingest.docs import scrap_docs

2023-09-08 10:45:53,613 - config - INFO - SO_KEY: RBoPt4BA20Ned8AO8Fna*A((


In [6]:
docs = scrap_docs()

100%|██████████| 261/261 [04:23<00:00,  1.01s/it]
2023-09-08 10:56:26,969 - ingest.docs - INFO - Scraped technical documents.


In [14]:
with open("./data/tech_documents.pkl", "rb") as f:
    docs_loaded = pickle.load(f)

In [34]:
docs_loaded[0].metadata

{'source': 'https://docs.chain.link/any-api/get-request/introduction/',
 'source_type': 'technical_document',
 'title': 'Make a GET Request',
 'description': 'This article provides information on making a GET request, including examples and handling different types of responses.'}

In [15]:
# Make sure doc is unique

contents = []

for doc in docs_loaded:
    if doc.page_content in contents:
        print("Duplicate")
    else:
        contents.append(doc.page_content)

### Test blog documents (TODO:)

In [3]:
from ingest.blogs import scrap_blogs, get_blog_urls, fetch_url_content, to_markdown, get_description_chain, remove_prefix_text
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import re


2023-09-08 16:25:57,870 - config - INFO - SO_KEY: RBoPt4BA20Ned8AO8Fna*A((


In [25]:
# urls = get_blog_urls()
# MAX_WORKERS = 10
# # Use concurrent.futures to parallelize the fetching of URLs
# with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
#     soups = list(tqdm(executor.map(fetch_url_content, urls), total=len(urls)))

# unsuccessful_urls = [url for url, soup in soups if not soup]
# successful_soups = [(url, soup) for url, soup in soups if soup]

# # Use concurrent.futures to parallelize the markdown conversion
# with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
#     blogs = list(tqdm(executor.map(to_markdown, successful_soups), total=len(successful_soups)))

# chain_description = get_description_chain()
# blogs_documents = []
# for url, markdown in tqdm(blogs, total=len(blogs)):
#     markdown_content = remove_prefix_text(markdown)
#     titles = re.findall(r'^#\s(.+)$', markdown_content, re.MULTILINE)
#     title = titles[0].strip()       

2023-09-08 15:09:29,390 - ingest.blogs - INFO - Scraped 10 blog urls
2023-09-08 15:09:31,882 - ingest.blogs - INFO - Scraped 50 blog urls
2023-09-08 15:09:35,802 - ingest.blogs - INFO - Scraped 110 blog urls
2023-09-08 15:09:46,242 - ingest.blogs - INFO - Scraped 170 blog urls
2023-09-08 15:09:49,984 - ingest.blogs - INFO - Scraped 230 blog urls
2023-09-08 15:10:00,283 - ingest.blogs - INFO - Scraped 330 blog urls
2023-09-08 15:10:07,607 - ingest.blogs - INFO - Scraped 430 blog urls
2023-09-08 15:11:44,713 - ingest.blogs - ERROR - Failed to click on 'load more'. Error: Message: 



In [4]:
blogs = scrap_blogs()

2023-09-08 16:26:05,402 - ingest.blogs - INFO - Scraped 10 blog urls
2023-09-08 16:26:08,514 - ingest.blogs - INFO - Scraped 60 blog urls
2023-09-08 16:26:11,135 - ingest.blogs - INFO - Scraped 100 blog urls
2023-09-08 16:26:18,096 - ingest.blogs - INFO - Scraped 180 blog urls
2023-09-08 16:26:22,860 - ingest.blogs - INFO - Scraped 270 blog urls
2023-09-08 16:26:29,743 - ingest.blogs - INFO - Scraped 370 blog urls
2023-09-08 16:26:37,134 - ingest.blogs - INFO - Scraped 470 blog urls
2023-09-08 16:28:08,280 - ingest.blogs - ERROR - Failed to click on 'load more'. Error: Message: 

  0%|          | 0/510 [00:00<?, ?it/s]2023-09-08 16:28:20,330 - ingest.blogs - ERROR - Error scraping https://blog.chain.link/zero-knowledge-rollup/: object of type 'NoneType' has no len()
2023-09-08 16:28:21,753 - ingest.blogs - ERROR - Error scraping https://blog.chain.link/chainlink-hackathon-champions-reveal-their-winning-projects/: object of type 'NoneType' has no len()
2023-09-08 16:28:22,446 - ingest.b

KeyboardInterrupt: 

In [16]:
with open("./data/blog_documents.pkl", "rb") as f:
    blogs_loaded = pickle.load(f)

In [17]:
# Make sure all blogs are unique
contents = []
for blog in blogs_loaded:
    if blog.page_content in contents:
        print("Duplicate")
    else:
        contents.append(blog.page_content)

### Test chain.link

In [4]:
from ingest.chain_link import scrap_chain_link

2023-09-08 12:28:09,898 - config - INFO - SO_KEY: RBoPt4BA20Ned8AO8Fna*A((


In [None]:
docs = scrap_chain_link()

In [8]:
len(docs[0]), len(docs[1])

(152, 19)

In [13]:
with open('/home/marshath/play/chainlink/chainlink-assistant/data/today/chain_link_main_documents.pkl', 'rb') as f:
    docs_loaded_0 = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/data/today/chain_link_you_tube_documents.pkl', 'rb') as f:
    blogs_loaded_1 = pickle.load(f)

len(docs_loaded_0), len(blogs_loaded_1)

(155, 19)

In [18]:
with open('/home/marshath/play/chainlink/chainlink-assistant/nbs/data/chain_link_main_documents.pkl', 'rb') as f:
    docs_loaded_0 = pickle.load(f)

with open('/home/marshath/play/chainlink/chainlink-assistant/nbs/data/chain_link_you_tube_documents.pkl', 'rb') as f:
    blogs_loaded_1 = pickle.load(f)

len(docs_loaded_0), len(blogs_loaded_1)

(152, 19)

In [37]:
docs_loaded_0[0].metadata

{'source': 'https://chain.link/resources/blockchain-oracle-security/',
 'title': 'The Ultimate Guide to Blockchain Oracle Security',
 'description': 'The guide discusses security risks in blockchain oracles and how the right infrastructure can mitigate them.',
 'source_type': 'main'}

In [38]:
blogs_loaded_1[0].metadata

{'source': 'https://www.youtube.com/watch?v=flHWMXLDHD4',
 'source_type': 'video',
 'title': 'How bZx Automates Smart Contracts with Chainlink Automation | Chainlink Tech Talk #1',
 'description': 'BZX automates their smart contracts with Chainlink Keepers to overcome the limitations of smart contracts not being fully automated.'}

In [19]:
# Make sure all doc and blogs are unique

contents = []
for doc in docs_loaded_0:
    if doc.page_content in contents:
        print("Duplicate")
    else:
        contents.append(doc.page_content)

contents = []
for blog in blogs_loaded_1:
    if blog.page_content in contents:
        print("Duplicate")
    else:
        contents.append(blog.page_content)

Duplicate
Duplicate
Duplicate
Duplicate
Duplicate


### Test eduction

In [17]:
from ingest.education import scrap_education_docs

In [18]:
docs = scrap_education_docs()

Cloning into '.'...
2023-09-08 12:33:49,740 - ingest.education - INFO - Scrapped chainlink education documents.


In [20]:
len(docs)

51

In [20]:
with open('/home/marshath/play/chainlink/chainlink-assistant/nbs/data/education_documents.pkl', 'rb') as f:
    edu_docs = pickle.load(f)

In [40]:
edu_docs[0].metadata

{'source': 'https://github.com/oceanByte/chainlink-education/blob/a0b8886bd664423b40c8bd3661fdb7d61e975ea2/src/api/src/shared/course/courses/vrfIntroduction/Chapters/Chapter-6/course.md',
 'type': 'education'}

In [21]:
contents = []
for doc in edu_docs:
    if doc.page_content in contents:
        print("Duplicate")
    else:
        contents.append(doc.page_content)

### Test stackoverflow

In [24]:
from ingest.stackoverflow import scrap_stackoverflow
from ingest_script import get_access_token

In [25]:
access_token = get_access_token()

2023-09-08 12:35:08,112 - ingest_script - INFO - Initial get code response: 302
2023-09-08 12:35:08,113 - ingest_script - INFO - Initial get code response headers: /users/login?returnurl=https%3a%2f%2fstackoverflow.com%2foauth%3fclient_id%3d27148%26scope%3dread_inbox%26redirect_uri%3dhttp%3a%2f%2flocalhost
2023-09-08 12:35:09,457 - ingest_script - INFO - Login response: 200
2023-09-08 12:35:09,710 - ingest_script - INFO - Second get code response: 302
2023-09-08 12:35:09,711 - ingest_script - INFO - Second get code response headers: http://localhost/?code=ib02w7aDzomkYkI2WA302w))


In [27]:
access_token['access_token']

'Wu(beJ5kE1fbgm7MvjyEDg))'

In [28]:
docs = scrap_stackoverflow(access_token=access_token['access_token'])

2023-09-08 12:35:30,784 - ingest.stackoverflow - INFO - Key: RBoPt4BA20Ned8AO8Fna*A((
2023-09-08 12:35:30,788 - ingest.stackoverflow - INFO - Wu(beJ5kE1fbgm7MvjyEDg))
100%|██████████| 154/154 [02:50<00:00,  1.11s/it]
2023-09-08 12:38:40,762 - ingest.stackoverflow - INFO - Scrapped stackoverflow
2023-09-08 12:38:40,763 - ingest.stackoverflow - INFO - First document SO: page_content='Question: (Asked on: 2023-05-15 09:46:33)\nFind price feed contracts Addresses of chainlink within smart contracts\nURL: https://stackoverflow.com/questions/76252825/find-price-feed-contracts-addresses-of-chainlink-within-smart-contracts\n\nQuestion Body:\nI am developing a contract in which I have to check whether the incoming token\nis paired with USD to get the price feed of Chainlink from the smart contract.\nOnly such tokens are going to be added to my smart contract. Does Chainlink\nhave any functions to get the verified or registered list of pairs from a\nsmart contract?\n\nI have gone through <https:

In [31]:
docs[0].metadata

{'source': 'https://stackoverflow.com/questions/76252825/find-price-feed-contracts-addresses-of-chainlink-within-smart-contracts',
 'type': 'stackoverflow'}

### Test data

In [4]:
from ingest.data import scrap_data

2023-09-08 14:23:23,393 - config - INFO - SO_KEY: RBoPt4BA20Ned8AO8Fna*A((


In [5]:
docs = scrap_data()

Base URLs:   0%|          | 0/11 [00:00<?, ?it/s]2023-09-08 14:23:23,433 - ingest.data - INFO - Scraping https://data.chain.link/ethereum/mainnet
2023-09-08 14:24:37,318 - ingest.data - INFO - Total links: 175
2023-09-08 14:26:44,326 - ingest.data - INFO - Scraping https://data.chain.link/ethereum/mainnet done
Base URLs:   9%|▉         | 1/11 [03:20<33:28, 200.89s/it]2023-09-08 14:26:44,328 - ingest.data - INFO - Scraping https://data.chain.link/polygon/mainnet
2023-09-08 14:27:57,352 - ingest.data - INFO - Total links: 95
2023-09-08 14:29:18,236 - ingest.data - INFO - Scraping https://data.chain.link/polygon/mainnet done
Base URLs:  18%|█▊        | 2/11 [05:54<25:59, 173.26s/it]2023-09-08 14:29:18,240 - ingest.data - INFO - Scraping https://data.chain.link/optimism/mainnet
2023-09-08 14:30:30,186 - ingest.data - INFO - Total links: 85
2023-09-08 14:31:25,655 - ingest.data - INFO - Scraping https://data.chain.link/optimism/mainnet done
Base URLs:  27%|██▋       | 3/11 [08:02<20:18, 152

In [6]:
from IPython.display import display, HTML, Markdown

In [7]:
Markdown(docs[0].page_content)

The following is the details for the pair RAI / USD which operates on the Ethereum Mainnet. This asset is named "RAI Reflex Index". and falls under the "Crypto" asset class. It has a tier status of "Monitored". The deviation threshold for this asset is set at 0.5%. 19 / 19 oracles carries and support this asset. You can find its contract at the address "0x483d36f6a1d063d580c7a24f9a42b346f3a69fbb , and its ENS address is "rai-usd.data.eth".

In [8]:
Markdown(docs[1].page_content)

The following is the details for the pair XCN / USD which operates on the Ethereum Mainnet. This asset is named "Chain". and falls under the "Crypto" asset class. It has a tier status of "Verified". The deviation threshold for this asset is set at 2%. 16 / 16 oracles carries and support this asset. You can find its contract at the address "0xeb988b77b94c186053282bfcd8b7ed55142d3cab , and its ENS address is "xcn-usd.data.eth".

In [9]:
docs[0].metadata

{'title': 'RAI / USD on Ethereum Mainnet',
 'description': 'Details for RAI / USD on Ethereum Mainnet',
 'source_type': 'data',
 'source': 'https://data.chain.link/ethereum/mainnet/stablecoins/rai-usd'}

In [11]:
len(docs)

658

In [12]:
# make source all title is unique
titles = []
for doc in docs:
    if doc.metadata['title'] in titles:
        raise Exception("Title already exists")
    titles.append(doc.metadata['title'])