In [1]:
import sys

sys.path.append('../')

import os
import faiss
import pickle
from chat.utils import CustomeSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
# Root folder holding the pickled documents and the FAISS artefacts.
# A non-empty DATA_DIR environment variable takes precedence so the notebook can
# run on another machine; otherwise fall back to the original local path.
DATA_DIR = os.environ.get('DATA_DIR') or '/Users/arshath/play/chainlink-assistant/data'

# Export the resolved value through the process environment as well.
# NOTE(review): chat.utils is imported above, before this assignment; if it reads
# DATA_DIR from the environment at import time it will not see this value — confirm.
os.environ['DATA_DIR'] = DATA_DIR



In [17]:
# Load the pickled main and blog document collections.
# Paths are built from DATA_DIR (set in the first cell) instead of repeating the
# absolute folder, matching how the vector-store cell builds its paths.
# pickle.load can execute arbitrary code: only open pickles produced by this project.
with open(f"{DATA_DIR}/chain_link_main_documents.pkl", 'rb') as f:
    main_documents = pickle.load(f)

with open(f"{DATA_DIR}/blog_documents.pkl", 'rb') as f:
    blog_documents = pickle.load(f)

In [18]:
# Keep every main document whose source URL does not contain the terms page URL.
filtered_main_documents = [
    document
    for document in main_documents
    if 'https://chain.link/terms' not in document.metadata['source']
]

In [19]:
# Sanity check: print any source that still contains the terms URL.
# After the filter above this is expected to print nothing.
for remaining in filtered_main_documents:
    source_url = remaining.metadata['source']
    if 'https://chain.link/terms' in source_url:
        print(source_url)

In [20]:
# Persist the filtered list. This overwrites the same pickle that main_documents
# was loaded from, so the unfiltered version is no longer on disk afterwards.
# The path is built from DATA_DIR (set in the first cell) rather than hard-coded.
with open(f"{DATA_DIR}/chain_link_main_documents.pkl", 'wb') as f:
    pickle.dump(filtered_main_documents, f)

In [21]:
def _load_documents(file_name):
    """Unpickle and return the object stored in ``file_name`` under DATA_DIR.

    Args:
        file_name: Name of a pickle file located directly inside DATA_DIR.

    Returns:
        Whatever object the pickle contains.

    Note:
        pickle.load can execute arbitrary code; only use it on files produced
        by this project.
    """
    with open(f"{DATA_DIR}/{file_name}", 'rb') as f:
        return pickle.load(f)


# One collection per source. blog_documents and main_documents are re-read here,
# so main_documents now reflects the filtered pickle written in the previous cell.
tech_documents = _load_documents('tech_documents.pkl')
blog_documents = _load_documents('blog_documents.pkl')
main_documents = _load_documents('chain_link_main_documents.pkl')
you_tube_documents = _load_documents('chain_link_you_tube_documents.pkl')
education_documents = _load_documents('education_documents.pkl')
stackoverflow_documents = _load_documents('stackoverflow_documents.pkl')
data_documents = _load_documents('data_documents.pkl')

In [22]:
# Create all documents
# Merge the per-source collections, in this order, into one list.
# data_documents is deliberately not part of this list: the vector-store cell
# below splits and indexes it on its own.
all_documents = []
for documents in (
    tech_documents,
    blog_documents,
    main_documents,
    you_tube_documents,
    education_documents,
    stackoverflow_documents,
):
    all_documents.extend(documents)

# Persist the merged list next to the other pickles; the path is built from
# DATA_DIR (set in the first cell) rather than hard-coded.
with open(f"{DATA_DIR}/documents.pkl", 'wb') as f:
    pickle.dump(all_documents, f)

In [23]:
def _save_vectorstore(store, index_name, pickle_name):
    """Write a FAISS vector store into DATA_DIR as an index file plus a pickle.

    The raw index is written with ``faiss.write_index``. ``store.index`` is then
    set to None before the store object is pickled, so the pickle does not
    contain the index. This mutates ``store``: it cannot be queried afterwards
    until an index is attached again.

    Args:
        store: Vector store returned by ``FAISS.from_documents``.
        index_name: File name for the raw FAISS index inside DATA_DIR.
        pickle_name: File name for the pickled store inside DATA_DIR.
    """
    faiss.write_index(store.index, f"{DATA_DIR}/{index_name}")
    store.index = None
    with open(f"{DATA_DIR}/{pickle_name}", "wb") as f:
        pickle.dump(store, f)


# Split documents into chunks for 16k model
# NOTE(review): chunked_full_documents is not used again in this cell — confirm
# whether a later step needs it or whether this call can be dropped.
full_doc_splitter = CustomeSplitter()
chunked_full_documents = full_doc_splitter.split(all_documents)

# Character-based chunks (1200 characters, 50 overlap) used for both vector stores.
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=50)
split_docs = splitter.split_documents(all_documents)

# Create vectorstore for all documents
vectorstore_all = FAISS.from_documents(split_docs, embedding=OpenAIEmbeddings())

# Split documents into chunks using datadocs
split_docs_data = splitter.split_documents(data_documents)

# Create vectorstore for datadocs
vectorstore_data = FAISS.from_documents(
    split_docs_data, embedding=OpenAIEmbeddings()
)

# Save vectorstores to disk (index file first, then the pickled store without it).
_save_vectorstore(vectorstore_all, "docs_all.index", "faiss_store_all.pkl")
_save_vectorstore(vectorstore_data, "docs_data.index", "faiss_store_data.pkl")



### Test new search


In [6]:
from chat.utils import get_search_retriever

In [7]:
# Build the retriever through the helper imported from chat.utils.
ret = get_search_retriever()

In [8]:
# Run a sample query; the cells below inspect individual entries of the result.
res = ret.get_relevant_documents("test")

2023-10-13 11:14:06,320 - search.search - INFO - Query: test
INFO:search.search:Query: test


In [9]:
res[0]

{'source': 'https://blog.chain.link/hardhat-starter-kit-typescript/',
 'source_type': 'blog',
 'title': 'The Chainlink Hardhat Starter Kit Now Supports TypeScript',
 'description': 'The Chainlink Hardhat Starter Kit now supports TypeScript, offering new features for developers.'}

In [10]:
res[1]

{'source': 'https://blog.chain.link/testing-chainlink-smart-contracts/',
 'source_type': 'blog',
 'title': 'Testing Chainlink Smart Contracts',
 'description': 'Developers have a couple of options for writing automated tests for Chainlink smart contracts.'}

In [11]:
res[2]

{'source': 'https://blog.chain.link/how-to-create-an-nft-game/',
 'source_type': 'blog',
 'title': 'How To Create an NFT Game',
 'description': 'The tutorial explains how to create a blockchain-based game using NFTs.'}

In [12]:
res[3]

{'source': 'https://blog.chain.link/timelock-smart-contracts/',
 'source_type': 'blog',
 'title': 'How To Build Timelock Smart Contracts',
 'description': 'Learn how to build timelock smart contracts that enable queueing ERC-20 token minting into a time-based window.'}

In [13]:
res[4]

{'source': 'https://blog.chain.link/how-to-build-a-crypto-game/',
 'source_type': 'blog',
 'title': 'How To Build A Crypto Game',
 'description': 'Learn how to build and deploy a full-stack dApp crypto game on the Ethereum Goerli test network.'}

In [14]:
res[5]

{'source': 'https://docs.chain.link/vrf/v2/subscription/examples/test-locally/',
 'source_type': 'technical_document',
 'title': 'Local testing using a Mock contract',
 'description': 'The article discusses local testing using a mock contract for the VRF v2 subscription method.'}

In [15]:
res[6]

{'source': 'https://docs.chain.link/chainlink-automation/compatible-contract-best-practice/',
 'source_type': 'technical_document',
 'title': 'Automation Best Practices',
 'description': 'Best practices for using Chainlink Automation to create secure and reliable Automation-compatible contracts.'}

In [35]:
res[7]

{'source': 'https://docs.chain.link/ccip/test-tokens/',
 'source_type': 'technical_document',
 'title': 'CCIP Test Tokens',
 'description': 'CCIP supports two test tokens for thorough application testing on testnet.'}

In [36]:
res[8]

{'source': 'https://docs.chain.link/vrf/v2/direct-funding/examples/test-locally/',
 'source_type': 'technical_document',
 'title': 'Local testing using a Mock contract',
 'description': 'The article discusses local testing using a mock contract for the VRF v2 guide.'}

In [37]:
res[9]

{'source': 'https://docs.chain.link/getting-started/deploy-your-first-contract?parent=dataStreams/',
 'source_type': 'technical_document',
 'title': 'You can write your first smart contract and run it in your browser without any\nknowledge about Ethereum or blockchains. This guide shows you how easy it is\nto develop smart contracts using the [Solidity\nlanguage](https://soliditylang.org/), a [MetaMask wallet](https://metamask.io)\nand the [Remix Development Environment](https://remix.ethereum.org/). You can\nuse all of these tools in your browser for free with no signup required.',
 'description': 'Develop smart contracts using Solidity language, MetaMask wallet, and Remix Development Environment in the browser.'}

In [38]:
res[10]

{'source': 'https://docs.chain.link/ccip/best-practices/',
 'source_type': 'technical_document',
 'title': 'CCIP Best Practices',
 'description': 'Best practices for deploying secure and reliable cross-chain dApps on mainnet.'}

In [39]:
res[11]

{'source': 'https://docs.chain.link/chainlink-nodes/external-initiators/external-initiators-in-nodes/',
 'source_type': 'technical_document',
 'title': 'Adding External Initiators to Nodes',
 'description': 'This article discusses the process of adding external initiators to nodes in a network.'}

In [40]:
res[12]

{'source': 'https://docs.chain.link/any-api/find-oracle/',
 'source_type': 'technical_document',
 'title': 'Find Existing Jobs',
 'description': 'This page provides guidance on finding an Oracle Job for your API call.'}

In [41]:
res[13]

{'source': 'https://docs.chain.link/chainlink-nodes/external-adapters/node-operators/',
 'source_type': 'technical_document',
 'title': 'Bridges: Adding External Adapters to Nodes',
 'description': 'Adding external adapters to Chainlink nodes is done by creating a bridge in the Node Operators Interface.'}

In [42]:
res[14]

{'source': 'https://docs.chain.link/getting-started/conceptual-overview/',
 'source_type': 'technical_document',
 'title': 'Smart Contract Overview',
 'description': 'An overview of smart contract development and oracle networks.'}

In [43]:
res[15]

{'source': 'https://docs.chain.link/data-feeds/solana/',
 'source_type': 'technical_document',
 'title': 'Data Feeds on Solana',
 'description': 'Chainlink provides data feeds on the Solana network using Off-Chain Reporting (OCR) to aggregate data from centralized and decentralized exchanges, with no dependencies on external blockchain networks like Ethereum.'}

In [44]:
res[16]

{'source': 'https://docs.chain.link/chainlink-automation/register-upkeep-in-contract/',
 'source_type': 'technical_document',
 'title': 'Register Upkeeps Programmatically',
 'description': 'A guide on how to programmatically register upkeeps within a smart contract for interaction and management.'}

In [45]:
res[17]

{'source': 'https://docs.chain.link/resources/create-a-chainlinked-project/',
 'source_type': 'technical_document',
 'title': 'Install Frameworks',
 'description': 'This context is too vague to provide a specific summary.'}

In [46]:
res[18]

{'source': 'https://docs.chain.link/resources/hackathon-resources/',
 'source_type': 'technical_document',
 'title': 'Hackathon Resources',
 'description': 'This article provides resources for hackathons.'}

In [33]:
# Print the metadata of the first 26 blog documents (positions 0 through 25).
for position, blog_doc in enumerate(blog_documents):
    print(blog_doc.metadata)
    if position >= 25:
        break

{'source': 'https://blog.chain.link/how-to-verify-a-smart-contract-on-etherscan/', 'source_type': 'blog', 'title': 'How To Create an NFT Game', 'description': 'The tutorial explains how to create a blockchain-based game using NFTs.'}
{'source': 'https://blog.chain.link/announcing-the-winner-of-oracle-olympics-1/', 'source_type': 'blog', 'title': 'How To Create an NFT Game', 'description': 'The tutorial explains how to create a blockchain-based game using NFTs.'}
{'source': 'https://blog.chain.link/how-to-create-an-nft-game/', 'source_type': 'blog', 'title': 'Announcing the Chainlink Fall 2021 Hackathon Winners', 'description': 'Chainlink Fall 2021 Hackathon winners announced, with over 7,800 signups and $550k in prizes awarded.'}
{'source': 'https://blog.chain.link/chainlink-fall-2021-hackathon-winners/', 'source_type': 'blog', 'title': 'CCIP Officially Launches on Mainnet', 'description': "Chainlink's Cross-Chain Interoperability Protocol (CCIP) has launched on mainnet with support fr

In [2]:
import ingest
from ingest.blogs import scrap_blogs, get_blog_urls, fetch_url_content, get_blog
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from tqdm import tqdm

In [3]:
# Re-scrape the blog posts with ingest.blogs.scrap_blogs and rebind blog_documents.
# Slow: the recorded run below took several minutes.
blog_documents = scrap_blogs()

2023-10-13 09:54:18,992 - ingest.blogs - INFO - Scraped 10 blog urls
INFO:ingest.blogs:Scraped 10 blog urls
2023-10-13 09:54:19,653 - ingest.blogs - INFO - Scraped 20 blog urls
INFO:ingest.blogs:Scraped 20 blog urls
2023-10-13 09:54:20,843 - ingest.blogs - INFO - Scraped 50 blog urls
INFO:ingest.blogs:Scraped 50 blog urls
2023-10-13 09:54:22,560 - ingest.blogs - INFO - Scraped 90 blog urls
INFO:ingest.blogs:Scraped 90 blog urls
2023-10-13 09:55:50,607 - ingest.blogs - ERROR - Failed to click on 'load more'. Error: Message: 

ERROR:ingest.blogs:Failed to click on 'load more'. Error: Message: 

2023-10-13 09:55:50,609 - ingest.blogs - INFO - Total number of blog urls: 120
INFO:ingest.blogs:Total number of blog urls: 120
100%|██████████| 120/120 [02:14<00:00,  1.12s/it]
100%|██████████| 120/120 [01:52<00:00,  1.07it/s]
2023-10-13 09:59:57,013 - ingest.blogs - INFO - Scraped blog posts
INFO:ingest.blogs:Scraped blog posts


In [5]:
# Print the titles of the first 26 blog documents (positions 0 through 25).
for position, blog_doc in enumerate(blog_documents):
    print(blog_doc.metadata['title'])
    if position >= 25:
        break

What Are ABI and Bytecode in Solidity?
How To Build Dynamic ERC-721 NFTs Using Three Chainlink Trust-Minimized Services
How To Build and Deploy a Smart Contract on Arbitrum
How To Build A Crypto Game
How to Build a Blockchain Lottery
How To Create a BEP-20 Token on BNB Chain
How To Verify a Smart Contract on Etherscan
How To Create an NFT Game
CCIP Officially Launches on Mainnet
Build a Marine Insurance Smart Contract With Chainlink
Blockchain Voting Using a Chainlink Alarm Clock Oracle
Congratulations to the Winners of the Chainlink Virtual Hackathon 2020
How To Create NFT Domain Names
Easily Sell Your APIs and Data to Any Blockchain via Chainlink
The Chainlink Fall 2022 Hackathon: Learn. Build. Win.
How to Build and Deploy an Avalanche Smart Contract
How to Build and Deploy a Solana Smart Contract
Announcing the Winner of Oracle Olympics #1
Why Launch a Crypto Startup? Web3 Opportunities and GTM Strategies
How To Read A Smart Contract On Etherscan
The Top Ways for Smart Contract Deve

In [2]:
from ingest.blogs import scrap_blogs

SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 116
Current browser version is 118.0.5993.88 with binary path /Applications/Google Chrome.app/Contents/MacOS/Google Chrome
Stacktrace:
0   chromedriver                        0x0000000104be265c chromedriver + 4318812
1   chromedriver                        0x0000000104bdad00 chromedriver + 4287744
2   chromedriver                        0x000000010480c7ec chromedriver + 296940
3   chromedriver                        0x000000010483a980 chromedriver + 485760
4   chromedriver                        0x0000000104836514 chromedriver + 468244
5   chromedriver                        0x0000000104833ca0 chromedriver + 457888
6   chromedriver                        0x0000000104873954 chromedriver + 719188
7   chromedriver                        0x0000000104873120 chromedriver + 717088
8   chromedriver                        0x000000010483e178 chromedriver + 500088
9   chromedriver                        0x000000010483efc0 chromedriver + 503744
10  chromedriver                        0x0000000104ba2c40 chromedriver + 4058176
11  chromedriver                        0x0000000104ba7160 chromedriver + 4075872
12  chromedriver                        0x0000000104b6ae68 chromedriver + 3829352
13  chromedriver                        0x0000000104ba7c4c chromedriver + 4078668
14  chromedriver                        0x0000000104b7ff08 chromedriver + 3915528
15  chromedriver                        0x0000000104bc4140 chromedriver + 4194624
16  chromedriver                        0x0000000104bc42c4 chromedriver + 4195012
17  chromedriver                        0x0000000104bd44d0 chromedriver + 4261072
18  libsystem_pthread.dylib             0x00000001852e7fa8 _pthread_start + 148
19  libsystem_pthread.dylib             0x00000001852e2da0 thread_start + 8
