In [1]:
# import libraries
import os
import re
import requests
import tiktoken  # for counting tokens
import pandas as pd  # for storing text and embeddings data
import string  # for removing punctuation
import unicodedata  # for normalizing text
import urllib.request  # for downloading html
from bs4 import BeautifulSoup  # for parsing html
from collections import deque
from html.parser import HTMLParser
from openai import OpenAI  # for calling the OpenAI API
from scipy import spatial  # for calculating vector similarities for search
from tenacity import (
    retry,
    wait_random_exponential,
    stop_after_attempt,
)  # for retrying API calls
from urllib.parse import urlparse
from utils.embeddings_utils import cosine_similarity

# models
EMBEDDING_MODEL = "text-embedding-3-small"
GPT_MODEL = "gpt-4o"

In [2]:
# constants
domain = "app.bentobatch.com"
full_url = "https://app.bentobatch.com/case"
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Crawl Web Pages

Given a list of URLs, crawl each page and return the content of the page.


In [3]:
# Regex pattern to match a URL
HTTP_URL_PATTERN = r"^http[s]*://.+"


# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])


# Function to get the hyperlinks from a URL
def get_hyperlinks(url):

    user_agent = "Mozilla/5.0"
    headers = {"User-Agent": user_agent}

    # Try to open the URL and read the HTML
    try:
        request = urllib.request.Request(url=url, headers=headers)
        # Open the URL and read the HTML
        with urllib.request.urlopen(request) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get("Content-Type").startswith("text/html"):
                return []

            # Decode the HTML
            html = response.read().decode("utf-8")
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks


# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))


def get_content(soup):
    try:
        name = soup.find("h2", class_="chakra-text").text
        description = soup.find("div", class_="css-icjp7f").text
        data = soup.find("div", class_="css-4cxybv")
        tags = [
            span.get_text().lower()
            for span in data.find_all("span", class_="css-1ny2kle")
        ]
        return name, description, tags
    except:
        return None


def crawl(url):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create a directory to store the text files
    if not os.path.exists("text/"):
        os.mkdir("text/")

    if not os.path.exists("text/" + local_domain + "/"):
        os.mkdir("text/" + local_domain + "/")

    # Create a directory to store the csv files
    if not os.path.exists("processed"):
        os.mkdir("processed")

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Save text from the url to a <url>.txt file
        with open(
            "text/" + local_domain + "/" + url[8:].replace("/", "_") + ".txt", "w"
        ) as f:

            # Get the text from the URL using BeautifulSoup
            soup = BeautifulSoup(requests.get(url).text, "html.parser")

            # Get the content from the URL
            data = get_content(soup)

            # Write the data to the file
            if data is not None:
                f.write(str(data) + "\n" + str(url))

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

In [4]:
crawl(full_url)

https://app.bentobatch.com/case
https://app.bentobatch.com/case/yearn_v3_usdce
https://app.bentobatch.com/case/gamma_yearn
https://app.bentobatch.com/case/scroll_airdrop_hunting_professional
https://app.bentobatch.com/case/scroll_airdrop_hunting_with_penpad
https://app.bentobatch.com/case/yearn_withdrawal
https://app.bentobatch.com/case/aave_usdc_eth
https://app.bentobatch.com/dashboard
https://app.bentobatch.com/case/ether_fi_liquid
https://app.bentobatch.com/case/etf_ethereum_top10
https://app.bentobatch.com/case/scroll_airdrop_hunting_advanced
https://app.bentobatch.com/case/juice_points
https://app.bentobatch.com/case/zircuit_etherfi
https://app.bentobatch.com/case/zircuit_renzo
https://app.bentobatch.com/case/pendle_points
https://app.bentobatch.com/case/eigenpie_steth
https://app.bentobatch.com/case/aave_eth_eth_emode
https://app.bentobatch.com/case/aave_eth_eth
https://app.bentobatch.com/case/zora_famous_creators
https://app.bentobatch.com/case/ethena_usdt
https://app.bentobatch

# Data Cleaning


In [5]:
# Create a list to store the data records
data = []

# Get all the text files in the text directory
for file in os.listdir("text/" + domain + "/"):

    # Open the file and read the text
    with open("text/" + domain + "/" + file, "r") as f:
        text = f.read()
        if not text:
            continue
        data_text, url = text.split("\n")
        data_tuple = eval(data_text)

        tags = data_tuple[2]  # This is a list

        data.append(
            {
                "name": data_tuple[0],
                "description": data_tuple[1],
                "network": tags[0],
                "dapp": tags[1],
                "url": url,
                "is_defi": "defi" in tags,
                "is_point_system": "points" in tags,
                "is_official": "official x collab" in tags,
                "tags": tags,
            }
        )
df = pd.DataFrame(data)
df.to_csv("processed/data.csv")
df.head()

Unnamed: 0,name,description,network,dapp,url,is_defi,is_point_system,is_official,tags
0,Earn Kelp Miles + Renzo ezPoints + EigenLayer ...,Swap ETH to YT ezETH and YT rsETH on Pendle wi...,arbitrum one,pendle,https://app.bentobatch.com/case/pendle_points,True,False,False,"[arbitrum one, pendle, defi, staking, restaking]"
1,Grab the Most Popular NFT on Zora,Imagine.zora.eth's latest NFT package is here ...,zora,zora,https://app.bentobatch.com/case/zora_most_minted,False,False,False,"[zora, zora, nft]"
2,🔥 Earn extra 20% GenesisLRT restaking points a...,"Restake ETH on GenesisLRT and Lido. Plus, prov...",ethereum,genesis,https://app.bentobatch.com/case/genesis_liquidity,True,False,True,"[ethereum, genesis, official x collab, extra b..."
3,On-chain Interaction on Scroll : Rookie,Potential: ⭐⭐⭐ / Number of Contract Interacted...,scroll,scroll,https://app.bentobatch.com/case/scroll_airdrop...,False,False,False,"[scroll, scroll, dex, swap]"
4,Famous Creator Collections,Trending on Zora! Get Creator's retweeted coll...,zora,zora,https://app.bentobatch.com/case/zora_famous_cr...,False,False,False,"[zora, zora, nft]"


In [6]:
df['text'] = df.apply(
    lambda row: f"{row['name']}, description: {row['description']}, network: {row['network']}, {"is defi" if row['is_defi'] else ""}, {"is point system" if row['is_point_system'] else ""},{"is official" if row['is_official'] else ""},tags: {row['tags'], }" + ". " + row['url'],
    axis=1
)
text = df['text']
text.to_csv("processed/data.csv")

In [7]:
# Retry up to 6 times with exponential backoff, starting at 1 second and maxing out at 20 seconds delay
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model=EMBEDDING_MODEL):
    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")

    response = client.embeddings.create(input=[text], model=model)

    return response.data[0].embedding

In [8]:
df["embedding"] = df["text"].apply(lambda x: get_embedding(x))
df.to_csv("processed/embeddings.csv")
df.head()

Unnamed: 0,name,description,network,dapp,url,is_defi,is_point_system,is_official,tags,text,embedding
0,Earn Kelp Miles + Renzo ezPoints + EigenLayer ...,Swap ETH to YT ezETH and YT rsETH on Pendle wi...,arbitrum one,pendle,https://app.bentobatch.com/case/pendle_points,True,False,False,"[arbitrum one, pendle, defi, staking, restaking]",Earn Kelp Miles + Renzo ezPoints + EigenLayer ...,"[0.0023058936931192875, 0.019326893612742424, ..."
1,Grab the Most Popular NFT on Zora,Imagine.zora.eth's latest NFT package is here ...,zora,zora,https://app.bentobatch.com/case/zora_most_minted,False,False,False,"[zora, zora, nft]","Grab the Most Popular NFT on Zora, description...","[0.033888764679431915, 0.019841885194182396, -..."
2,🔥 Earn extra 20% GenesisLRT restaking points a...,"Restake ETH on GenesisLRT and Lido. Plus, prov...",ethereum,genesis,https://app.bentobatch.com/case/genesis_liquidity,True,False,True,"[ethereum, genesis, official x collab, extra b...",🔥 Earn extra 20% GenesisLRT restaking points a...,"[0.004541878588497639, -0.008230915293097496, ..."
3,On-chain Interaction on Scroll : Rookie,Potential: ⭐⭐⭐ / Number of Contract Interacted...,scroll,scroll,https://app.bentobatch.com/case/scroll_airdrop...,False,False,False,"[scroll, scroll, dex, swap]","On-chain Interaction on Scroll : Rookie, descr...","[0.001451416639611125, 0.00879091490060091, -0..."
4,Famous Creator Collections,Trending on Zora! Get Creator's retweeted coll...,zora,zora,https://app.bentobatch.com/case/zora_famous_cr...,False,False,False,"[zora, zora, nft]","Famous Creator Collections, description: Trend...","[0.0646241158246994, 0.0008160494035109878, -0..."


In [9]:
def search_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: cosine_similarity(x, y),
    top_n: int = 100,
):
    # Get the embedding for the query
    query_embedding = get_embedding(query)

    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]

    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

In [10]:
# examples
# strings, relatednesses = search_functions("AAVE related", df, top_n=5)
# for string, relatedness in zip(strings, relatednesses):
#     print(f"{relatedness=:.3f}")
#     display(string)

# Ask


In [11]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


def query_message(query: str, df: pd.DataFrame, model: str, token_budget: int) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, relatedness = search_relatedness(query, df)
    introduction = 'Use the cases on the BentoBatch to answer the subsequent question. If the answer cannot be found in the data, write "I don\'t know."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        data = f'\n\nData:\n"""\n{string}\n"""'
        if num_tokens(message + data + question, model=model) > token_budget:
            break
        else:
            message += data
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {
            "role": "system",
            "content": "You answer questions about the cases on BentoBatch.",
        },
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=model, messages=messages, temperature=0
    )
    response_message = response.choices[0].message.content
    return response_message

### Examples


In [12]:
ask("What cases are related to AAVE?")

'The cases related to AAVE are:\n\n1. [Deposit ETH and borrow ETH on AAVE [Normal-Mode]](https://app.bentobatch.com/case/aave_eth_eth): One click to deposit ETH and borrow 70% equivalent value in ETH.\n2. [Deposit ETH and borrow ETH on AAVE [E-Mode]](https://app.bentobatch.com/case/aave_eth_eth_emode): One click to deposit ETH and borrow 85% equivalent value in ETH.\n3. [Deposit USDC to borrow ETH on AAVE](https://app.bentobatch.com/case/aave_usdc_eth): One click to deposit USDC and borrow 70% equivalent value in ETH.\n4. [Deposit USDT to borrow ETH on AAVE](https://app.bentobatch.com/case/aave_usdt_eth): One click to deposit USDT and borrow 70% equivalent value in ETH.'

In [13]:
ask("What cases are official collaborations?")

'The following cases are official collaborations:\n\n1. 🔥 Earn extra Ether.Fi points and Bridge\n   - https://app.bentobatch.com/case/ether_fi_arbitrum_bridge\n\n2. Restake with Eigenpie & Swell & Zircuit\n   - https://app.bentobatch.com/case/zircuit_eigenpie_swell\n\n3. 🔥 Earn extra 10% Penpad Points and share up to 10000 $BLT prize pool\n   - https://app.bentobatch.com/case/scroll_airdrop_hunting_with_penpad\n\n4. 🔥 Earn extra 15% Zircuit Points with Ethena Restaking by USDC\n   - https://app.bentobatch.com/case/zircuit_ethena_usdc\n\n5. 🔥 Earn extra 20% GenesisLRT restaking points and get x15 Gems boost\n   - https://app.bentobatch.com/case/genesis_liquidity\n\n6. 🔥 Earn extra 15% Zircuit Points with Ethena Restaking by USDT\n   - https://app.bentobatch.com/case/zircuit_ethena_usdt\n\n7. 🔥 Earn extra 15% Zircuit Points with Renzo Restaking by ETH\n   - https://app.bentobatch.com/case/zircuit_renzo\n\n8. 🔥 Earn extra 15% Zircuit Points with Swell Restaking\n   - https://app.bentobatc

In [14]:
ask("I have a plenty of usdc, what can I do with it?")

'You have several options for using your USDC based on the cases on BentoBatch:\n\n1. **Deposit USDC to borrow ETH on AAVE**: You can deposit USDC and borrow 70% equivalent value in ETH on the Ethereum network.\n   - [Link to case](https://app.bentobatch.com/case/aave_usdc_eth)\n\n2. **Earn extra 15% Zircuit Points with Ethena Restaking by USDC**: You can swap USDC to USDe and deposit USDe on Zircuit in one click.\n   - [Link to case](https://app.bentobatch.com/case/zircuit_ethena_usdc)\n\n3. **Yield farming on Yearn with USDC.e**: You can deposit USDC.e to Yearn finance’s V3 Vault on the Polygon network to auto compound and yield high APY%.\n   - [Link to case](https://app.bentobatch.com/case/yearn_v3_usdce)\n\n4. **Yield Farming on Yearn with Gamma USDC.e/WETH LP**: You can provide USDC.e/WETH LP on Gamma and reinvest it on Yearn V3 vaults on the Polygon network.\n   - [Link to case](https://app.bentobatch.com/case/gamma_yearn)\n\nThese options allow you to either borrow ETH, earn ex

In [15]:
ask("What is the point system used for?")

'The point system is used to earn various types of points, such as Zircuit points, EtherFi loyalty points, EigenLayer Points, Kelp Miles, Renzo ezPoints, Blast Points, Blast Gold, $ORBIT, Juice Finance Points, Thruster Points, Penpad Points, GenesisLRT restaking points, and others, by engaging in different DeFi activities like staking, restaking, swapping, lending, and providing liquidity on various platforms and networks.'