In [1]:
# import libraries
import os
import re
import requests
import tiktoken  # for counting tokens
import pandas as pd  # for storing text and embeddings data
import urllib.request  # for downloading html
from bs4 import BeautifulSoup  # for parsing html
from collections import deque
from html.parser import HTMLParser
from openai import OpenAI  # for calling the OpenAI API
from tenacity import (
    retry,
    wait_random_exponential,
    stop_after_attempt,
)  # for retrying API calls
from urllib.parse import urlparse
from utils.embeddings_utils import cosine_similarity

# models
EMBEDDING_MODEL = "text-embedding-3-small"
GPT_MODEL = "gpt-4o"

In [2]:
# constants
domain = "app.bentobatch.com"
full_url = "https://app.bentobatch.com/"
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Crawl Web Pages

Given a list of URLs, crawl each page and return the content of the page.


In [3]:
# Regex pattern to match a URL
HTTP_URL_PATTERN = r"^http[s]*://.+"


# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])


# Function to get the hyperlinks from a URL
def get_hyperlinks(url):

    user_agent = "Mozilla/5.0"
    headers = {"User-Agent": user_agent}

    # Try to open the URL and read the HTML
    try:
        request = urllib.request.Request(url=url, headers=headers)
        # Open the URL and read the HTML
        with urllib.request.urlopen(request) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get("Content-Type").startswith("text/html"):
                return []

            # Decode the HTML
            html = response.read().decode("utf-8")
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks


# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))


def crawl(url):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create a directory to store the text files
    if not os.path.exists("text/"):
        os.mkdir("text/")

    if not os.path.exists("text/" + local_domain + "/"):
        os.mkdir("text/" + local_domain + "/")

    # Create a directory to store the csv files
    if not os.path.exists("processed"):
        os.mkdir("processed")

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Save text from the url to a <url>.txt file
        with open(
            "text/" + local_domain + "/" + url[8:].replace("/", "_") + ".txt", "w"
        ) as f:

            # Get the text from the URL using BeautifulSoup
            soup = BeautifulSoup(requests.get(url).text, "html.parser")

            # Get the content from the URL
            text = soup.get_text()

            # Write the data to the file
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

In [None]:
crawl(full_url)

# Data Cleaning


In [4]:
def remove_newlines(serie):
    """Remove newlines from a pandas series for better processing."""
    serie = serie.str.replace("\n", " ")
    serie = serie.str.replace("\\n", " ")
    serie = serie.str.replace("  ", " ")
    serie = serie.str.replace("  ", " ")
    return serie

In [5]:
# Create a list to store the data records
data = []

# Get all the text files in the text directory
for file in os.listdir("text/" + domain + "/"):

    # Open the file and read the text
    with open("text/" + domain + "/" + file, "r") as f:
        text = f.read()

        # Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces.
        data.append(
            (
                file[len(domain) + 6 : -4]
                .replace("-", " ")
                .replace("_", " ")
                .replace("#update", ""),
                text,
            )
        )
# Create a dataframe from the list of texts
df = pd.DataFrame(data, columns=["fname", "text"])
df["text"] = df.fname + ". " + remove_newlines(df.text)

if not os.path.exists(f"processed/{domain}/"):
    os.mkdir(f"processed/{domain}/")
df.to_csv(f"processed/{domain}/data.csv")

In [6]:
# Retry up to 6 times with exponential backoff, starting at 1 second and maxing out at 20 seconds delay
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model=EMBEDDING_MODEL):
    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")

    response = client.embeddings.create(input=[text], model=model)

    return response.data[0].embedding

In [7]:
df["embedding"] = df["text"].apply(lambda x: get_embedding(x))
if not os.path.exists(f"processed/{domain}/"):
    os.mkdir(f"processed/{domain}/")
df.to_csv(f"processed/{domain}/embeddings.csv")
df.head()

Unnamed: 0,fname,text,embedding
0,pendle points,pendle points. Bento Batch | One Click. All se...,"[-0.0006369592738337815, -0.007568110711872578..."
1,zora most minted,zora most minted. Bento Batch | One Click. All...,"[-0.025355955585837364, -0.010841479524970055,..."
2,genesis liquidity,genesis liquidity. Bento Batch | One Click. Al...,"[-0.0002950801863335073, 0.00394883519038558, ..."
3,scroll airdrop hunting rookie,scroll airdrop hunting rookie. Bento Batch | O...,"[-0.020456600934267044, -0.012506709434092045,..."
4,zora famous creators,zora famous creators. Bento Batch | One Click....,"[-0.020248455926775932, -0.013573813252151012,..."


In [8]:
def search_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: cosine_similarity(x, y),
    top_n: int = 100,
):
    # Get the embedding for the query
    query_embedding = get_embedding(query)

    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]

    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

In [10]:
# examples
strings, relatednesses = search_relatedness("AAVE related", df, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    display(string)

relatedness=0.784


"aave eth eth. Bento Batch | One Click. All set.Trading StrategiesDashboardConnect WalletTrading StrategiesDashboard© 2024 Bento BatchLeave us a feedbackTrading StrategiesBento Batch 🍱Deposit ETH and borrow ETH on AAVE [Normal-Mode]EthereumAAVEDefiLendingOne click to deposit ETH and borrow 70% equivalent value in ETH.Show more Batch details01SupplytoascollateralInteract with 0x893411...A8c7b2b902DisableE-ModeifitisenabledInteract with 0x87870B...0B4fA4E203DelegateborrowingpoweronthedebtWETHifneededInteract with 0xeA51d7...88Be0fFE04BorrowfromInteract with 0x893411...A8c7b2b9ETH AmountSupply to AAVE as collateralBalance MaxBorrow PercentagePercentage of the supplied ETH to be borrowed as ETH (Max: 70%)Refer to get 10% of your friend's BENTO BOXRefer to get 10% of your friend's BENTO BOX© 2024 Bento BatchLeave us a feedback"

relatedness=0.782


"aave eth eth emode. Bento Batch | One Click. All set.Trading StrategiesDashboardConnect WalletTrading StrategiesDashboard© 2024 Bento BatchLeave us a feedbackTrading StrategiesBento Batch 🍱Deposit ETH and borrow ETH on AAVE [E-Mode]EthereumAAVEDefiLendingOne click to deposit ETH and borrow 85% equivalent value in ETH.Show more Batch details01SupplytoascollateralInteract with 0x893411...A8c7b2b902EnableE-ModeifitisdisabledInteract with 0x87870B...0B4fA4E203DelegateborrowingpoweronthedebtWETHifneededInteract with 0xeA51d7...88Be0fFE04BorrowfromInteract with 0x893411...A8c7b2b9ETH AmountSupply to AAVE as collateralBalance MaxBorrow PercentagePercentage of the supplied ETH to be borrowed as ETH (Max: 85%)Refer to get 10% of your friend's BENTO BOXRefer to get 10% of your friend's BENTO BOX© 2024 Bento BatchLeave us a feedback"

relatedness=0.781


"aave usdt eth. Bento Batch | One Click. All set.Trading StrategiesDashboardConnect WalletTrading StrategiesDashboard© 2024 Bento BatchLeave us a feedbackTrading StrategiesBento Batch 🍱Deposit USDT to borrow ETH on AAVEEthereumAAVEDefiLendingOne click to deposit USDT and borrow 70% equivalent value in ETH.Show more Batch details01ApprovetoaccessInteract with 0xdAC17F...3D831ec702SupplytoascollateralInteract with 0x87870B...0B4fA4E203DisableE-ModeifitisenabledInteract with 0x87870B...0B4fA4E204DelegateborrowingpoweronthedebtWETHifneededInteract with 0xeA51d7...88Be0fFE05BorrowfromInteract with 0x893411...A8c7b2b9USDT AmountSupply to AAVE as collateralBalance MaxBorrow PercentagePercentage of the supplied USDT to be borrowed as ETH (Max: 70%)Refer to get 10% of your friend's BENTO BOXRefer to get 10% of your friend's BENTO BOX© 2024 Bento BatchLeave us a feedback"

relatedness=0.778


"aave usdc eth. Bento Batch | One Click. All set.Trading StrategiesDashboardConnect WalletTrading StrategiesDashboard© 2024 Bento BatchLeave us a feedbackTrading StrategiesBento Batch 🍱Deposit USDC to borrow ETH on AAVEEthereumAAVEDefiLendingOne click to deposit USDC and borrow 70% equivalent value in ETH.Show more Batch details01ApprovetoaccessInteract with 0xA0b869...3606eB4802SupplytoascollateralInteract with 0x87870B...0B4fA4E203DisableE-ModeifitisenabledInteract with 0x87870B...0B4fA4E204DelegateborrowingpoweronthedebtWETHifneededInteract with 0xeA51d7...88Be0fFE05BorrowfromInteract with 0x893411...A8c7b2b9USDC AmountSupply to AAVE as collateralBalance MaxBorrow PercentagePercentage of the supplied USDC to be borrowed as ETH (Max: 70%)Refer to get 10% of your friend's BENTO BOXRefer to get 10% of your friend's BENTO BOX© 2024 Bento BatchLeave us a feedback"

relatedness=0.766


'. '

# Ask


In [11]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


def query_message(query: str, df: pd.DataFrame, model: str, token_budget: int) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, relatedness = search_relatedness(query, df)
    introduction = 'You are the customer service of BentoBatch. Use the provided data to answer the subsequent question. If the answer cannot be found in the data, simply state "I do not have that information."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        data = f'\n\nData:\n"""\n{string}\n"""'
        if num_tokens(message + data + question, model=model) > token_budget:
            break
        else:
            message += data
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {
            "role": "system",
            "content": "You answer questions regarding BentoBatch.",
        },
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=model, messages=messages, temperature=0
    )
    response_message = response.choices[0].message.content
    return response_message

### Examples


In [12]:
ask("What is BentoBatch?")

'Bento Batch is a Streamlined Transaction Layer (STL) that leverages Account Abstraction techniques to upscale blockchain efficiency. It provides an easier, cheaper, and safer on-chain experience by consolidating multiple on-chain interactions into a single transaction, eliminating the need for multiple signs and confirmations.'

In [13]:
ask("What is Bento Box?")

'I do not have that information.'

In [14]:
ask("Why do I need BentoBatch if I can interact with the dApps directly?")

"BentoBatch offers several advantages over interacting with dApps directly:\n\n1. **Streamlined Transactions**: With BentoBatch's Streamlined Transaction Layer (STL), you can consolidate multiple on-chain interactions into a single transaction. This eliminates the need for multiple signs and confirmations, making the process more efficient and user-friendly.\n\n2. **Efficiency and Cost Savings**: By bundling transactions, BentoBatch can help reduce gas fees and save time. This is particularly beneficial when dealing with high gas fees or complex interactions that would otherwise require multiple steps.\n\n3. **Enhanced User Experience**: Users do not need to grasp every transaction detail or sign each transaction individually. Instead, they simply define their objectives, and the STL facilitates their needs efficiently.\n\n4. **Rewards and Incentives**: Executing Batches can earn you Bento Boxes, which come with additional rewards. You can also refer friends and receive an additional 1

In [15]:
ask("What cases are related to AAVE?")

'The cases related to AAVE are:\n\n1. **Deposit USDC to borrow ETH on AAVE**\n   - Deposit USDC and borrow 70% equivalent value in ETH.\n\n2. **Deposit ETH and borrow ETH on AAVE [Normal-Mode]**\n   - Deposit ETH and borrow 70% equivalent value in ETH.\n\n3. **Deposit ETH and borrow ETH on AAVE [E-Mode]**\n   - Deposit ETH and borrow 85% equivalent value in ETH.\n\n4. **Deposit USDT to borrow ETH on AAVE**\n   - Deposit USDT and borrow 70% equivalent value in ETH.'

In [16]:
ask("What cases are official collaborations?")

'I do not have that information.'

In [17]:
ask("I have a plenty of usdc, what can I do with it?")

'You can use your USDC in the following ways with BentoBatch:\n\n1. **Deposit USDC to borrow ETH on AAVE**:\n   - One click to deposit USDC and borrow 70% equivalent value in ETH.\n   - Steps involved:\n     1. Approve to access.\n     2. Supply to AAVE as collateral.\n     3. Disable E-Mode if it is enabled.\n     4. Delegate borrowing power on the debt WETH if needed.\n     5. Borrow from AAVE.\n\n2. **Earn extra 15% Zircuit Points with Ethena Restaking by USDC**:\n   - Swap USDC to USDe and deposit USDe on Zircuit in one click.\n   - Steps involved:\n     1. Approve for Curve.\n     2. Swap to USDe.\n     3. Approve USDe for Zircuit.\n     4. Stake USDe to Zircuit.\n\n3. **Yield farming on Yearn with USDC.e**:\n   - Deposit USDC.e to Yearn finance’s V3 Vault. Auto compound and yield high APY% with USDC.e.\n   - Steps involved:\n     1. Approve USDC.e to Yearn V3.\n     2. Deposit USDC.e to Yearn V3.\n\nThese options allow you to leverage your USDC for borrowing, staking, and yield f

In [18]:
ask("Is it safe to use BentoBatch?")

'I do not have that information.'

In [19]:
ask("How do you think this project should develop its tokenomics?")

'I do not have that information.'