# Environment


In [1]:
# import libraries
import os
import pandas as pd  # for storing text and embeddings data
from openai import OpenAI  # for calling the OpenAI API
from openai.types.chat import ChatCompletionMessageParam
from tenacity import (
    retry,
    wait_random_exponential,
    stop_after_attempt,
)  # for retrying API calls
import tiktoken  # for counting tokens
from utils.embeddings_utils import cosine_similarity
from utils.data_utils import remove_newlines, extract_preview_tx

# models
# Default embedding model for get_embedding().
EMBEDDING_MODEL = "text-embedding-3-large"
# Default chat model for ask(); num_tokens() also uses it to pick a tokenizer.
GPT_MODEL = "gpt-3.5-turbo"

In [2]:
# Shared OpenAI client used by the embedding and chat calls below. The API key
# is read from the OPENAI_API_KEY environment variable; indexing os.environ
# raises KeyError in this cell if the variable is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Data Cleaning


In [3]:
import re


def _read_text_or_empty(path):
    """Return the UTF-8 decoded contents of `path`, or "" when nothing exists there."""
    if not os.path.exists(path):
        return ""
    # Explicit encoding: open() otherwise falls back to the locale's preferred
    # encoding, which makes the result machine-dependent for non-ASCII sources.
    with open(path, "r", encoding="utf-8") as source_file:
        return source_file.read()


def parse_directory(base_dir):
    """
    Walk `base_dir` and build one record per directory named `<network>_<case>`.

    A directory qualifies when its name matches
    ``[a-zA-Z0-9]+_[a-zA-Z0-9\\-]+$`` from the start of the name; anything else
    (e.g. ``___prebuilt-tx``) is skipped. For each qualifying directory the
    files ``index.ts`` and ``index.details.ts`` are read if present; a missing
    file contributes an empty string.

    Args:
        base_dir: Root directory to search (searched recursively via os.walk).

    Returns:
        A list of dicts with the keys:
          - "case": the case part of the directory name with "-" replaced by "_"
          - "network": the part of the directory name before the underscore
          - "code": contents of index.ts ("" if absent)
          - "text": case, network, index.ts and index.details.ts contents
            joined by "===" markers (the case keeps its original hyphens here)
        Records are ordered by sorted directory name within each parent
        directory, so repeated runs over the same tree give the same order.

    Raises:
        UnicodeDecodeError: if one of the files is not valid UTF-8.
    """
    data = []
    pattern = re.compile(r"[a-zA-Z0-9]+_[a-zA-Z0-9\-]+$")

    for root, dirs, files in os.walk(base_dir):
        # os.walk yields directory names in arbitrary order; sorting the list
        # in place fixes both the traversal order and the order of the records.
        dirs.sort()
        for dir_name in dirs:
            if not pattern.match(dir_name):
                continue  # Skip directories not matching the pattern. e.g. ___prebuilt-tx
            print(f"Processing {dir_name}")
            dir_path = os.path.join(root, dir_name)
            # The pattern admits exactly one underscore, so this always yields two parts.
            network, case = dir_name.split("_")

            index_data = _read_text_or_empty(os.path.join(dir_path, "index.ts"))
            details_data = _read_text_or_empty(
                os.path.join(dir_path, "index.details.ts")
            )

            data.append(
                {
                    "case": case.replace("-", "_"),
                    "network": network,
                    "code": index_data,
                    "text": f"""
                            {case} === 
                            {network} === 
                            {index_data} ===
                            {details_data} ===
                        """,
                }
            )

    return data


# Parse every case directory under "case/" into a DataFrame and save it.
directory_path = "case/"
data = parse_directory(directory_path)
df = pd.DataFrame(data)
# The output folder has to exist before the first write; at this point in a
# fresh top-to-bottom run nothing has created it yet.
os.makedirs("processed", exist_ok=True)
df.to_csv("processed/case.csv")
print(f"Data shape: {df.shape}")

Processing polygon_multi-sender
Extracted previewTx code:

    {
      name: 'Transfer',
      description: 'USDT to recipient',
      to: tokenConfig.addr,
    },
    {
      name: 'Transfer',
      description: 'USDT to recipient',
      to: tokenConfig.addr,
    },
    {
      name: 'Transfer',
      description: 'USDT to recipient',
      to: tokenConfig.addr,
    },
  
Processing ethereum_zircuit-eigenpie-swell
Extracted previewTx code:

    {
      name: 'Stake',
      description: `ETH and receive swETH on Swell`,
      to: swETHProxyAddr,
      meta: {
        highlights: ['Swell'
Processing ethereum_genesis-liquidity
Extracted previewTx code:

    {
      name: 'Stake',
      description: 'ETH and receive genETH on Genesis',
      to: genesisRestakingPoolAddr,
      meta: {
        highlights: ['Genesis'
Processing polygon_yearn-v3-usdt
Extracted previewTx code:

      {
        name: "Approve",
        description: `${tokenName} to Yearn V3`,
        to: tokenContract,
      

In [None]:
# remove newlines
df["text"] = remove_newlines(df["text"])
# exist_ok=True makes this a no-op when the folder is already there, without a
# separate existence check that could race with another process creating it.
os.makedirs("processed/", exist_ok=True)
df.to_csv("processed/data.csv")
df.head()

# Create embeddings


In [None]:
# Give up after 6 attempts; between attempts wait with randomized exponential
# backoff configured with min=1 and max=20 seconds.
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def get_embedding(text: str, model=EMBEDDING_MODEL) -> list:
    """
    Request an embedding for a single string from the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Embedding model name (defaults to EMBEDDING_MODEL).

    Returns:
        The `embedding` attribute of the first item in the response data.
    """
    embeddings_response = client.embeddings.create(model=model, input=[text])
    first_item = embeddings_response.data[0]
    return first_item.embedding

In [None]:
# Embed each row's text with get_embedding() (called once per row), store the
# vectors in a new column, then write the frame to disk.
text_embeddings = df["text"].apply(get_embedding)
df["text_embedding"] = text_embeddings
df.to_csv("processed/embeddings.csv")

In [None]:
def search_case(df, query, top_n=3, n_lines=10):
    """
    Rank the rows of `df` by similarity to `query` and return the best matches.

    The query is embedded with get_embedding() and compared, via
    cosine_similarity(), against each row's "text_embedding" value.

    Args:
        df: DataFrame providing the "case", "network", "code" and
            "text_embedding" columns.
        query: Free-text search string.
        top_n: How many of the highest-scoring rows to keep (applied as a
            slice, so 0 yields an empty list).
        n_lines: Not used by this function; kept so existing calls that pass
            it continue to work.

    Returns:
        A list of (score, case, network, code) tuples ordered from highest to
        lowest score. The list is empty when `df` has no rows or `top_n` is 0.
    """
    query_embedding = get_embedding(query)

    # Build the tuples directly in their final (score, case, network, code)
    # layout. Unzipping and re-zipping the columns would fail to unpack when
    # there are no matches at all.
    scored_rows = [
        (
            cosine_similarity(query_embedding, row["text_embedding"]),
            row["case"],
            row["network"],
            row["code"],
        )
        for _, row in df.iterrows()
    ]
    # Sort on the score only; list.sort is stable, so rows with equal scores
    # stay in dataframe order.
    scored_rows.sort(key=lambda scored: scored[0], reverse=True)
    return scored_rows[:top_n]

In [None]:
# Example: show the three best matches for a sample query
result = search_case(df, "Deposit USDC to borrow ETH on AAVE", top_n=3)
separator = "-" * 70
for score, case, network, code in result:
    report_lines = [
        f"Score: {score.round(4)}",
        f"Case: {case}",
        f"Network: {network}",
        f"Code: {code[:20]}...",  # only the first 20 characters of the code
        separator,
    ]
    print("\n".join(report_lines))

# Ask


In [None]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """
    Count how many tokens `text` encodes to, using the tiktoken encoding
    that tiktoken associates with `model`.
    """
    tokens = tiktoken.encoding_for_model(model).encode(text)
    return len(tokens)


def query_message(query: str, df: pd.DataFrame, model: str, token_budget: int) -> str:
    """
    Build the prompt for GPT: an introduction, code snippets, then the question.

    Snippets come from search_case(df, query) in ranked order and are appended
    one at a time; the first snippet that would push the whole prompt
    (introduction + snippets so far + that snippet + question) past
    `token_budget` stops the loop, and it and all later snippets are left out.

    Args:
        query: The user's question; also used as the search string.
        df: DataFrame handed to search_case().
        model: Model name passed to num_tokens() for token counting.
        token_budget: Maximum token count allowed for the assembled prompt.

    Returns:
        The assembled prompt. If the search returns nothing, or no snippet
        fits, this is just the introduction followed by the question.
    """
    # search_case yields (score, case, network, code) tuples and only the code
    # is needed here. Taking it per tuple also works for an empty result,
    # which a four-way unpack of zip(*result) would reject.
    codes = [code for _score, _case, _network, code in search_case(df, query)]
    introduction = "You are an assistant of BentoBatch. You are asked to provide a piece of code by querying the case."
    question = f"\n\nQuestion: {query}"
    message = introduction
    for code in codes:
        data = f'\n\nCode:\n"""\n{code}\n"""'
        # The question is always appended at the end, so it counts toward the budget.
        if num_tokens(message + data + question, model=model) > token_budget:
            break
        message += data
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
):
    """
    Answer `query` with a chat completion grounded in code found in `df`.

    Args:
        query: The user's question.
        df: DataFrame passed through to query_message(). The default is the
            module-level `df` as it was bound when this function was defined.
        model: Chat model name; also forwarded to query_message().
        token_budget: Token limit forwarded to query_message() for the prompt.
        print_message: When True, print the assembled prompt before sending it.

    Returns:
        The message content of the first choice in the completion response.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)

    system_turn: ChatCompletionMessageParam = {
        "role": "system",
        "content": "You show the code of the case to the user.",
    }
    user_turn: ChatCompletionMessageParam = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        messages=[system_turn, user_turn], model=model, temperature=0
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Example query; ask() returns the model's reply text, which the notebook
# shows as this cell's output because the call is the last expression.
ask("Deposit USDC to borrow ETH on AAVE")