In [None]:
import os
import csv
import json
import pandas as pd
from openai import OpenAI
from utils.embeddings_utils import (
    cosine_similarity,
    get_embedding,
)

# Embedding model name; passed to get_embedding() both when embedding the
# protocol rows and when embedding search queries, so the two stay comparable.
EMBEDDING_MODEL = "text-embedding-3-small"
# Alternative model name (currently disabled):
# EMBEDDING_MODEL = "text-embedding-ada-002"

# Raises KeyError immediately if OPENAI_API_KEY is missing from the environment.
# NOTE(review): `client` is not referenced by any cell visible here - confirm it is still needed.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
def json_to_csv(json_file_path, csv_file_path):
    """
    Convert a JSON array of flat objects into a CSV file with a "text" column.

    The CSV header is the union of the record keys (in first-seen order)
    followed by "text". Every row is written in header order, so records
    whose keys come in a different order, or that lack some keys, still
    line up with the header; missing keys become empty cells. The "text"
    cell is the record's present values joined with ", ".

    Args:
        json_file_path: Path of the UTF-8 JSON file holding a list of objects.
        csv_file_path: Destination CSV path. Missing parent directories
            are created.

    Raises:
        ValueError: If the JSON document is not a list of objects.
    """
    with open(json_file_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
        raise ValueError(f"{json_file_path} must contain a JSON array of objects")

    # dict.fromkeys keeps first-seen order while dropping duplicate keys.
    fieldnames = list(dict.fromkeys(key for item in data for key in item))

    # Opening the output for writing fails if its directory does not exist yet.
    parent_dir = os.path.dirname(csv_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(csv_file_path, "w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file)

        # Header row; an empty input still yields a readable CSV with only "text".
        csv_writer.writerow(fieldnames + ["text"])

        for item in data:
            # Build both the cells and the text in header order, not in the
            # record's own key order.
            text = ", ".join(str(item[key]) for key in fieldnames if key in item)
            row = [item.get(key, "") for key in fieldnames] + [text]
            csv_writer.writerow(row)


# Flatten the protocol JSON into a CSV, then load that CSV as the working frame.
protocol_csv_path = "processed/embeddings/protocol.csv"
json_to_csv("data/protocol.json", protocol_csv_path)
df = pd.read_csv(protocol_csv_path)

In [None]:
def remove_newlines(serie):
    """
    Replace line breaks in a pandas string Series with single spaces.

    Two forms of line break are handled: the real newline character, and
    the literal two-character sequence of a backslash followed by "n"
    (as left behind by escaped text). Afterwards every run of two or more
    spaces is collapsed to one space.

    Args:
        serie: pandas Series of strings (missing values are left as-is
            by the .str accessor).

    Returns:
        A new Series with the cleaned strings.
    """
    # regex=False pins literal matching; the pandas default for `regex`
    # differs between 1.x and 2.x, which changes what "\\n" matches.
    serie = serie.str.replace("\n", " ", regex=False)
    serie = serie.str.replace("\\n", " ", regex=False)
    # A single regex pass collapses runs of any length; repeated
    # "two spaces -> one" passes leave longer runs only partly collapsed.
    serie = serie.str.replace(r" {2,}", " ", regex=True)
    return serie


# Normalise line breaks and spacing in the text column.
cleaned_text = remove_newlines(df["text"])
df["text"] = cleaned_text
# Checkpoint the cleaned frame; the DataFrame index is written as the first column.
df.to_csv("processed/embeddings/protocol.csv")

In [None]:
def _embed_text(text):
    """Return the embedding of one text value using EMBEDDING_MODEL."""
    return get_embedding(text=text, model=EMBEDDING_MODEL)


# get_embedding is called once per row of the text column.
df["embedding"] = df["text"].apply(_embed_text)

# Overwrite the CSV so it now also carries the embedding column.
df.to_csv("processed/embeddings/protocol.csv")
df.head()

## Find relevant embeddings


In [None]:
def search(df, query, top_n=3):
    """
    Rank the rows of `df` by cosine similarity to `query`.

    Args:
        df: DataFrame with "embedding", "id" and "address" columns.
        query: Free-text query; embedded with EMBEDDING_MODEL.
        top_n: Maximum number of hits to return.

    Returns:
        List of (score, id, address) tuples, highest score first,
        holding at most `top_n` entries.
    """
    query_embedding = get_embedding(text=query, model=EMBEDDING_MODEL)

    # Score every row against the query embedding.
    scored_rows = []
    for _, row in df.iterrows():
        similarity = cosine_similarity(query_embedding, row["embedding"])
        scored_rows.append((similarity, row["id"], row["address"]))

    # Best match first; only the score takes part in the ordering.
    ranked = sorted(scored_rows, key=lambda entry: entry[0], reverse=True)
    return ranked[:top_n]


def print_search_results_pretty(result):
    """
    Print search hits as a human-readable list.

    Each hit is printed as three labelled lines (score rounded to four
    decimal places, id, contract address) followed by a 70-character
    dashed separator.

    Args:
        result: Iterable of (score, id, address) tuples, such as the list
            returned by search(). Scores may be NumPy scalars or plain
            Python numbers.
    """
    for score, protocol_id, address in result:
        # round(float(...)) works for NumPy scalars and built-in floats alike;
        # built-in floats have no .round() method.
        print(f"Score: {round(float(score), 4)}")
        print(f"ID: {protocol_id}")
        print(f"Contract Address: {address}")
        print("-" * 70)  # separator

#### Examples


In [None]:
# Example query "usdc, yearn" with the default number of hits.
result = search(df, query="usdc, yearn")
print_search_results_pretty(result)

In [None]:
# Example query "usdt, yearn" with the default number of hits.
result = search(df, query="usdt, yearn")
print_search_results_pretty(result)

In [None]:
# Example query "dai, yearn v3" with the default number of hits.
result = search(df, query="dai, yearn v3")
print_search_results_pretty(result)

In [None]:
# Example query "dai, ethereum" with the default number of hits.
result = search(df, query="dai, ethereum")
print_search_results_pretty(result)

In [1]:
# Run a lookup through the packaged ProtocolSearcher helper.
from utils.protocol_searcher import ProtocolSearcher

searcher = ProtocolSearcher()
# NOTE(review): the second argument looks like a chain/network filter - confirm against ProtocolSearcher.
result = searcher.search_protocol("usdc, yearn", "polygon")
# The recorded output is a dict with a "suggested" match and a list of "other_options".
print(result)

{'suggested': {'score': 0.7062171595431709, 'id': 'yearn_polygon_usdc_vault', 'address': '0xA013Fbd4b711f9ded6fB09C1c0d358E2FbC2EAA0'}, 'other_options': [{'score': 0.623891719812112, 'id': 'yearn_polygon_usdt_vault', 'address': '0x84E13785B5a27879921D6Ff77f8f6687E9d5b2f7'}, {'score': 0.6059552624183242, 'id': 'yearn_polygon_dai_vault', 'address': '0x90b2f54C6aDDAD41b8f6c4fCCd555197BC0F773B'}, {'score': 0.5750262632156996, 'id': 'yearn_polygon_weth_vault', 'address': '0x305F25377d0a39091e99B975558b1bdfC3975654'}]}
