In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import pickle

# Extracts details from the metadata associated with a swap ID.
def getSwapDetails(swap_metadata):
    """Return the chain, DEX and token symbols from the first row of `swap_metadata`.

    `swap_metadata` must support column access by name followed by `.iloc[0]`
    (e.g. a pandas DataFrame) and provide the columns `chain`, `dex`,
    `token_in_symbol` and `token_out_symbol`.

    Returns a tuple `(chain, dex, token_in, token_out)`.
    """
    columns = ("chain", "dex", "token_in_symbol", "token_out_symbol")
    # Only the first row is consulted; any further rows are ignored.
    return tuple(swap_metadata[column].iloc[0] for column in columns)


# Expands a list of swap IDs into a list of dictionaries with the metadata associated
#   with each swap ID.
def swapsIDsToPath(swap_ids):
    """Describe each swap of a path as a dictionary of its metadata.

    For every non-zero ID in `swap_ids`, the matching rows of the module-level
    `opps_metadata_df` are looked up and summarized via `getSwapDetails`.
    IDs equal to 0 are skipped.

    Returns a list of dictionaries with the keys `chain`, `dex`, `token_in`
    and `token_out`, in the same order as the non-zero IDs.
    """
    field_names = ("chain", "dex", "token_in", "token_out")
    path = []
    for swap_id in swap_ids:
        if swap_id != 0:
            matching_rows = opps_metadata_df.loc[
                opps_metadata_df["swap_id"] == swap_id
            ]
            path.append(dict(zip(field_names, getSwapDetails(matching_rows))))
    return path


# Converts a path from a csv string to a list.
def convertStringToList(path):
    """Parse the string form of a list of numbers into a list of ints.

    `path` is expected to look like `"[1.0, 2.0, 0.0, 0.0]"`: the first and last
    characters (the brackets) are dropped and the remainder is split on commas.
    Each item is parsed as a float and then truncated to an int.

    Blank items are skipped, so an empty list string (`"[]"`) yields `[]`
    instead of raising `ValueError`.

    Raises `ValueError` if an item is not a finite number.
    """
    items = path[1:-1].split(",")
    return [int(float(item)) for item in items if item.strip()]

# Load the archived opportunities and the per-swap metadata from disk.
print("Loading path data...")
opps_df = pd.read_csv("assets/opps_cleaned-2023-3-19.csv")
opps_df = opps_df.reset_index(drop=True)
opps_metadata_df = pd.read_csv("assets/opps_metadata-2023-3-19.csv")
opps_metadata_df = opps_metadata_df.reset_index(drop=True)
print("Finished loading path data!")

# Keep one row per distinct combination of the four swap ID columns.
print("Searching for unique paths in `opps_df` archive...")
unique_paths = opps_df[["swap_1_id", "swap_2_id", "swap_3_id", "swap_4_id"]]
unique_paths = unique_paths.drop_duplicates()
# `reset_index()` without `drop=True` keeps the previous row labels in an "index" column.
unique_paths = unique_paths.reset_index()
# NOTE: "Finsihed" is a typo in the log message; it is kept so the output is unchanged.
print("Finsihed searching for unique paths in `opps_df` archive!")

print(
    "Collect statistics about which chains, DEXs, and tokens are involved in the opportunities."
)
# Maps chain name -> {"dexes": [...], "tokens": [...]}. Each list holds the distinct
#   values seen on that chain, in first-seen order.
stats = {}
for _, key in unique_paths.iterrows():
    # A `swap_id` corresponds to a unique pair contract. It is unique to a specific
    #   chain, DEX, and token pair.
    for swap_id in [
        key["swap_1_id"],
        key["swap_2_id"],
        key["swap_3_id"],
        key["swap_4_id"],
    ]:
        # An ID of 0 marks an unused slot in the path.
        if swap_id == 0.0:
            continue

        metadata = opps_metadata_df[opps_metadata_df["swap_id"] == swap_id]
        chain = metadata["chain"].iloc[0]

        # Create the per-chain entry the first time the chain is seen.
        chain_stats = stats.setdefault(chain, {"dexes": [], "tokens": []})

        # DEX names belong under "dexes" (they were previously stored under "tokens").
        dex = metadata["dex"].iloc[0]
        if dex not in chain_stats["dexes"]:
            chain_stats["dexes"].append(dex)

        # Token symbols belong under "tokens" (they were previously stored under "dexes").
        # Only the input token of each swap is recorded.
        token = metadata["token_in_symbol"].iloc[0]
        if token not in chain_stats["tokens"]:
            chain_stats["tokens"].append(token)

# A token pair is often involved in multiple opportunities for a given block.
#   At each timestamp, an opportunity is dropped when it shares a swap ID with a
#   more profitable opportunity that was already kept.

# Every distinct timestamp present in the `opps_df` archive.
unique_timestamps = opps_df["timestamp"].unique()

# Maps timestamp -> list of the kept opportunity rows, most profitable first.
print("Finding valid opportunities")
valid_opps = {}
for timestamp in unique_timestamps:
    # A NaN timestamp is invalid; skip it.
    if np.isnan(timestamp):
        continue

    # Opportunities that happen at the same time, ranked by descending profit.
    same_time_rows = opps_df[opps_df["timestamp"] == timestamp]
    ranked = same_time_rows.sort_values(by="profit_usd", ascending=False)

    accepted = []
    # Swap IDs already claimed by a better opportunity at this timestamp.
    claimed_ids = set()
    for _, opp in ranked.iterrows():
        swap_ids = opp[["swap_1_id", "swap_2_id", "swap_3_id", "swap_4_id"]].to_list()

        conflict = False
        for swap_id in swap_ids:
            # `swap_3_id` and `swap_4_id` could be 0 if not used. Stop checking at the
            #   first 0.
            if swap_id == 0:
                break
            # The ID was claimed by a more profitable opportunity, so this one is inferior.
            if swap_id in claimed_ids:
                conflict = True
                break

        if not conflict:
            # Claim all of this opportunity's IDs (a 0 may be added too; it is never
            #   looked up because the check above stops at 0 first).
            claimed_ids.update(swap_ids)
            accepted.append(opp)

    valid_opps[timestamp] = accepted


# Block rate (in seconds) for each chain that we track, keyed by chain name.
block_rate = dict(
    polygon=2,
    ethereum=12,
    bsc=3,
    fantom=3,
    avalanche=1,
    gnosis=5,
    arbitrum=1,
    optimism=2,
)

# Track the timestamps, profits, and block numbers where each opportunity occurs.
#   `path_data` is keyed by the string form of the path's four swap IDs.
path_data = {}
for timestamp, opps in valid_opps.items():
    for opp in opps:
        path = str(opp[['swap_1_id', 'swap_2_id', 'swap_3_id', 'swap_4_id']].to_list())

        # Block numbers of the (up to four) swaps of the current opportunity.
        block_numbers = opp[['swap_1_block', 'swap_2_block', 'swap_3_block', 'swap_4_block']].tolist()

        # Create the record the first time a path is seen, then append to its
        #   parallel lists.
        record = path_data.setdefault(
            path,
            {'timestamps': [], 'profit_usd': [], 'block_numbers': []},
        )
        record['timestamps'].append(timestamp)
        record['profit_usd'].append(opp['profit_usd'])
        record['block_numbers'].append(block_numbers)

# Sort the `path_data` by timestamps. Basically gets us a time series version of the data.
#   All three parallel lists are reordered together so that index `k` keeps describing
#   the same opportunity in each of them.
for path, data in path_data.items():
    timestamps = data['timestamps']
    profits = data['profit_usd']
    block_numbers = data['block_numbers']

    # Order by (timestamp, profit). Indices are sorted rather than the records so the
    #   block-number lists are never compared with each other.
    order = sorted(
        range(len(timestamps)), key=lambda k: (timestamps[k], profits[k])
    )

    data['timestamps'] = [timestamps[k] for k in order]
    # Write back to 'profit_usd', the key that later steps read. The sorted profits used
    #   to be stored only under the misspelled key 'profits_usd', which left 'profit_usd'
    #   out of step with the sorted timestamps.
    data['profit_usd'] = [profits[k] for k in order]
    data['block_numbers'] = [block_numbers[k] for k in order]

    # Kept so that existing readers of the old 'profits_usd' key still work.
    data['profits_usd'] = data['profit_usd']

# Stores opportunity clusters, which are clustered by relative profit and time.
#   Maps path -> {'profits': [cluster, ...], 'timestamps': [cluster, ...]}, where each
#   cluster is a list of consecutive observations.
unique_opps_profit_time = {}

# An observation stays in the current cluster only if its profit is within 10% of the
#   previous observation's profit...
max_deviation = .1
# ...and it occurs at most 100 seconds after the previous observation.
max_delta = 100

# Loop through each path and assign its opportunities to a cluster.
for path, data in path_data.items():

    # Initialize the clustering dictionary for this path.
    clusters = {
        'profits': [],
        'timestamps': []
    }
    unique_opps_profit_time[path] = clusters

    # Get the swap IDs used in the path.
    swap_ids = convertStringToList(path)

    # Find the slowest chain in the path.
    # NOTE(review): `slowest_block` / `slowest_chain` are not used by the clustering
    #   below, which relies on the fixed `max_delta` - confirm whether that is intended.
    slowest_block = 0
    for swap_id in swap_ids:
        if swap_id == 0:
            break

        # Get the chain name corresponding to the swap ID: Ethereum, Polygon, etc.
        chain_name = opps_metadata_df[opps_metadata_df['swap_id'] == swap_id]['chain'].iloc[0]

        # Check if the current chain is the slowest in the path.
        if block_rate[chain_name] > slowest_block:
            slowest_block = block_rate[chain_name]
            slowest_chain = chain_name

    # Find all the opportunities and how long they persist when clustered.
    timestamps = data['timestamps']
    profits = data['profit_usd']

    # The first observation always opens the first cluster.
    cluster_profits = [profits[0]]
    cluster_timestamps = [timestamps[0]]

    # Compare every observation with the one directly before it.
    previous_points = zip(timestamps, profits)
    current_points = zip(timestamps[1:], profits[1:])
    for (time_reference, profit_reference), (current_time, current_profit) in zip(
        previous_points, current_points
    ):
        proportion = current_profit / profit_reference
        delta = current_time - time_reference

        starts_new_cluster = (
            proportion < (1 - max_deviation)
            or proportion > (1 + max_deviation)
            or delta > max_delta
        )
        if starts_new_cluster:
            # Too different from the previous observation: store the finished cluster
            #   and open a new one.
            clusters['profits'].append(cluster_profits)
            clusters['timestamps'].append(cluster_timestamps)
            cluster_profits = []
            cluster_timestamps = []

        cluster_profits.append(current_profit)
        cluster_timestamps.append(current_time)

    # Store the cluster that is still open once the observations run out. Without this
    #   the last cluster of every path is dropped, and a path with a single
    #   observation ends up with no clusters at all.
    clusters['profits'].append(cluster_profits)
    clusters['timestamps'].append(cluster_timestamps)

# Summarize the clusters of each path and attach a readable description of the path.
for key, metadata in unique_opps_profit_time.items():
    # Pair up the timestamps and profits of each cluster.
    clusters = list(zip(metadata['timestamps'], metadata['profits']))

    # Sum of the best / worst profit of every cluster.
    metadata['max_profit_sum'] = sum(max(profits) for _, profits in clusters)
    metadata['min_profit_sum'] = sum(min(profits) for _, profits in clusters)
    # Number of clusters found for this path.
    metadata['num_opportunities'] = len(metadata['profits'])
    # Total number of observations across all clusters (a count, not elapsed seconds).
    metadata['opportunity_duration_sum'] = sum(
        len(timestamps) for timestamps, _ in clusters
    )

    # The key is the string form of the swap IDs; expand it into per-swap metadata.
    metadata['path'] = swapsIDsToPath(convertStringToList(key))
    
# Save each variable to its own pickle file under `assets/`, in the order listed.
print("Saving variables...")
pickle_targets = [
    ('assets/opps.pkl', opps_df),
    ('assets/opps_metadata_df.pkl', opps_metadata_df),
    ('assets/path_data.pkl', path_data),
    ('assets/unique_opps_profit_time.pkl', unique_opps_profit_time),
    ('assets/unique_paths.pkl', unique_paths),
    ('assets/unique_timestamps.pkl', unique_timestamps),
    ('assets/block_rate.pkl', block_rate),
    ('assets/stats.pkl', stats),
]
for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)


Loading path data...
Finished loading path data!
Searching for unique paths in `opps_df` archive...
Finsihed searching for unique paths in `opps_df` archive!
Collect statistics about which chains, DEXs, and tokens are involved in the opportunities.
Finding valid opportunities
Saving variables...
