In [8]:
import os
import re
import pandas as pd
from datetime import datetime

OUTPUT_FILE = 'C:/Users/YuweiCao/Documents/GitHub/Project/Project/etherscan/result'
api_key = "VQAIR728IM4Z8RZKPYBR4ESM5I3WBZK2C1" # my free API key, you can get one at https://etherscan.io/myapikey
base_url = "https://api.etherscan.io/v2/api" # We're using the v2 API 2024/12/12
ADDRESS = "0x5be9a4959308A0D0c7bC0870E319314d8D957dBB" # Address of the contract we want to get the source code of

In [9]:
def highlight_three_records(grouped_df, data):
    matched_hashes = []

    for tx_hash, group in grouped_df:
        if len(group) == 3:
            print(f"⚠️ High Alert: Transaction Hash {tx_hash} contains 3 records:")
            print(group)
            matched_hashes.append(tx_hash)

    # i want to delete the abnormal data
    remaining_data = data[~data['hash'].isin(matched_hashes)].reset_index(drop=True)
    return remaining_data


def format_number(value):
    if value < 1_000:
        return f"{int(value)}"
    elif value < 1_000_000:
        return f"{value / 1_000:.1f}K"
    elif value < 1_000_000_000:
        return f"{value / 1_000_000:.1f}M"
    else:
        return f"{value / 1_000_000_000:.1f}B"


In [10]:
global BASE_TOKENS
BASE_TOKENS = {"USDT", "USDC", "USDE"}
ADDRESS = ADDRESS.lower()

csv_file = 'C:/Users/YuweiCao/Documents/GitHub/Project/Project/etherscan/result/erc20_transfers.csv'
transaction_data = pd.read_csv(csv_file)
# print(transaction_data.head())
transaction_data['dateTime'] = pd.to_datetime(transaction_data['dateTime'])
transaction_data = transaction_data.sort_values(by=['dateTime', 'hash']).reset_index(drop=True)
# make sure the date data is correct

columns_to_keep = [
    'dateTime', 'blockNumber', 'timeStamp', 'hash', 'from', 'to',
    'value', 'tokenName', 'tokenSymbol'
]

duplicate_hashes = transaction_data[transaction_data.duplicated(subset=['hash'], keep=False)]
transaction_data_1 = transaction_data[~transaction_data['hash'].isin(duplicate_hashes['hash'])].reset_index(drop=True)
filtered_transaction_data = transaction_data_1[columns_to_keep]
output_file = 'filtered_transaction_data.csv'
filtered_transaction_data.to_csv(output_file, index=False)
# just a check, if there are abnoraml data, print it and delete from original data
three_record_hashes = highlight_three_records(duplicate_hashes.groupby('hash'), transaction_data)

duplicate_hashes = duplicate_hashes[~duplicate_hashes['hash'].isin(three_record_hashes)]

output_records = []

for hash_val, group in duplicate_hashes.groupby('hash'):
    base_tokens = group[group['tokenSymbol'].isin(BASE_TOKENS)]
    other_tokens = group[~group['tokenSymbol'].isin(BASE_TOKENS)]
    
    if not base_tokens.empty and not other_tokens.empty:
        if base_tokens['to'].iloc[0] == ADDRESS:
            transaction_type = "SELL"
        else:
            transaction_type = "BUY"

        base_token_info = f"{base_tokens['value'].sum()} {base_tokens['tokenSymbol'].iloc[0]}"
        other_token_info = f"{other_tokens['value'].sum()} {other_tokens['tokenSymbol'].iloc[0]}"

        record = f"{group['timeStamp'].iloc[0]} W {transaction_type} {other_token_info} of {base_token_info} (at {group['dateTime'].iloc[0]})"
        output_records.append({
            "formatted_record": record,
            "timeStamp": group['timeStamp'].iloc[0],
            "dateTime": group['dateTime'].iloc[0]
        })
    elif base_tokens.empty and not other_tokens.empty:
        # 如果 tokenSymbol 都不在 BASETOKENS 中，额外输出一下
        record = f"{group['timeStamp'].iloc[0]} Same Hash {hash_val}: Tokens not in BASE_TOKENS: {', '.join(group['tokenSymbol'].unique())} (at {group['dateTime'].iloc[0]})"
        output_records.append({
            "formatted_record": record,
            "timeStamp": group['timeStamp'].iloc[0],
            "dateTime": group['dateTime'].iloc[0]
        })
        
output_df = pd.DataFrame(output_records)
if not output_df.empty:
    output_df = output_df.sort_values(by='timeStamp').reset_index(drop=True)

    print("\nFormatted Transactions:")
    for record in output_df['formatted_record']:
        print(record)


In [None]:
matched_records = []
matched_indices = []

skip_next = False
for i in range(len(transaction_data_1) - 1):
    if skip_next:
        skip_next = False
        continue

    current_row = transaction_data_1.iloc[i]
    next_row = transaction_data_1.iloc[i + 1]

    if ((current_row['to'] == ADDRESS and next_row['from'] == ADDRESS) or
        (current_row['from'] == ADDRESS and next_row['to'] == ADDRESS)):
        
        if ((current_row['tokenSymbol'] in BASE_TOKENS or next_row['tokenSymbol'] in BASE_TOKENS) and
        not (current_row['tokenSymbol'] in BASE_TOKENS and next_row['tokenSymbol'] in BASE_TOKENS)):
            
            if current_row['to'] == ADDRESS:
                transaction_type = "\'SELL\'"
                base_token = current_row
                other_token = next_row
            else:
                transaction_type = "\'BUY\'"
                base_token = next_row
                other_token = current_row

            base_token_info = f"{base_token['value']} {base_token['tokenSymbol']}"
            other_token_info = f"{other_token['value']} {other_token['tokenSymbol']}"

            record = (
                f"{current_row['timeStamp']} W {transaction_type} {other_token_info} of {base_token_info} "
                f"(at {current_row['dateTime']})"
            )
            
            matched_records.append({"formatted_record": record, "dateTime": current_row['dateTime']})
            matched_indices.extend([i, i + 1])

            skip_next = True

# delete the matched data for later processing
transaction_data_2 = transaction_data_1.drop(index=matched_indices).reset_index(drop=True)

matched_df = pd.DataFrame(matched_records)

# combine the first two dataframes and sort by time
combined_df = pd.concat([output_df, matched_df], ignore_index=True)

if not combined_df.empty:
    combined_df = combined_df.sort_values(by='dateTime').reset_index(drop=True)
    for record in combined_df['formatted_record']:
        print(record)

0x5be9a4959308a0d0c7bc0870e319314d8d957dbb 0xe217e15b3c19cc0427f9492dc3bcfe8220afad10 0x9008d19f58aabd9ed0d60971565aa8510560ab41 0x5be9a4959308a0d0c7bc0870e319314d8d957dbb
0x9008d19f58aabd9ed0d60971565aa8510560ab41 0x5be9a4959308a0d0c7bc0870e319314d8d957dbb 0x5be9a4959308a0d0c7bc0870e319314d8d957dbb 0x78ff9211317620de95602c9cbed3ae803689e545
1733940791 W 'BUY' 2187457492438 USDC of 10000000000000 OPTIMUS (at 2024-12-11 18:13:11)


In [12]:
single_records = []

for i, row in transaction_data_2.iterrows():
    if row['tokenSymbol'] in BASE_TOKENS:
        if row['from'] == ADDRESS:
            transaction_type = "single SELL"
        elif row['to'] == ADDRESS:
            transaction_type = "single BUY"
    else:
        if row['from'] == ADDRESS:
            transaction_type = "single BUY"
        elif row['to'] == ADDRESS:
            transaction_type = "single SELL"

    record = f"{row['timeStamp']} W {transaction_type} {row['value']} {row['tokenSymbol']} (at {row['dateTime']})"
    single_records.append({
        "formatted_record": record,
        "dateTime": row['dateTime'],
        "timeStamp": row['timeStamp'],
        "hash": row['hash']
    })

single_df = pd.DataFrame(single_records)

final_combined_df = pd.concat([output_df, matched_df, single_df], ignore_index=True)
if not final_combined_df.empty:
    final_combined_df = final_combined_df.sort_values(by='dateTime').reset_index(drop=True)
    for record in final_combined_df['formatted_record']:
        print(record)


1733794511 W single BUY 25541000000 USDT (at 2024-12-10 01:35:11)
1733808299 W single BUY 682700000 USDT (at 2024-12-10 05:24:59)
1733809571 W single BUY 103070000 USDT (at 2024-12-10 05:46:11)
1733823263 W single BUY 100035268 USDC (at 2024-12-10 09:34:23)
1733827811 W single BUY 188000000 USDT (at 2024-12-10 10:50:11)
1733829875 W single BUY 990000000 USDT (at 2024-12-10 11:24:35)
1733831435 W single BUY 239131209 USDT (at 2024-12-10 11:50:35)
1733831567 W single SELL 5470000000000000 WETH (at 2024-12-10 11:52:47)
1733837087 W single BUY 1043248700 USDT (at 2024-12-10 13:24:47)
1733842199 W single BUY 4923000000 USDT (at 2024-12-10 14:49:59)
1733852735 W single BUY 22082304898 USDT (at 2024-12-10 17:45:35)
1733853203 W single BUY 340240000 USDT (at 2024-12-10 17:53:23)
1733863355 W single BUY 122572044 USDC (at 2024-12-10 20:42:35)
1733863535 W single BUY 183506266 USDT (at 2024-12-10 20:45:35)
1733876159 W single BUY 100000000 USDC (at 2024-12-11 00:15:59)
1733877095 W single SELL 7