In [1]:
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup

# Directory containing the CSV files this notebook reads; later cells build
# file paths by appending "/<file name>" to this string.
# NOTE(review): absolute, machine-specific path — has to be edited before the
# notebook can run on another machine.
OUTPUT_FILE = "C:/Users/YuweiCao/Documents/GitHub/Project/Project/etherscan/result"
# Address under investigation. A later cell compares against ADDRESS.lower()
# to drop this address from the list of frequent counterparties.
ADDRESS = "0x5be9a4959308A0D0c7bC0870E319314d8D957dBB"


In [2]:
# Load the ERC-20 transfer records.
transfers_df = pd.read_csv(OUTPUT_FILE + "/all_erc20_transfers.csv")

# Tally how often each address appears as sender ("from") and as receiver ("to").
from_address_counts = (
    transfers_df["from"]
    .value_counts()
    .rename_axis("address")
    .reset_index(name="from_count")
)
to_address_counts = (
    transfers_df["to"]
    .value_counts()
    .rename_axis("address")
    .reset_index(name="to_count")
)

# Outer-join the two tallies so addresses seen on only one side are kept;
# the missing side is filled with 0 and both counts are cast back to int.
address_counts = (
    from_address_counts
    .merge(to_address_counts, on="address", how="outer")
    .fillna(0)
    .astype({"from_count": int, "to_count": int})
)

print(address_counts)
address_counts.to_csv("from_to_address_counts.csv", index=False)


                                       address  from_count  to_count
0   0x0478aa0f766bbcfd0882ad8ae5df5f0531f06065           2         0
1   0x119ca458c5f91d3815e5f8a29633d709569b4eb3           0         1
2   0x1246e490308db61dca45ead613f47430de3830ad           0         2
3   0x2887f96c5b3dc08dbe1312017310b2f26b228e31           0         2
4   0x2ac62e416ce9fc52394a045b66dee9dead4be053           1         0
5   0x2d732631d248006f94b52af8fd0a610c85894d6a           1         0
6   0x33c59f63771f2ab4c9f20c3955526416f02528ca           0         2
7   0x3b54f4fbfafd0e7167dcd70594e2d1b8bcbfe4e9           0         2
8   0x4125af68487bec121a2b90c0e15d824b53899533           1         0
9   0x4ac3bf54fbbf481c991d2f04bad2ff6393d9eb42           1         0
10  0x5851a4148233d84fcbf255109f99bbdd6c8c9cba           0         2
11  0x5be9a4959308a0d0c7bc0870e319314d8d957dbb         112      4151
12  0x6ae4eb64fd04e36a006969135f5013cbb0c15285           1         0
13  0x6e3382c4c4a69a3a62b489a456b3

In [3]:
# Total number of transfers each address takes part in (sent + received).
address_counts["total_count"] = address_counts["from_count"] + address_counts["to_count"]

address_threshold = 10  # keep addresses with strictly more transfers than this

# Select the frequent counterparties, busiest first, and drop the address under
# investigation itself (compared in lowercase, like the addresses in the data).
frequent_addresses = address_counts[
    address_counts["total_count"] > address_threshold
].sort_values(by="total_count", ascending=False)
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of `address_counts`
# (which is what triggers pandas' SettingWithCopyWarning).
important_addresses = frequent_addresses[
    frequent_addresses["address"] != ADDRESS.lower()
].copy()
print(important_addresses)

# Vectorized string concatenation instead of a per-row lambda.
important_addresses["etherscan_url"] = (
    "https://etherscan.io/address/" + important_addresses["address"].astype(str)
)

print(important_addresses[["etherscan_url"]])

def get_public_name_tag(address, timeout=10):
    """
    Return the leading part of the <title> of an address's Etherscan page.

    The page ``https://etherscan.io/address/<address>`` is downloaded and the
    text of its <title> element is cut at the first "|" and then at the first
    line break; the stripped remainder is used as the name tag.

    Args:
        address: Address string appended to the Etherscan URL.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The extracted name tag; "Unknown" when the page has no usable title;
        or a string starting with "Error fetching data:" when the request
        fails or the status code is not 200.
    """
    url = f"https://etherscan.io/address/{address}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as bad status codes, so one
        # unreachable page does not abort a whole batch of lookups.
        return f"Error fetching data: {exc}"

    if response.status_code != 200:
        return f"Error fetching data: {response.status_code}"

    soup = BeautifulSoup(response.text, "html.parser")
    # soup.title is None when the document has no <title>; .string can be
    # None as well, so both cases are folded into "Unknown".
    title = soup.title.string if soup.title is not None else None
    if not title:
        return "Unknown"

    name_tag = title.split("|")[0].strip()  # part before the first "|"
    name_tag = name_tag.split("\n")[0].strip()  # only the first line of that part
    return name_tag or "Unknown"
    
# Example: look up a single address
# address = "0x9008d19f58aabd9ed0d60971565aa8510560ab41"
# name_tag = get_public_name_tag(address)
# print(f"Public Name Tag for {address}: {name_tag}")

# Resolve the name tag of every important address, one lookup per row.
important_addresses["name"] = important_addresses["address"].map(get_public_name_tag)

print(important_addresses[["address", "name", "etherscan_url"]])

                                       address  from_count  to_count  \
31  0xe217e15b3c19cc0427f9492dc3bcfe8220afad10        4097         1   
20  0x9008d19f58aabd9ed0d60971565aa8510560ab41          40        73   

    total_count  
31         4098  
20          113  
                                        etherscan_url
31  https://etherscan.io/address/0xe217e15b3c19cc0...
20  https://etherscan.io/address/0x9008d19f58aabd9...
                                       address                          name  \
31  0xe217e15b3c19cc0427f9492dc3bcfe8220afad10   TransparentUpgradeableProxy   
20  0x9008d19f58aabd9ed0d60971565aa8510560ab41  CoW Protocol: GPv2Settlement   

                                        etherscan_url  
31  https://etherscan.io/address/0xe217e15b3c19cc0...  
20  https://etherscan.io/address/0x9008d19f58aabd9...  


In [None]:
def get_top_accounts(page=1, timeout=10):
    """
    Fetch one page of the Etherscan accounts listing and parse its table rows.

    Args:
        page: Page number placed in the ``p`` query parameter of the URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with one dict per parsed row, keyed by "Rank", "Address",
        "Name Tag", "Balance", "Percentage" and "Txn Count" (all values are
        stripped strings). An empty list is returned, after printing a
        message, when the request fails, the status code is not 200, or the
        page contains no matching table.
    """
    url = f"https://etherscan.io/accounts?p={page}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Same contract as a bad status code: report and return nothing, so a
        # single failed page does not abort a multi-page scrape.
        print(f"Failed to fetch page {page}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch page {page}, status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", class_="table")
    if table is None:
        # find() returns None when nothing matches (e.g. a block/captcha page).
        print(f"Failed to fetch page {page}: accounts table not found")
        return []

    accounts = []
    # The first <tr> is skipped (presumably the header row).
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        if len(cols) < 6:
            continue  # not a complete data row; indexing below would fail
        address_tag = cols[1].find("a")  # <a> element that links to the address page
        if address_tag is None:
            continue

        accounts.append({
            "Rank": cols[0].text.strip(),
            # The full address is the last path segment of the link target.
            "Address": address_tag["href"].split("/")[-1],
            "Name Tag": cols[2].text.strip(),
            "Balance": cols[3].text.strip(),
            "Percentage": cols[4].text.strip(),
            "Txn Count": cols[5].text.strip(),
        })
    return accounts

def scrape_top_accounts(max_pages=5, delay=2):
    """
    Collect the account rows of pages 1..max_pages into a single list.

    Args:
        max_pages: Number of listing pages to request, starting at page 1.
        delay: Seconds to pause between two consecutive page requests, to
            avoid sending requests too frequently.

    Returns:
        The concatenation, in page order, of the lists returned by
        ``get_top_accounts`` for each page. Pages that yield an empty list
        simply contribute nothing.
    """
    all_accounts = []
    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")
        all_accounts.extend(get_top_accounts(page))
        if page < max_pages:
            # Throttle only between requests; no pause is needed after the last page.
            time.sleep(delay)
    return all_accounts

# Scrape the top 10,000 accounts (400 pages; presumably 25 rows per page — verify).
top_accounts = scrape_top_accounts(max_pages=400)

top_accounts_df = pd.DataFrame(top_accounts)

# Save into the result directory: a later cell loads
# OUTPUT_FILE + "/top_accounts.csv", so writing to the current working
# directory could leave that cell reading a stale or missing file.
top_accounts_path = OUTPUT_FILE + "/top_accounts.csv"
if top_accounts_df.empty:
    # Every page failed; keep any previously saved file instead of overwriting it.
    print("No accounts scraped; top_accounts.csv was not written")
else:
    top_accounts_df.to_csv(top_accounts_path, index=False)
    print(f"Top accounts data saved to {top_accounts_path}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

In [7]:
# Reload the scraped top-accounts table from the result directory.
top_accounts_df = pd.read_csv(OUTPUT_FILE + "/top_accounts.csv")

# Hex addresses are case-insensitive, and the important addresses were filtered
# against ADDRESS.lower() earlier, so compare both sides in lowercase. The
# scraped addresses may use mixed-case (checksum) spelling — TODO confirm.
top_account_addresses = top_accounts_df["Address"].astype(str).str.lower()
matching_rows = important_addresses[
    important_addresses["address"].astype(str).str.lower().isin(top_account_addresses)
]

print("Matching rows:")
print(matching_rows)

Matching rows:
Empty DataFrame
Columns: [address, from_count, to_count, total_count, etherscan_url, name]
Index: []
