In [96]:
import datetime
import os

import pandas as pd
import requests
from sklearn.preprocessing import LabelEncoder

# Etherscan API key. Read from the ETHERSCAN_API_KEY environment variable so a
# real key never has to be written into the notebook; the original placeholder
# is kept as the fallback.
api_key = os.environ.get('ETHERSCAN_API_KEY', 'api-key')

# Endpoint every account query below is sent to (Optimism, per the hostname).
url = "https://api-optimistic.etherscan.io/api"

# Module-level encoder instance; the label-encoding cell further down rebinds
# `le` to a fresh encoder for each column.
le = LabelEncoder()

In [97]:
def first_last_info(address):
    """Summarise the first and last normal transactions returned for ``address``.

    Sends a ``txlist`` request (page 1, offset 100, ascending sort), so "last"
    means the last row of that single page, not necessarily the wallet's most
    recent transaction.

    Args:
        address: Wallet address to look up.

    Returns:
        ``[first_date, last_date, first_from, first_to, last_from, last_to]``.
        The dates are the API's ``timeStamp`` values, passed through unchanged.
        Any sender/recipient equal to ``address`` (ignoring letter case) is
        replaced by the string ``'self'``. If the API returns no transactions,
        a list of six ``None`` values is returned so the row width is stable.

    Raises:
        RuntimeError: if the response's ``result`` field is not a list (the
            API reports errors such as rate limiting as a string there).
    """
    params = {
        "module": "account",
        "action": "txlist",
        "address": address,
        "page": 1,
        "offset": 100,
        "startblock": 0,
        "endblock": 99999999,
        "sort": "asc",
        "apikey": api_key
    }

    # A timeout keeps one stalled connection from hanging the whole query loop.
    payload = requests.get(url, params=params, timeout=30).json()
    transactions = payload.get('result')
    if not isinstance(transactions, list):
        raise RuntimeError(
            f"Unexpected txlist response for {address}: {payload!r}")
    if not transactions:
        return [None] * 6

    first_tx, last_tx = transactions[0], transactions[-1]
    own_address = str(address).lower()

    def _label(party):
        # Hex addresses may be written checksummed (mixed case) or lowercase;
        # compare case-insensitively so both spellings count as the wallet.
        if isinstance(party, str) and party.lower() == own_address:
            return 'self'
        return party

    return [
        first_tx['timeStamp'],
        last_tx['timeStamp'],
        _label(first_tx['from']),
        _label(first_tx['to']),
        _label(last_tx['from']),
        _label(last_tx['to']),
    ]

In [98]:
def get_transaction_history(address):
    """Fetch the first page of normal transactions for ``address``.

    Sends a ``txlist`` request (page 1, offset 100, ascending sort) to ``url``.

    Args:
        address: Wallet address to look up.

    Returns:
        The list found under the response's ``result`` key (empty when the
        wallet has no matching transactions).

    Raises:
        RuntimeError: if ``result`` is not a list. The API reports errors such
            as rate limiting as a string in that field, which would otherwise
            be mistaken for a transaction history by the callers.
    """
    params = {
        "module": "account",
        "action": "txlist",
        "address": address,
        "page": 1,
        "offset": 100,
        "startblock": 0,
        "endblock": 99999999,
        "sort": "asc",
        "apikey": api_key
    }

    # A timeout keeps one stalled connection from hanging the whole query loop.
    payload = requests.get(url, params=params, timeout=30).json()
    transactions = payload.get('result')
    if not isinstance(transactions, list):
        raise RuntimeError(
            f"Unexpected txlist response for {address}: {payload!r}")
    return transactions
# -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


def get_Erc20_transaction_history(address):
    """Fetch the first page of ERC-20 token transfers for ``address``.

    Sends a ``tokentx`` request (page 1, offset 100, ascending sort) to ``url``.

    Args:
        address: Wallet address to look up.

    Returns:
        The list found under the response's ``result`` key (empty when the
        wallet has no matching transfers).

    Raises:
        RuntimeError: if ``result`` is not a list. The API reports errors such
            as rate limiting as a string in that field, which would otherwise
            be mistaken for a transfer history by the callers.
    """
    params = {
        "module": "account",
        "action": "tokentx",
        "address": address,
        "page": 1,
        "offset": 100,
        "startblock": 0,
        "endblock": 99999999,
        "sort": "asc",
        "apikey": api_key
    }

    # A timeout keeps one stalled connection from hanging the whole query loop.
    payload = requests.get(url, params=params, timeout=30).json()
    transfers = payload.get('result')
    if not isinstance(transfers, list):
        raise RuntimeError(
            f"Unexpected tokentx response for {address}: {payload!r}")
    return transfers
# -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


def get_wallet_age(history: list[dict]):
    """Return the number of days between the first entry of ``history`` and today.

    The first entry's ``timeStamp`` (epoch seconds) is converted to a local
    calendar date and subtracted from today's local date. An empty history
    yields 0.
    """
    if len(history) == 0:
        return 0

    first_seen = datetime.datetime.fromtimestamp(int(history[0]['timeStamp']))
    elapsed = datetime.date.today() - first_seen.date()
    return elapsed.days
# -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


def to_and_from(history: list[dict], address):
    """Count how many entries of ``history`` were sent by ``address``.

    Args:
        history: Transaction dicts, each with a ``'from'`` key.
        address: Wallet address the counts are relative to.

    Returns:
        ``(from_count, to_count)`` - in that order. ``from_count`` is the number
        of entries whose sender is ``address``; ``to_count`` is every other
        entry (the recipient field is not inspected).
    """
    # Hex addresses may be written checksummed (mixed case) or lowercase;
    # compare case-insensitively so the caller's spelling does not matter.
    own_address = str(address).lower()
    from_count = sum(
        1 for tx in history if str(tx['from']).lower() == own_address)
    return from_count, len(history) - from_count
# -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


def fetch(address, nested_list):
    """Query the on-chain features of ``address`` and append them as one row.

    The appended row is
    ``[address, txn_count, reg_age, erc_age, reg_to, reg_from, erc_to,
    erc_from, first_date, last_date, first_from, first_to, last_from, last_to]``.

    Args:
        address: Wallet address to query.
        nested_list: List that receives the row; it is mutated in place.

    Note:
        ``first_last_info`` issues its own ``txlist`` request, so each call
        performs three HTTP requests in total.
    """
    reg_hist = get_transaction_history(address)
    transacting_hist = first_last_info(address)
    erc20_hist = get_Erc20_transaction_history(address)

    txn_count = len(reg_hist)

    reg_age = get_wallet_age(reg_hist)
    erc_age = get_wallet_age(erc20_hist)

    # to_and_from returns (from_count, to_count); unpack in that order so the
    # *_to / *_from values hold what their names say. Rows produced while the
    # unpacking was reversed have the two values of each pair swapped.
    reg_from, reg_to = to_and_from(reg_hist, address)
    erc_from, erc_to = to_and_from(erc20_hist, address)

    row = [address, txn_count, reg_age, erc_age, reg_to,
           reg_from, erc_to, erc_from] + transacting_hist

    nested_list.append(row)

In [99]:
# Raw inputs: the vote export and the round's grant list.
votes = pd.read_csv(
    'votes_features_citizen_0x984e29dCB4286c2D9cbAA2c238AfDd8A191Eefbc.csv')
original = pd.read_csv('Gitcoin Citizens  Round #1_ Retroactive funding .csv')

# Parse the timestamp, keep the five columns used downstream, give them the
# names the rest of the notebook expects, and order the votes chronologically.
votes = (
    votes
    .assign(block_timestamp=lambda frame: pd.to_datetime(
        frame['block_timestamp'], format="%Y-%m-%dT%H:%M:%S.%fZ"))
    .loc[:, ['block_timestamp', 'tx_hash', 'voter', 'project', 'amount_usd']]
    .rename(columns={'block_timestamp': 'utc_time', 'amount_usd': 'amountUSD',
                     'tx_hash': 'transaction', 'project': 'grantAddress'})
    .sort_values(by=['utc_time'])
)

In [100]:
# Lower-case the grant address in both frames so the same grant matches
# regardless of how its address was capitalised.
original = original.assign(grantAddress=original['grantAddress'].str.lower())
votes = votes.assign(grantAddress=votes['grantAddress'].str.lower())

# grant address -> grant title lookup (a later duplicate address overrides an
# earlier one).
mapping_dict = {
    grant: title
    for grant, title in zip(original['grantAddress'], original['title'])
}

# Votes for a grant that is absent from the lookup get a NaN title.
votes = votes.assign(title=votes['grantAddress'].map(mapping_dict))

In [101]:
# Uncomment to re-run the API queries for every unique voter; the results are written to 'queried_data.csv'.
# headers = ['voter','txn_count','Wallet_Age','Wallet_Age(Erc20)','to_count','from_count','erc_to','erc_from','first_date','last_date','first_from','first_to','last_from','last_to']
# contents = []

# count = 0
# for i in votes['voter'].unique():
#     print(count)
#     fetch(i,contents)
#     count+=1

# data = pd.DataFrame(contents,columns=headers)
# data.to_csv('queried_data.csv',index=False)

In [102]:
# Load the per-voter query results from 'queried_data.csv' (the file the
# commented-out query cell above writes) instead of hitting the API again.
queried = pd.read_csv('queried_data.csv')

In [103]:
# Number of vote rows per voter, also exposed as an Address/Count table.
address_counts = votes['voter'].value_counts()
Address_info_sybil = pd.DataFrame(
    {'Address': address_counts.index, 'Count': address_counts.values})

funding_counts = votes['voter'].value_counts()

# One row per (voter, grant) pair; counting the voter occurrences in it gives
# the number of distinct grants each voter funded.
count_by_address_project = (
    votes
    .groupby(['voter', 'grantAddress'])
    .size()
    .reset_index(name='count')
)
no_grants_funded = count_by_address_project['voter'].value_counts()

# Turn both count series into two-column frames and join them on the shared
# 'voter' column.
Address_info1 = funding_counts.rename_axis('voter').reset_index(
    name='Funding_count')
Address_info2 = no_grants_funded.rename_axis('voter').reset_index(
    name='No_Citizens_Funded')
Address_info = Address_info1.merge(Address_info2)

In [104]:
# Attach the per-voter counts to every vote row (left join on the counts table).
filtered_votes = Address_info.merge(votes, on='voter', how='left')

In [105]:
# Columns carried over from filtered_votes into the one-row-per-voter table.
citizen_points = ['voter', 'Funding_count', 'No_Citizens_Funded', 'utc_time']

In [106]:
# Duplicate of the voter column, used as the grouping key below.
filtered_votes['address'] = filtered_votes['voter']

# Normalise each title to its lower-cased words, sorted and joined with '-'.
# A vote whose grant had no title in the lookup carries NaN, which has no
# .lower(); fall back to the grant address (then to '') so the row still
# contributes a token instead of raising.
filtered_votes['project_title_sorted'] = (
    filtered_votes['title']
    .fillna(filtered_votes['grantAddress'])
    .fillna('')
    .apply(lambda x: '-'.join(sorted(x.lower().split())))
)

# One row per voter: the normalised titles of all of that voter's votes,
# concatenated with '_' in row order.
df_result = filtered_votes.groupby('address').agg({'voter': 'first',
                                                   'project_title_sorted': '_'.join}).reset_index()

# Order by the concatenated title string, descending. This is a lexicographic
# sort of the string, not a sort by the number of titles.
df_result = df_result.sort_values(by='project_title_sorted', ascending=False)[
    ['voter', 'project_title_sorted']]

# First row of each voter, restricted to the citizen_points columns.
cut_filtered_citizens = filtered_votes[citizen_points].drop_duplicates(subset=[
                                                                       'voter'])

# Per-voter table: counts + first-row timestamp + concatenated title string.
cultivate_data_citizen = pd.merge(
    cut_filtered_citizens, df_result, on='voter', how='left')

In [107]:
# Keep only voters with at most 7 funded grants and at most 7 votes, then
# renumber the rows.
cultivate_data_citizen = (
    cultivate_data_citizen
    .query("No_Citizens_Funded <= 7 and Funding_count <= 7")
    .reset_index(drop=True)
)

In [108]:
# Inner join with the queried per-voter data; voters absent from it are dropped.
cultivate_data_citizen = cultivate_data_citizen.merge(queried, on='voter')

In [109]:
# Discard every voter row that still has a missing value in any column.
cultivate_data_citizen = cultivate_data_citizen.dropna()

In [110]:
# Order chronologically, add the vote time as whole epoch seconds, and move
# utc_time into the index (which leaves 'voter' as the first column).
cultivate_data_citizen = cultivate_data_citizen.sort_values(by='utc_time')
cultivate_data_citizen = cultivate_data_citizen.assign(
    unix_timestamp=cultivate_data_citizen['utc_time'].map(
        lambda moment: int(moment.timestamp()))
).set_index("utc_time")


In [111]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [129]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that must become numeric before scaling.
columns_to_encode = ['project_title_sorted',
                     'first_from', 'first_to', 'last_from', 'last_to']

# One fresh encoder per column: the integer codes are only meaningful within
# their own column. A single pass is sufficient - encoding the resulting codes
# again would map every code to itself.
for col in columns_to_encode:
    le = LabelEncoder()
    cultivate_data_citizen[col] = le.fit_transform(cultivate_data_citizen[col])

In [160]:
# Collects the voter address of every row that the clustering cell below puts
# into a cluster. It is only emptied when this cell runs, so re-running the
# clustering cell on its own keeps adding to the same set.
address = set()

In [161]:
from sklearn.preprocessing import MinMaxScaler
# Scaler (default settings) fitted on the feature columns in the clustering
# cell below, before the cosine-similarity computation.
m = MinMaxScaler()

In [207]:
import json

# Scale every column except the first one ('voter', given the column order
# built above) with the min-max scaler.
feature_matrix = m.fit_transform(
    cultivate_data_citizen[cultivate_data_citizen.columns[1:]].to_numpy())

# `data` keeps the voter id next to its scaled features. Mixing strings and
# floats makes it an object array, so the similarity is computed on the
# numeric matrix directly rather than on a slice of `data`.
voter = cultivate_data_citizen['voter'].values.reshape(-1, 1)
data = np.hstack((voter, feature_matrix))

similarity_matrix = cosine_similarity(feature_matrix)

# Rows whose cosine similarity reaches this value are grouped together.
threshold = 0.9999999

# Each entry is one group: the full row tuples of all mutually-listed rows.
similar_rows = []
# Index tuples of the groups already recorded, for O(1) duplicate detection.
seen_index_groups = set()

for i in range(len(similarity_matrix)):
    similar_row_indices = np.where(similarity_matrix[i] >= threshold)[0]
    # A row always matches itself; only groups with a second member count.
    if len(similar_row_indices) > 1:
        # np.where yields ascending indices, so the tuple is a canonical key.
        group_key = tuple(similar_row_indices.tolist())
        if group_key not in seen_index_groups:
            seen_index_groups.add(group_key)
            similar_rows.append([tuple(cultivate_data_citizen.iloc[j])
                                 for j in similar_row_indices])

# Reduce every group to its voter addresses (element 0 of each row tuple) and
# remember them in `address`. Groups are built per row, so one voter can
# appear in more than one group.
similar_rows_json = {}
for i, row_group in enumerate(similar_rows):
    cluster_group = [row[0] for row in row_group]
    address.update(cluster_group)
    similar_rows_json[f"Cluster Group {i}"] = cluster_group

# Serialize the clusters to a JSON file
with open('sybil_clusters.json', 'w', encoding='utf-8') as file:
    json.dump(similar_rows_json, file, indent=1)


In [175]:
from datasets import load_dataset
# Reference labels: load the published feature dataset and keep the addresses
# of the rows whose 'flagged' value equals True.
path = 'Poupou/citizen-round-features'
ds = load_dataset(path=path)
df = ds['train'].to_pandas()
flagged = df.loc[df['flagged'] == True, 'address']

Found cached dataset csv (C:/Users/user/.cache/huggingface/datasets/Poupou___csv/Poupou--citizen-round-features-d56265baa6629651/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [203]:
# Count the flagged addresses that also ended up in a similarity cluster.
# Hex addresses may be written checksummed (mixed case) or lowercase, so both
# sides are lower-cased before comparing; otherwise the same wallet spelled
# two ways would not be counted.
clustered_lower = {str(item).lower() for item in address}
flagged_lower = {str(item).lower() for item in flagged.dropna().unique()}
in_ = len(flagged_lower & clustered_lower)

in_