# Part I: Data Query

In [4]:
#@title Mount Drive
# Attach Google Drive to the Colab runtime so the project folder under
# /content/drive can be read and written by the cells below.
from google.colab import drive

drive.mount(mountpoint='/content/drive')
print('Authenticated')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Authenticated


In [5]:
cd /content/drive/MyDrive/Network Study/2023 Sep Revise/Data/

/content/drive/MyDrive/Network Study/2023 Sep Revise/Data


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path below Kaggle's input directory, then print one path per line.
kaggle_inputs = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in kaggle_inputs:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Aave transfer data query (from Google BigQuery), using Kaggle's public-dataset BigQuery integration.

In [None]:
from google.cloud import bigquery
import numpy as np
import pandas as pd
import os
import time
import zipfile

from tqdm import tqdm
from google.cloud import bigquery

# Create a "Client" object. No project or credentials are passed explicitly,
# so the library resolves them from the runtime environment.
client = bigquery.Client()

# Reference the public "crypto_ethereum" dataset (https://www.kaggle.com/bigquery/ethereum-blockchain).
# bigquery.DatasetReference(project, dataset_id) replaces the deprecated Client.dataset() helper.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "crypto_ethereum")

# API request - fetch the dataset.
# To run this outside Kaggle, authenticate the client with your own Google Cloud credentials;
# the dataset reference itself does not need to change.
dataset = client.get_dataset(dataset_ref)

# List all the tables in the "crypto_ethereum" dataset
tables = list(client.list_tables(dataset))

In [None]:
def query_to_csv(sql, output_path):
    """Run `sql` with the notebook-level BigQuery `client` and append the rows to a gzip CSV.

    Args:
        sql: Query text handed to `client.query`.
        output_path: Destination file. It is opened in append mode, and the header
            row is written only when the file does not exist yet, so repeated calls
            accumulate rows under a single header.
    """
    result = client.query(sql).to_dataframe(progress_bar_type='tqdm_notebook')
    # Decide about the header after the query has finished, right before writing.
    write_header = not os.path.exists(output_path)
    result.to_csv(
        output_path,
        mode='a',
        index=False,
        header=write_header,
        compression='gzip',
    )

In [None]:
# Pull every transfer row for one token contract address.
# The CAST expression has no alias; the cleaning cell renames the resulting `f0_` column to `value`.
sql = '''
SELECT token_address, from_address, to_address,block_timestamp, cast(value AS NUMERIC) FROM
`bigquery-public-data.crypto_ethereum.token_transfers`
WHERE token_address = "0x7fc66500c84a76ad7e9c93437bfc5ac33e2ddae9"
'''
transfer_job = client.query(sql)
df = transfer_job.to_dataframe(progress_bar_type='tqdm_notebook')

In [None]:
# data cleaning
# Rename the un-aliased CAST column and drop incomplete rows. The explicit copy keeps the
# column assignments below from writing into a view (avoids SettingWithCopyWarning), and
# skipping `inplace=True` leaves the query result untouched if this cell is re-run.
df = df.rename(columns={'f0_': 'value'}).dropna().copy()

# One vectorized cast instead of a Python-level float() call per row.
df['value'] = df['value'].astype(float)

# Reduce the block timestamp to its calendar day: strip any timezone (keeping the
# wall-clock time, exactly like the previous str(x)[:10] slicing) and truncate to midnight.
block_time = pd.to_datetime(df['block_timestamp'])
if block_time.dt.tz is not None:
    block_time = block_time.dt.tz_localize(None)
df['timestamp'] = block_time.dt.normalize()

# Both bounds are strict, so the kept window is 2020-10-10 .. 2023-07-29 inclusive.
in_window = (df['timestamp'] > '2020-10-09') & (df['timestamp'] < '2023-07-30')
df = df[in_window]
df.head()

#### Output Raw Token Transfer Data

In [None]:
# Save the cleaned, not-yet-aggregated transfers (the DataFrame index is written as the first column).
df.to_csv(path_or_buf='Aave Raw Transfer Data.csv')

In [None]:
# The query filters on a single token_address and the day is already held in `timestamp`,
# so neither of these two columns is needed for the aggregation that follows.
df = df.drop(['token_address', 'block_timestamp'], axis=1)

In [None]:
## Treat transfers as undirected: sort each (from, to) pair lexicographically so that
## A->B and B->A fall under the same key.
df[['from_address', 'to_address']] = np.sort(df[['from_address', 'to_address']].to_numpy(), axis=1)

## Add the values exchanged between the same two addresses on the same day.
## The built-in groupby sum is vectorized (the previous per-group `lambda x: sum(x)` ran in
## Python) and does not depend on which `sum` happens to be bound in the kernel namespace.
df = df.groupby(['timestamp', 'from_address', 'to_address'], as_index=False)['value'].sum()
df.head()

In [None]:
# Save the per-day, per-address-pair totals; Part II reloads this file by name.
df.to_csv(path_or_buf='AAVE transaction data_after preprocessing.csv')

# Part II: Network Analysis

In [6]:
pwd

'/content/drive/MyDrive/Network Study/2023 Sep Revise/Data'

In [7]:
ls

'AAVE Merged Datasets.csv'                        core_date_cnt_type.csv
 AAVE_Network_Features.csv                        cp_test_results.csv
'Aave Raw Transfer Data.csv'                      Readme.md
'AAVE transaction data_after preprocessing.csv'   significant_test.csv
 core_addresses_appearance.csv


In [8]:
import pandas as pd

# Reload the aggregated transfer table written at the end of Part I.
df = pd.read_csv(filepath_or_buffer='AAVE transaction data_after preprocessing.csv')

In [9]:
# One row per distinct `timestamp` value, in group-key order. The nunique aggregate only
# serves to collapse each day to a single row and is discarded immediately.
df_time_partition = (
    df.groupby(['timestamp'])['to_address']
    .agg(['nunique'])
    .reset_index()
    .drop(columns=['nunique'])
)

#### a. Number of daily edges and nodes

In [24]:
# Daily network size
import networkx as nx

num_nodes = []
num_edges = []
for day in df_time_partition['timestamp']:
    # Rows recorded for this day
    day_edges = df[df['timestamp'] == day]

    # Undirected simple graph (nx.Graph); the aggregated amount is kept as the 'value' edge attribute
    day_graph = nx.from_pandas_edgelist(
        day_edges,
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )

    # Record how many addresses and how many address pairs were active
    num_nodes.append(day_graph.number_of_nodes())
    num_edges.append(day_graph.number_of_edges())

In [25]:
# Start the per-day feature table; rows follow the order of df_time_partition.
Network_Features = pd.DataFrame({"num_nodes": num_nodes, "num_edges": num_edges})
Network_Features['time'] = df_time_partition['timestamp']

#### b. Degree

In [26]:
import networkx as nx

Degreemean = []
Degreestd = []
for day in df_time_partition['timestamp']:
    # Rows recorded for this day
    day_edges = df[df['timestamp'] == day]

    # Undirected simple graph with 'value' stored on each edge
    day_graph = nx.from_pandas_edgelist(
        day_edges,
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )

    # Degree of every node, then its mean and sample standard deviation
    node_degrees = pd.Series(list(dict(day_graph.degree()).values()), name="Degree")
    Degreemean.append(node_degrees.mean())
    Degreestd.append(node_degrees.std())

In [27]:
# Attach the daily degree statistics; each list has one entry per day.
for column, values in (('Degree mean', Degreemean), ('Degree std', Degreestd)):
    Network_Features[column] = values

#### c. Top 10 addresses degree ratio

In [28]:
import networkx as nx

TOP_N = 10  # how many of the busiest addresses are examined per day

top10Degreemean = []
top10Degreestd = []

for day in df_time_partition['timestamp']:
    df_1 = df.loc[df['timestamp'] == day]

    # Number of rows in which each address appears on either side of the pair.
    sender_mdegree = (
        df_1.groupby('from_address')['to_address'].count()
        .rename_axis('address')
        .reset_index(name='degree')
    )
    receiver_mdegree = (
        df_1.groupby('to_address')['from_address'].count()
        .rename_axis('address')
        .reset_index(name='degree')
    )

    # Outer merge so addresses seen on only one side are kept; a missing side counts as 0.
    merge = pd.merge(sender_mdegree, receiver_mdegree, on="address", how="outer").fillna(0)
    merge['degree'] = merge['degree_x'] + merge['degree_y']
    merge = merge.sort_values(by='degree', ascending=False).reset_index(drop=True)

    # head() returns fewer than TOP_N addresses on days with a small network. The previous
    # fixed `range(0, 10)` index loop raised IndexError on such days.
    top_addresses = merge['address'].head(TOP_N).tolist()

    # Every row that touches at least one of the selected addresses.
    touches_top = df_1['from_address'].isin(top_addresses) | df_1['to_address'].isin(top_addresses)
    topaddress = df_1[touches_top]

    G = nx.from_pandas_edgelist(topaddress, 'from_address', 'to_address', 'value', nx.Graph())

    # Graph degree of each selected address, then mean and sample std over them.
    top_degrees = pd.Series([G.degree(address) for address in top_addresses], dtype=float)
    top10Degreemean.append(top_degrees.mean())
    top10Degreestd.append(top_degrees.std())

In [29]:
# Attach the top-10 degree statistics; each list has one entry per day.
for column, values in (('Top10Degree mean', top10Degreemean), ('Top10Degree std', top10Degreestd)):
    Network_Features[column] = values

In [30]:
# Mean degree of the ten busiest addresses relative to the mean degree of all nodes that day.
Network_Features['Top10 Degree mean ratio'] = Network_Features['Top10Degree mean'].div(
    Network_Features['Degree mean']
)

#### d. Degree centrality

In [31]:
import networkx as nx

DCmean = []
DCstd = []
for day in df_time_partition['timestamp']:
    # Rows recorded for this day
    day_edges = df[df['timestamp'] == day]

    # Undirected simple graph with 'value' stored on each edge
    day_graph = nx.from_pandas_edgelist(
        day_edges,
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )

    # Degree centrality per node, summarised by mean and sample standard deviation
    centrality = pd.Series(nx.degree_centrality(day_graph), name='Degree_Centrality')
    DCmean.append(centrality.mean())
    DCstd.append(centrality.std())

#### e. Clustering coefficient

In [32]:
clustermean = []
clusterstd = []
for day in df_time_partition['timestamp']:
    # Rows recorded for this day
    day_edges = df[df['timestamp'] == day]

    # Undirected simple graph; nx.clustering is called without a weight argument
    day_graph = nx.from_pandas_edgelist(
        day_edges,
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )

    # Local clustering coefficient per node, summarised by mean and sample standard deviation
    coefficients = pd.Series(nx.clustering(day_graph), name='Clustering_Coefficient')
    clustermean.append(coefficients.mean())
    clusterstd.append(coefficients.std())

#### f. Modularity

In [11]:
!pip install python-louvain



In [10]:
import community.community_louvain as cl

In [None]:
import networkx as nx
import pandas as pd
import community
import community.community_louvain as cl  # imported here too so this cell does not rely on another cell

# Louvain community detection is randomised. A fixed seed makes the partition, and the
# modularity derived from it, reproducible across runs.
LOUVAIN_SEED = 42

mod_list = []
for day in df_time_partition['timestamp']:

    # Data Partition
    df_1 = df.loc[df['timestamp'] == day]

    # Undirected simple graph; the transfer amount is stored under the 'value' edge attribute.
    G = nx.from_pandas_edgelist(df_1, 'from_address', 'to_address', 'value', nx.Graph())

    # Calculation of modularity
    # NOTE(review): no `weight` argument is passed, so the library default applies and the
    # 'value' attribute is not named explicitly - confirm an unweighted partition is intended.
    part = cl.best_partition(G, random_state=LOUVAIN_SEED)
    mod = cl.modularity(part, G)
    mod_list.append(mod)

#### g. Transitivity

In [41]:
tran_list = []
for day in df_time_partition['timestamp']:
    # Rows recorded for this day
    day_edges = df[df['timestamp'] == day]

    # Undirected simple graph with 'value' stored on each edge
    day_graph = nx.from_pandas_edgelist(
        day_edges,
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )

    # Transitivity of the whole daily graph
    tran_list.append(nx.transitivity(day_graph))

#### h. Eigenvector Centrality

In [None]:
eigmean = []
eigstd = []
for day in df_time_partition['timestamp']:
    # Rows recorded for this day
    day_edges = df[df['timestamp'] == day]

    # Undirected simple graph with 'value' stored on each edge
    day_graph = nx.from_pandas_edgelist(
        day_edges,
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )

    # Eigenvector centrality per node (iteration cap raised to 20000),
    # summarised by mean and sample standard deviation
    scores = pd.Series(
        nx.eigenvector_centrality(day_graph, max_iter=20000),
        name='eigenvector_centrality',
    )
    eigmean.append(scores.mean())
    eigstd.append(scores.std())

#### i. Closeness Centrality

In [None]:
import networkx as nx

CCmean = []
CCstd = []
for day in df_time_partition['timestamp']:
    # Rows recorded for this day
    day_edges = df[df['timestamp'] == day]

    # Undirected simple graph with 'value' stored on each edge
    day_graph = nx.from_pandas_edgelist(
        day_edges,
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )

    # Closeness centrality per node, summarised by mean and sample standard deviation
    closeness = pd.Series(nx.closeness_centrality(day_graph), name='Closeness_Centrality')
    CCmean.append(closeness.mean())
    CCstd.append(closeness.std())

#### j. Number of components

In [None]:
import networkx as nx

# Number of connected components of each day's undirected graph.
components_cnt = [
    nx.number_connected_components(
        nx.from_pandas_edgelist(
            df[df['timestamp'] == day],
            source='from_address',
            target='to_address',
            edge_attr='value',
            create_using=nx.Graph(),
        )
    )
    for day in df_time_partition['timestamp']
]

#### k. Size of giant component / number of nodes

In [None]:
import networkx as nx

giant_com_ratio = []
for day in df_time_partition['timestamp']:
    day_graph = nx.from_pandas_edgelist(
        df[df['timestamp'] == day],
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )
    # Largest connected component by node count, as a share of all nodes seen that day
    largest_component = max(nx.connected_components(day_graph), key=len)
    giant_com_ratio.append(len(largest_component) / day_graph.number_of_nodes())

In [None]:
# Remaining per-day metrics, added in this column order.
daily_metrics = {
    'DCmean': DCmean,
    'DCstd': DCstd,
    'clustermean': clustermean,
    'clusterstd': clusterstd,
    'modularity': mod_list,
    'transitivity': tran_list,
    'eig_mean': eigmean,
    'eig_std': eigstd,
    'closenessmean': CCmean,
    'closenessstd': CCstd,
    'Components_cnt': components_cnt,
    'giant_com_ratio': giant_com_ratio,
}
for column, values in daily_metrics.items():
    Network_Features[column] = values

# Constant label identifying the token on every row.
Network_Features['token'] = 'AAVE'

In [None]:
# Preview the first five rows of the assembled feature table.
Network_Features.head(n=5)

#### Output network features dataset

In [None]:
# Write the feature table (its index included); Part III reloads this file by name.
Network_Features.to_csv(path_or_buf='AAVE_Network_Features.csv')

# Part III: Core-periphery Analysis

In [None]:
pip install cpnet

In [None]:
import cpnet
import networkx as nx
import matplotlib.pyplot as plt

#### Basic structure significance test

In [None]:
# Build the undirected graph for the day at position 135 of df_time_partition.
df_1 = df[df['timestamp'] == df_time_partition['timestamp'][135]]
G = nx.from_pandas_edgelist(
    df_1,
    source='from_address',
    target='to_address',
    edge_attr='value',
    create_using=nx.Graph(),
)

# Run cpnet's BE detector and read back the pair id and coreness of every node.
alg = cpnet.BE()
alg.detect(G)
c, x = alg.get_pair_id(), alg.get_coreness()

# Significance test of the detected structure: 100 randomised networks, 5% level, 16 threads.
sig_c, sig_x, significant, p_values = cpnet.qstest(
    c,
    x,
    G,
    alg,
    num_of_thread=16,
    num_of_rand_net=100,
    significance_level=0.05,
)

In [None]:
# Test outcome first, then the p-values, one per line.
print(significant, p_values, sep='\n')

#### Continuous structure significance test

In [None]:
# Build the undirected graph for the day at position 90 of df_time_partition.
df_1 = df.loc[df['timestamp']==df_time_partition['timestamp'][90]]
G = nx.from_pandas_edgelist(df_1, 'from_address', 'to_address', 'value', nx.Graph())

# Continuous core-periphery detection with cpnet.MINRES.
alg = cpnet.MINRES()
alg.detect(G)
# Bug fix: read the pair ids from THIS detection run. Previously `c` was never assigned in
# this cell, so qstest received whatever an earlier cell had left in the kernel (pair ids
# computed for a different day's graph), or failed with NameError on a fresh kernel.
c = alg.get_pair_id()
x = alg.get_coreness()

# Significance test of the detected structure: 100 randomised networks, 5% level, 16 threads.
sig_c, sig_x, significant, p_values = cpnet.qstest(
    c, x, G, alg, significance_level=0.05, num_of_rand_net=100, num_of_thread=16)

In [None]:
# Test outcome first, then the p-values, one per line.
print(significant, p_values, sep='\n')

#### Core-periphery Network Graph

In [None]:
# Draw the graph `G` with the significant pair ids / coreness (`sig_c`, `sig_x`) left in the
# kernel by the preceding significance-test cell.
# NOTE(review): this spiral layout is not passed to cpnet.draw below, and `pos` is
# reassigned from that call's return value - confirm whether it was meant to be supplied.
pos = nx.spiral_layout(G,scale = 3)
# Create a 14x12 inch figure, then fetch the current axes to hand to cpnet.draw.
fig = plt.figure(figsize=(14, 12))
ax = plt.gca()
# Node marker size and outline width forwarded to the node-drawing step.
draw_nodes_kwd = {"node_size": 80, "linewidths": 0.8}
# layout_kwd is forwarded to the layout routine used by cpnet.draw.
ax, pos = cpnet.draw(G, sig_c, sig_x, ax,draw_nodes_kwd=draw_nodes_kwd,
                     layout_kwd = {"verbose":True, "iterations":500})

#### Output core addresses and corresponding date counts

In [None]:
core_address = []
a = 0
for a, day in enumerate(df_time_partition['timestamp'], start=1):
    day_graph = nx.from_pandas_edgelist(
        df[df['timestamp'] == day],
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )

    # Core-periphery detection for this day
    detector = cpnet.BE()
    detector.detect(day_graph)
    pair_ids = detector.get_pair_id()
    coreness = detector.get_coreness()

    # Nodes whose coreness equals 1 are that day's core members
    core_address.extend([node for node, score in coreness.items() if score == 1])

    # Progress: number of days processed so far
    print(a)

In [None]:
# Count how many times each address appears in the collected core list (one entry per day
# on which it was a core member), most frequent first.
cores = pd.DataFrame(core_address)
appearances = cores[0].value_counts()  # descending order is the default
core_cnt = appearances.reset_index()
core_cnt

In [None]:
# Save the per-address core-day counts (index included).
core_cnt.to_csv(path_or_buf='core_date_cnt.csv')

#### Number of core members each day

In [None]:
# From here on `core_cnt` is a plain list with one core-member count per day.
core_cnt = []
for day in df_time_partition['timestamp']:
    day_graph = nx.from_pandas_edgelist(
        df[df['timestamp'] == day],
        source='from_address',
        target='to_address',
        edge_attr='value',
        create_using=nx.Graph(),
    )

    # Core-periphery detection for this day
    detector = cpnet.BE()
    detector.detect(day_graph)
    pair_ids = detector.get_pair_id()
    coreness = detector.get_coreness()

    # Count the nodes whose coreness equals 1
    core_members = [node for node, score in coreness.items() if score == 1]
    core_cnt.append(len(core_members))

#### Average number of neighbors of cores

In [None]:
# Explicit import instead of `from numpy import *`, which rebinds builtins such as
# sum/min/max/any/all for every cell executed afterwards.
import numpy as np

avg_core_neighbor = []

for day in df_time_partition['timestamp']:
    # Bug fix: the partition was previously indexed with a constant [0], so the first day's
    # network was analysed on every iteration and all days received the same value.
    df_1 = df.loc[df['timestamp'] == day]
    G = nx.from_pandas_edgelist(df_1, 'from_address', 'to_address', 'value', nx.Graph())

    # Core-periphery detection for this day
    alg = cpnet.BE()
    alg.detect(G)
    x = alg.get_coreness()

    # Nodes whose coreness equals 1 are the core members
    core = [node for node, coreness in x.items() if coreness == 1]

    # Degree (number of neighbours in the daily graph) of each core member. A comprehension
    # replaces the inner loop that reused the outer loop variable `i`.
    neighbor_cnt = [G.degree(node) for node in core]

    # A day without any core member yields NaN rather than a mean-of-empty warning.
    neighbor_cnt_mean = np.mean(neighbor_cnt) if neighbor_cnt else np.nan
    avg_core_neighbor.append(neighbor_cnt_mean)

#### Update Network Features dataset

In [None]:
# The file was written with its index as the first column, so read that column back as the
# index (index_col=0). Without it, every save/load cycle adds another 'Unnamed: 0' column.
Network_Features = pd.read_csv('AAVE_Network_Features.csv', index_col=0)

# Per-day core statistics from Part III; each list has one entry per day.
Network_Features['core_cnt'] = core_cnt
Network_Features['core_ratio'] = Network_Features['core_cnt'] / Network_Features['num_nodes']
Network_Features['avg_core_neighbor'] = avg_core_neighbor

In [None]:
# Deal with the outlier: overwrite the core statistics of row label 230 with replacement values.
# NOTE(review): the original note says these are the "average of past 5 days", but the numbers
# are hard-coded - confirm they still match the current data before relying on them.
OUTLIER_ROW = 230

# Single-step .loc assignment instead of chained indexing, which may write to a temporary copy.
Network_Features.loc[OUTLIER_ROW, 'core_cnt'] = 2
# Bug fix: `df_AAVE` is not defined anywhere in this notebook (NameError); the node count
# comes from the same feature table.
Network_Features.loc[OUTLIER_ROW, 'core_ratio'] = 2 / Network_Features.loc[OUTLIER_ROW, 'num_nodes']
Network_Features.loc[OUTLIER_ROW, 'avg_core_neighbor'] = 217.8

In [None]:
# Overwrite the feature file with the core-periphery columns added (index included).
Network_Features.to_csv(path_or_buf='AAVE_Network_Features.csv')