Label 1 -> illicit
Label 2 -> licit

In [110]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import scipy as sp
import pickle as pkl

import community as louvain
from networkx.algorithms.community import girvan_newman

In [35]:
df_classes = pd.read_csv('elliptic_bitcoin_dataset/elliptic_txs_classes.csv')
df_edges = pd.read_csv('elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')
df_features = pd.read_csv('elliptic_bitcoin_dataset/elliptic_txs_features.csv', header=None)

In [101]:
df_classes

Unnamed: 0,txId,class
0,230425980,3
1,5530458,3
2,232022460,3
3,232438397,2
4,230460314,3
...,...,...
203764,173077460,3
203765,158577750,3
203766,158375402,1
203767,158654197,3


In [100]:
df_edges

Unnamed: 0,txId1,txId2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206
...,...,...
234350,158365409,157930723
234351,188708874,188708879
234352,157659064,157659046
234353,87414554,106877725


In [36]:
df_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.293750,0.178136,0.179117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203764,173077460,49,-0.145771,-0.163752,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.135803,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203765,158577750,49,-0.165920,-0.123607,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.156418,...,0.162722,0.010822,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984
203766,158375402,49,-0.172014,-0.078182,1.018602,0.028105,-0.043875,0.054722,-0.061584,-0.163626,...,1.261246,1.985050,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203767,158654197,49,-0.172842,-0.176622,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.163501,...,-0.397749,-0.411776,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399


In [37]:
# Change column names of df_features
colNames1 = {'0': 'txId', 1: "Time step"}
colNames2 = {str(ii+2): "Local_feature_" + str(ii+1) for ii in range(93)}
colNames3 = {str(ii+95): "Aggregate_feature_" + str(ii+1) for ii in range(72)}

colNames = dict(colNames1, **colNames2, **colNames3 )
colNames = {int(jj): item_kk for jj,item_kk in colNames.items()}

df_features = df_features.rename(columns=colNames)
df_features

Unnamed: 0,txId,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,...,Aggregate_feature_63,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.293750,0.178136,0.179117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203764,173077460,49,-0.145771,-0.163752,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.135803,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203765,158577750,49,-0.165920,-0.123607,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.156418,...,0.162722,0.010822,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984
203766,158375402,49,-0.172014,-0.078182,1.018602,0.028105,-0.043875,0.054722,-0.061584,-0.163626,...,1.261246,1.985050,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203767,158654197,49,-0.172842,-0.176622,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.163501,...,-0.397749,-0.411776,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399


In [38]:
# Pass unknown to number 3
df_classes.loc[df_classes['class'] == 'unknown', 'class'] = 3
print('Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.\n')
print('Shape of classes', df_classes.shape)
print('Shape of edges', df_edges.shape)
print('Shape of features', df_features.shape)

Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.

Shape of classes (203769, 2)
Shape of edges (234355, 2)
Shape of features (203769, 167)


In [39]:
df_classes.groupby(['class']).count()

Unnamed: 0_level_0,txId
class,Unnamed: 1_level_1
3,157205
1,4545
2,42019


In [40]:
print(df_features.shape)
print(df_classes.shape)

(203769, 167)
(203769, 2)


In [41]:
# Merge the DataFrames on the column 'source', assuming it's the same name in both DataFrames
df_merged = pd.merge(df_features, df_classes, on='txId', how='left')
df_merged

Unnamed: 0,txId,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,...,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72,class
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.600999,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,3
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162112,...,0.673103,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,3
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162749,...,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,3
3,232438397,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.293750,0.178136,0.179117,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203764,173077460,49,-0.145771,-0.163752,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.135803,...,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,3
203765,158577750,49,-0.165920,-0.123607,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.156418,...,0.010822,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984,3
203766,158375402,49,-0.172014,-0.078182,1.018602,0.028105,-0.043875,0.054722,-0.061584,-0.163626,...,1.985050,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,1
203767,158654197,49,-0.172842,-0.176622,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.163501,...,-0.411776,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399,3


In [67]:
sub_df_features = df_merged.query(" `Time step` == 29")
sub_df_features

Unnamed: 0,txId,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,...,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72,class
116529,10278231,29,0.268456,0.127770,1.573595,-0.121970,0.611176,-0.113002,-0.061584,0.288020,...,0.807542,0.190379,0.401797,-0.098889,0.335448,0.193816,-0.008915,-0.562507,-0.795034,3
116530,163809013,29,-0.160498,4.081258,2.128587,5.956069,-0.063725,6.260528,-0.061584,-0.163646,...,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,2
116531,102614111,29,-0.172910,-0.161607,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.163571,...,-0.493772,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399,3
116532,168218829,29,2.206400,0.347772,1.573595,1.078631,-0.043875,-0.113002,4.807154,-0.163635,...,-0.613614,0.241128,0.241406,0.018279,0.104754,0.333655,0.333211,-1.760926,-1.760984,3
116533,163670715,29,-0.167215,-0.107012,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.157742,...,-0.380239,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120799,165837468,29,0.691417,-0.081127,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,0.720766,...,-0.588384,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,3
120800,163707645,29,-0.170553,-0.129773,1.018602,-0.046932,-0.043875,-0.029140,-0.061584,-0.163643,...,2.918551,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984,3
120801,163832324,29,-0.170382,-0.158783,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.160984,...,-0.367624,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,3
120802,163803832,29,0.440243,-0.055242,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,0.463779,...,-0.014407,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,2


In [102]:
sub_df_edges = pd.DataFrame(columns=df_edges.columns)
row_index = 0

for _, row in sub_df_features.iterrows():
    id = row['txId']
    all_edges = df_edges[df_edges['txId1'] == id]
    for _, row2 in all_edges.iterrows():
        l_values = [row2['txId1'], row2['txId2']]
        sub_df_edges.loc[len(sub_df_edges)] = l_values
        row_index+=1

In [103]:
sub_df_edges

Unnamed: 0,txId1,txId2
0,10278231,163809013
1,10278231,163711539
2,10278231,163705606
3,10278231,163826643
4,10278231,163823470
...,...,...
4536,163656789,163656773
4537,165837468,165837473
4538,163707645,163879834
4539,163832324,163832295


In [104]:
generate_graph = True

if generate_graph:
    # Create an empty graph
    subgraph_29 = nx.Graph()

    for _, row in sub_df_features.iterrows():
        # Extract node ID and attributes
        node_id = row['txId']
        node_attributes = row.drop('txId').to_dict()
        
        # Add node to the graph with its attributes
        subgraph_29.add_node(node_id, **node_attributes)

    # Add edges to the graph
    for _, row in sub_df_edges.iterrows():
        subgraph_29.add_edge(row['txId1'], row['txId2'])

In [111]:
# Save the graph as a pickle file
with open("elipticData_subgraph29.pkl", "wb") as f:
    pkl.dump(subgraph_29, f)


# Specify the path to your pickle file
pickle_file_path = 'C:/Users/Usuario/Desktop/UAB/Tercer/2n cuatri/Synthesis project/GraphAnomaly/elipticData_subgraph29.pkl'

# Open the pickle file in binary mode
with open(pickle_file_path, 'rb') as f:
    # Load the data from the pickle file
    subgraph_29 = pkl.load(f)

In [114]:
# Get the number of nodes
num_nodes = nx.number_of_nodes(subgraph_29)

# Get the number of edges
num_edges = nx.number_of_edges(subgraph_29)

print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

Number of nodes: 4275
Number of edges: 4541


In [107]:
def detect_communities(g, method):
    if isinstance(g, nx.DiGraph):
      g = nx.Graph(g)


    if method == 'girvan-newman':
      communities_generator = girvan_newman(g)
      print(communities_generator)
      communities = []
      for community in communities_generator:
        print(community)
        communities.append([list(c) for c in community])

      total_modularity = 0
      for i in range(len(communities)):
        modularity = nx.algorithms.community.modularity(g, communities[i])
        total_modularity += modularity

      total_modularity /= len(communities)
      return communities, total_modularity

    elif method == 'louvain':
      louvain_community = louvain.best_partition(g)
      communities = [[] for _ in range(max(louvain_community.values()) + 1)]
      for node, community_id in louvain_community.items():
          communities[community_id].append(node)

      modularity = nx.algorithms.community.modularity(g, communities)
      return communities, modularity

    else:
      print('Entered an invalid method')
      return None

In [115]:
communities_girvan, modularities = detect_communities(subgraph_29, method='girvan-newman')

<generator object girvan_newman at 0x000001AC8481C6D0>
