Label 1 -> illicit
Label 2 -> licit

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import scipy as sp

import community as louvain
from networkx.algorithms.community import girvan_newman

In [30]:
classes = pd.read_csv('elliptic_bitcoin_dataset/elliptic_txs_classes.csv')
edges = pd.read_csv('elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')
features = pd.read_csv('elliptic_bitcoin_dataset/elliptic_txs_features.csv')

In [3]:
classes.head()

Unnamed: 0,txId,class
0,230425980,unknown
1,5530458,unknown
2,232022460,unknown
3,232438397,2
4,230460314,unknown


In [4]:
edges.head()

Unnamed: 0,txId1,txId2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206


In [5]:
features.head()

Unnamed: 0,230425980,1,-0.1714692896288031,-0.18466755143291433,-1.2013688016765636,-0.12196959975910057,-0.04387454791734898,-0.11300200928476244,-0.06158379407303222,-0.16209679981659642,...,-0.5621534802884299,-0.6009988905192808,1.4613303209554889,1.4613689382001922,0.01827940003744589,-0.0874901561101501,-0.13115530389558736,-0.09752359377152515,-0.12061340670311574,-0.11979245961251665
0,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
2,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
3,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117
4,230459870,1,0.96104,-0.081127,-1.201369,1.303743,0.333276,1.480381,-0.061584,-0.163577,...,-0.504702,-0.422589,-0.22679,-0.117629,0.018279,0.277775,0.413931,1.149556,-0.696053,-0.69554


In [45]:
# Change column names of df_features
colNames1 = {'0': 'txId', 1: "Time step"}
colNames2 = {str(ii+2): "Local_feature_" + str(ii+1) for ii in range(93)}
colNames3 = {str(ii+95): "Aggregate_feature_" + str(ii+1) for ii in range(72)}

colNames = dict(colNames1, **colNames2, **colNames3 )
colNames = {int(jj): item_kk for jj,item_kk in colNames.items()}

features = features.rename(columns=colNames)
features

Unnamed: 0,230425980,1,-0.1714692896288031,-0.18466755143291433,-1.2013688016765636,-0.12196959975910057,-0.04387454791734898,-0.11300200928476244,-0.06158379407303222,-0.16209679981659642,...,-0.5621534802884299,-0.6009988905192808,1.4613303209554889,1.4613689382001922,0.01827940003744589,-0.0874901561101501,-0.13115530389558736,-0.09752359377152515,-0.12061340670311574,-0.11979245961251665
0,5530458,1,-0.171484,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
1,232022460,1,-0.172107,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
2,232438397,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
3,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.293750,0.178136,0.179117
4,230459870,1,0.961040,-0.081127,-1.201369,1.303743,0.333276,1.480381,-0.061584,-0.163577,...,-0.504702,-0.422589,-0.226790,-0.117629,0.018279,0.277775,0.413931,1.149556,-0.696053,-0.695540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203763,173077460,49,-0.145771,-0.163752,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.135803,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203764,158577750,49,-0.165920,-0.123607,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.156418,...,0.162722,0.010822,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984
203765,158375402,49,-0.172014,-0.078182,1.018602,0.028105,-0.043875,0.054722,-0.061584,-0.163626,...,1.261246,1.985050,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203766,158654197,49,-0.172842,-0.176622,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.163501,...,-0.397749,-0.411776,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399


In [40]:
# Pass unknown to number 3
classes.loc[classes['class'] == 'unknown', 'class'] = 3
print('Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.\n')
print('Shape of classes', classes.shape)
print('Shape of edges', classes.shape)
print('Shape of features', features.shape)

Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.

Shape of classes (203769, 2)
Shape of edges (203769, 2)
Shape of features (203768, 167)


In [41]:
classes.groupby(['class']).count()

Unnamed: 0_level_0,txId
class,Unnamed: 1_level_1
3,157205
1,4545
2,42019


In [42]:
features.shape
classes.shape

(203769, 2)

In [44]:
# Merge the DataFrames on the column 'source', assuming it's the same name in both DataFrames
df_merged = pd.merge(features, classes, on='txId', how='left')
df_merged.head()

KeyError: 'txId'

In [29]:
generate_graph = True
if generate_graph:
    # Create an empty graph
    G = nx.Graph()

    for _, row in df_merged.iterrows():
        # Extract node ID and attributes
        node_id = row['txId']
        node_attributes = row.drop('txId').to_dict()
        
        # Add node to the graph with its attributes
        G.add_node(node_id, **node_attributes)

    # Add edges to the graph
    for _, row in edges.iterrows():
        G.add_edge(row['txId1'], row['txId2'])

NameError: name 'df_merged' is not defined

In [18]:
def detect_communities(g, method):
    if isinstance(g, nx.DiGraph):
      g = nx.Graph(g)


    if method == 'girvan-newman':
      communities_generator = girvan_newman(g)
      print(communities_generator)
      communities = []
      for community in communities_generator:
        print(community)
        communities.append([list(c) for c in community])

      total_modularity = 0
      for i in range(len(communities)):
        modularity = nx.algorithms.community.modularity(g, communities[i])
        total_modularity += modularity

      total_modularity /= len(communities)
      return communities, total_modularity

    elif method == 'louvain':
      louvain = louvain.best_partition(g)
      communities = [[] for _ in range(max(louvain.values()) + 1)]
      for node, community_id in louvain.items():
          communities[community_id].append(node)

      modularity = nx.algorithms.community.modularity(g, communities)
      return communities, modularity

    else:
      print('Entered an invalid method')
      return None

In [19]:
communities_girvan, modularities = detect_communities(graph, method='girvan-newman')

<generator object girvan_newman at 0x000001E8D95255D0>


KeyboardInterrupt: 