# Graph Anomaly Detection


### Processing and analyzing training data

## Load data

In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pickle as pkl

In [2]:
# Read files
path = "C:/Users/gsamp/OneDrive/Documents/AI-3/2n Semestre/Projecte de Síntesi 2/GraphAnomaly/dades_guillem/"
df_classes = pd.read_csv(path + "elliptic_txs_classes.csv") # Nodes' labels
df_edges = pd.read_csv(path + "elliptic_txs_edgelist.csv") # Edges
df_features = pd.read_csv(path + "elliptic_txs_features.csv", header=None) # Nodes' features

In [3]:
df_classes.head()

Unnamed: 0,txId,class
0,230425980,unknown
1,5530458,unknown
2,232022460,unknown
3,232438397,2
4,230460314,unknown


In [4]:
# Change column names of df_features
colNames1 = {'0': 'txId', 1: "Time step"}
colNames2 = {str(ii+2): "Local_feature_" + str(ii+1) for ii in range(93)}
colNames3 = {str(ii+95): "Aggregate_feature_" + str(ii+1) for ii in range(72)}

colNames = dict(colNames1, **colNames2, **colNames3 )
colNames = {int(jj): item_kk for jj,item_kk in colNames.items()}

df_features = df_features.rename(columns=colNames)

In [5]:
# Pass unknown to number 3
df_classes.loc[df_classes['class'] == 'unknown', 'class'] = 3
print('Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.\n')
print('Shape of classes', df_classes.shape)
print('Shape of edges', df_edges.shape)
print('Shape of features', df_features.shape)

Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.

Shape of classes (203769, 2)
Shape of edges (234355, 2)
Shape of features (203769, 167)


## Data visualization

In [6]:
df_classes.groupby(['class']).count()

Unnamed: 0_level_0,txId
class,Unnamed: 1_level_1
3,157205
1,4545
2,42019


In [7]:
df_features.shape
df_classes.shape

(203769, 2)

In [8]:
# Merge the DataFrames on the column 'source', assuming it's the same name in both DataFrames
df_merged = pd.merge(df_features, df_classes, on='txId', how='left')
df_merged.head()

Unnamed: 0,txId,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,...,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72,class
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,3
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,3
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,3
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,3


In [9]:
generate_graph = True
if generate_graph:
    # Create an empty graph
    G = nx.Graph()

    for _, row in df_merged.iterrows():
        # Extract node ID and attributes
        node_id = row['txId']
        node_attributes = row.drop('txId').to_dict()
        
        # Add node to the graph with its attributes
        G.add_node(node_id, **node_attributes)

    # Add edges to the graph
    for _, row in df_edges.iterrows():
        G.add_edge(row['txId1'], row['txId2'])

In [83]:

# Save the graph as a pickle file
with open("elipticData_graph.pkl", "wb") as f:
    pkl.dump(G, f)


# Specify the path to your pickle file
pickle_file_path = 'C:\\Users\\gsamp\\OneDrive\\Documents\\AI-3\\2n Semestre\\Projecte de Síntesi 2\\GraphAnomaly\\elipticData_graph.pkl'

# Open the pickle file in binary mode
with open(pickle_file_path, 'rb') as f:
    # Load the data from the pickle file
    G = pkl.load(f)


In [55]:
# Get the number of nodes
num_nodes = nx.number_of_nodes(G)

# Get the number of edges
num_edges = nx.number_of_edges(G)

print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

Number of nodes: 203769
Number of edges: 234355


Creating subgraphs for each time step

In [84]:
# Specify the feature name and the desired feature value
time_step = 'Time step'

for value in  range(max(df_features['Time step'])):
    # Create a list of nodes that have the desired value in the specified feature
    desired_nodes = [node for node, data in G.nodes(data=True) if data.get(time_step) == value+1]
    sub_G = G.subgraph(desired_nodes)

    num_nodes = nx.number_of_nodes(sub_G)

    # Get the number of edges
    num_edges = nx.number_of_edges(sub_G)
    
    

In [97]:
class_distr = {}
for node in G.nodes():
    if G.nodes[node]['Time step'] not in class_distr.keys():
        class_distr[G.nodes[node]['Time step']] = []
        class_distr[G.nodes[node]['Time step']].append(int(G.nodes[node]['class']))
    else:
        class_distr[G.nodes[node]['Time step']].append(int(G.nodes[node]['class']))


In [100]:
class_distr

{1: [3,
  3,
  3,
  '2',
  3,
  3,
  3,
  3,
  3,
  '2',
  '2',
  '2',
  3,
  3,
  3,
  3,
  '2',
  '2',
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  '2',
  3,
  '2',
  3,
  '2',
  '2',
  '2',
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  '2',
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  '2',
  '2',
  '2',
  3,
  3,
  3,
  3,
  '2',
  3,
  '2',
  3,
  '2',
  3,
  '2',
  '2',
  '2',
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  '2',
  '2',
  3,
  3,
  '2',
  3,
  3,
  '2',
  3,
  3,
  3,
  '2',
  3,
  3,
  3,
  3,
  3,
  '2',
  3,
  3,
  '2',
  '2',
  3,
  '2',
  3,
  3,
  '2',
  '2',
  3,
  '2',
  '2',
  '2',
  3,
  '2',
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  '2',
  '2',
  '2',
  '2',
  '2',
  3,
  '2',
  3,
  3,
  '2',
  '2',
  '2',
  3,
  '2',
  3,
  '2',
  '2',
  3,
  '2',
  '2',
  '2',
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  '2',
  3,
  '2',
  '2',
  '2',
  3,
  3,
  3,
  '2',
  3,
  3,
  3,
  3,
  '2'

In [99]:
for time_step in class_distr.keys():
    print(f"Time step: {time_step} has: ")
    # print(len([x for x in class_distr[time_step] if x == 1]))
    print(class_distr[time_step].count(2))
    

Time step: 1 has: 
0
Time step: 2 has: 
0
Time step: 3 has: 
0
Time step: 4 has: 
0
Time step: 5 has: 
0
Time step: 6 has: 
0
Time step: 7 has: 
0
Time step: 8 has: 
0
Time step: 9 has: 
0
Time step: 10 has: 
0
Time step: 11 has: 
0
Time step: 12 has: 
0
Time step: 13 has: 
0
Time step: 14 has: 
0
Time step: 15 has: 
0
Time step: 16 has: 
0
Time step: 17 has: 
0
Time step: 18 has: 
0
Time step: 19 has: 
0
Time step: 20 has: 
0
Time step: 21 has: 
0
Time step: 22 has: 
0
Time step: 23 has: 
0
Time step: 24 has: 
0
Time step: 25 has: 
0
Time step: 26 has: 
0
Time step: 27 has: 
0
Time step: 28 has: 
0
Time step: 29 has: 
0
Time step: 30 has: 
0
Time step: 31 has: 
0
Time step: 32 has: 
0
Time step: 33 has: 
0
Time step: 34 has: 
0
Time step: 35 has: 
0
Time step: 36 has: 
0
Time step: 37 has: 
0
Time step: 38 has: 
0
Time step: 39 has: 
0
Time step: 40 has: 
0
Time step: 41 has: 
0
Time step: 42 has: 
0
Time step: 43 has: 
0
Time step: 44 has: 
0
Time step: 45 has: 
0
Time step: 46 has: 