# Graph Anomaly Detection


## Pre-processing data

In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pickle as pkl

### Load data

In [7]:
# Load the training table and the pickled training graph from the data folder.
path = "dades_arnau/"
df = pd.read_csv(f"{path}train_set.csv")
# NOTE: unpickling can execute arbitrary code stored in the file; only load
# graph pickles that come from a trusted source.
with open(f"{path}TrainSet_Grpah.pkl", "rb") as graph_file:
    G = pkl.load(graph_file)


## Graph analysis

#### Working with timesteps

In [8]:
# Restrict the full graph to the nodes whose 'Time Step' attribute equals the
# chosen time step. Nodes without that attribute are skipped.
TIME_STEP = 29
desired_nodes = [
    node for node, step in G.nodes(data='Time Step') if step == TIME_STEP
]
G_29 = G.subgraph(desired_nodes)
# The analysis cells below refer to this subgraph as `sub_G_29`; bind that name
# here as well so they do not fail with a NameError.
sub_G_29 = G_29

We use the subgraph for time step 29 for our first analysis, since that time step contains a good number of both licit and illicit transactions.

In [63]:
# Basic size statistics of the time-step subgraph.
order = nx.number_of_nodes(sub_G_29)  # order = number of nodes
size = nx.number_of_edges(sub_G_29)   # size = number of edges
density = nx.density(sub_G_29)

# Average degree over all nodes. The division is guarded so that an empty
# subgraph reports 0.0 instead of raising ZeroDivisionError.
total_degree = sum(deg for _, deg in sub_G_29.degree())
avg_degree = total_degree / order if order else 0.0

# Report the results. Values are rounded to 3 decimals, so the density of a
# very sparse graph is shown as 0.0.
print(f"Order: {order}")
print(f"Size: {size}")
print(f"Density: {round(density,3)}")
print(f"Average degree: {round(avg_degree,3)}")

Order: 4275
Size: 4541
Density: 0.0
Average degree: 2.124


In [None]:
# Dump every node of the full graph together with its attribute dictionary.
# NOTE: this prints one line per node of G, so the output can be very long.
for entry in G.nodes(data=True):
    print(*entry)
    

In [43]:
# nx.is_connected is True when every node is reachable from every other node.
is_connected = nx.is_connected(sub_G_29)

print("The graph is connected." if is_connected else "The graph is not connected.")

The graph is connected.


In [39]:
# Distance-based statistics of the subgraph.
# Diameter and radius are the maximum and minimum node eccentricity, so the
# eccentricities are computed once and handed to both calls instead of letting
# each call recompute them from scratch.
eccentricities = nx.eccentricity(sub_G_29)

diameter = nx.diameter(sub_G_29, e=eccentricities)
print(f"Diameter: {diameter}")

radius = nx.radius(sub_G_29, e=eccentricities)
print(f"Radius: {radius}")

# Mean shortest-path length over all pairs of nodes.
avg_distance = nx.average_shortest_path_length(sub_G_29)
print(f"Average network distance: {round(avg_distance,3)}")

# Mean of the per-node clustering coefficients.
clustering_coefficient = nx.average_clustering(sub_G_29)
print(f"Clustering coefficient: {round(clustering_coefficient,3)}")

Diameter: 110
Radius: 55
Average network distance: 14.805
Clustering coefficient: 0.006


In [123]:
# Split the subgraph's nodes by their 'class' attribute in a single pass.
# The code matches the string labels '1' (ilicit) and '2' (licit) and the
# integer label 3 (unknown); any other value is left out of all three lists.
ilicit_nodes, licit_nodes, unknown_nodes = [], [], []
for node, data in sub_G_29.nodes(data=True):
    label = data['class']
    if label == '1':
        ilicit_nodes.append(node)
    elif label == '2':
        licit_nodes.append(node)
    elif label == 3:
        unknown_nodes.append(node)

print("ilicit:",len(ilicit_nodes))
print("licit:",len(licit_nodes))
print("unknown:",len(unknown_nodes))

ilicit: 329
licit: 845
unknown: 3101


In [139]:
# Show the adjacency (neighbours with their edge attributes) of one example node.
sub_G_29.adj[163815428]

AtlasView(FilterAtlas({166499589: {}, 163815413: {}}, <function FilterAdjacency.__getitem__.<locals>.new_node_ok at 0x000001252F0E0EA0>))

In [164]:
# Licit nodes whose neighbours in the subgraph are all ilicit ('1') or
# unknown (3), i.e. licit nodes with no licit neighbour. A node without any
# neighbour also qualifies. `all` stops at the first licit neighbour instead of
# building a temporary list per node just to compare lengths.
n = [
    node
    for node in licit_nodes
    if all(sub_G_29.nodes[neigh]['class'] in ('1', 3) for neigh in sub_G_29[node])
]

In [165]:
# Number of neighbours of each node selected in `n`, in the same order.
xd = [len(sub_G_29[elem]) for elem in n]

print(xd)

[1, 6, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 1, 1, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 34, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 18, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# Horizontal bar chart of the number of non-null 'txId' entries per class.
# NOTE(review): `df_classes` is not defined in the visible part of this
# notebook, and the hard-coded label order assumes the grouped classes come out
# as unknown, ilicit, licit - confirm both before relying on this plot.
group_class = df_classes.groupby('class').count()
bar_labels = ['Unknown', 'Ilicit', 'Licit']
bar_colors = ['orange', 'r', 'g']
plt.barh(bar_labels, group_class['txId'].values, color=bar_colors)
plt.title("Distribution of labels")
plt.xlabel("Instances")
plt.ylabel("Labels")
plt.show()

In [55]:
# Node centrality measures on the time-step subgraph; each call returns a
# {node: score} dictionary.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt, so at least
# one of these computations is slow on this subgraph.
degree_centrality = nx.degree_centrality(sub_G_29) # fraction of the other nodes each node is connected to
betweenness_centrality = nx.betweenness_centrality(sub_G_29) # share of shortest paths passing through each node
closeness_centrality = nx.closeness_centrality(sub_G_29) # based on shortest-path distances to the other nodes
eigen_centrality = nx.eigenvector_centrality(sub_G_29) # score based on the centrality of a node's neighbours
# Further measures, currently disabled:
#page_rank = nx.pagerank(sub_G_29) # PageRank Score
#eccentricity = nx.eccentricity(sub_G_29) # Eccentricity
#cflow_b_centrality = nx.current_flow_betweenness_centrality(sub_G_29) # Current flow betweenness centrality
#secondord_centrality = nx.second_order_centrality(sub_G_29) # Second order centrality
#laplacian_centrality = nx.laplacian_centrality(sub_G_29) # Laplacian centrality

KeyboardInterrupt: 

In [None]:
# Per-node degree as a {node: degree} dictionary (taken from the degree view,
# not from the node-attribute view).
degree = dict(sub_G_29.degree())

# Store each node's own neighbour list in its 'neighbours' attribute.
# The {node: neighbours} mapping is built first and applied in one call:
# set_node_attributes assigns a non-dict value to *every* node, so calling it
# once per node with a neighbour iterator would overwrite all nodes each time
# and leave them sharing a single lazy iterator.
neighbours = {node: list(nx.all_neighbors(sub_G_29, node)) for node in sub_G_29}
nx.set_node_attributes(sub_G_29, neighbours, 'neighbours')

In [None]:
# Build a table with one row per node and one column per node attribute.
# `node_data` maps each node to its attribute dictionary; it is created here so
# the cell does not depend on a name defined nowhere else.
node_data = dict(sub_G_29.nodes(data=True))
# NOTE: this rebinds `df`, replacing the DataFrame read from train_set.csv.
df = pd.DataFrame.from_dict(node_data, orient='index')
df.head()

In [None]:
# Record each node's degree within the subgraph as a 'degree' node attribute.
# The attribute is written through the full graph G, for the nodes listed in
# `degrees`.
degrees = {node: deg for node, deg in sub_G_29.degree()}
nx.set_node_attributes(G, degrees, name='degree')

## Others

In [None]:
# Export the time-step subgraph to a GraphML file in the working directory.
# NOTE(review): GraphML stores simple scalar attribute values; node attributes
# holding other objects (such as 'neighbours') may make this call fail - confirm.
nx.write_graphml_lxml(sub_G_29, "subgraph29.graphml")

In [None]:
# Draw the subgraph with a spring layout, using small unlabeled nodes.
# No seed is passed to spring_layout, so node positions are not pinned between runs.
nx.draw_networkx(sub_G_29, pos=nx.spring_layout(sub_G_29), with_labels=False, node_size=10)