# Graph Anomaly Detection


### Processing and analyzing training data

## Load data

In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pickle as pkl
import time

In [2]:
# Read files
path = "C:/Users/marti/Desktop/WAP/6e_semestre/SPII/GraphAnomaly/dades_marti/"
df_classes = pd.read_csv(path + "elliptic_txs_classes.csv") # Nodes' labels
df_edges = pd.read_csv(path + "elliptic_txs_edgelist.csv") # Edges
df_features = pd.read_csv(path + "elliptic_txs_features.csv", header=None) # Nodes' features

In [3]:
# Change column names of df_features
colNames1 = {'0': 'txId', 1: "Time step"}
colNames2 = {str(ii+2): "Local_feature_" + str(ii+1) for ii in range(93)}
colNames3 = {str(ii+95): "Aggregate_feature_" + str(ii+1) for ii in range(72)}

colNames = dict(colNames1, **colNames2, **colNames3 )
colNames = {int(jj): item_kk for jj,item_kk in colNames.items()}

df_features = df_features.rename(columns=colNames)

In [4]:
# Pass unknown to number 3
df_classes.loc[df_classes['class'] == 'unknown', 'class'] = 3
print('Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.\n')
print('Shape of classes', df_classes.shape)
print('Shape of edges', df_edges.shape)
print('Shape of features', df_features.shape)

Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.

Shape of classes (203769, 2)
Shape of edges (234355, 2)
Shape of features (203769, 167)


## Data visualization

In [5]:
df_classes.groupby(['class']).count()

Unnamed: 0_level_0,txId
class,Unnamed: 1_level_1
3,157205
1,4545
2,42019


In [6]:
df_features.shape,df_classes.shape

((203769, 167), (203769, 2))

In [7]:
# Merge the DataFrames on the column 'source', assuming it's the same name in both DataFrames
df_merged = pd.merge(df_features, df_classes, on='txId', how='left')
df_merged.head()

Unnamed: 0,txId,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,...,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72,class
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,3
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,3
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,3
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,3


In [8]:
df_merged['class'] = df_merged['class'].astype(int)
df_labeled = df_merged.copy()
df_labeled = df_labeled[df_labeled['class'] < 3]
df_edges_labeled = df_edges[df_edges['txId1'].isin(df_labeled['txId'])]
df_edges_labeled = df_edges_labeled[df_edges_labeled['txId2'].isin(df_labeled['txId'])]
len(df_labeled)

46564

In [9]:
generate_graph = False
if generate_graph:
    # Create an empty graph
    G = nx.Graph()

    for _, row in df_features.iterrows():
        # Extract node ID and attributes
        node_id = row['txId']
        node_attributes = row.drop('txId').to_dict()
        # Add node to the graph with its attributes
        G.add_node(node_id, **node_attributes)

    # Add edges to the graph
    for _, row in df_edges.iterrows():
        G.add_edge(row['txId1'], row['txId2'])

    # Save the graph as a pickle file
    with open("./dades_marti/elipticData_graph.pkl", "wb") as f:
        pkl.dump(G, f)

generate_labeled_graph = False
if generate_labeled_graph:
    # Create an empty graph
    Glab = nx.Graph()

    for _, row in df_labeled.iterrows():
        # Extract node ID and attributes
        node_id = row['txId']
        node_attributes = row.drop('txId').to_dict()
        # Add node to the graph with its attributes
        Glab.add_node(node_id, **node_attributes)

    # Add edges to the graph
    for _, row in df_edges_labeled.iterrows():
        Glab.add_edge(row['txId1'], row['txId2'])
    
    # Save the labeled graph as a pickle file
    with open("./dades_marti/elipticData_graph_lab.pkl", "wb") as f:
        pkl.dump(Glab, f)
    


In [10]:
# Specify the path to your pickle file
pickle_file_path = path + 'elipticData_graph.pkl'

# Open the pickle file in binary mode
with open(pickle_file_path, 'rb') as f:
    # Load the data from the pickle file
    G = pkl.load(f)


# Specify the path to your pickle file
pickle_file_path = path + 'elipticData_graph_lab.pkl'

# Open the pickle file in binary mode
with open(pickle_file_path, 'rb') as f:
    # Load the data from the pickle file
    Glab = pkl.load(f)


In [11]:
# Get the number of nodes
num_nodes = nx.number_of_nodes(Glab)

# Get the number of edges
num_edges = nx.number_of_edges(Glab)

print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

Number of nodes: 46564
Number of edges: 36624


### Adding common metrics as features

In [12]:
#Implementation of the betweenness centrality by NetworkX

"""
====================
Parallel Betweenness
====================

Example of parallel implementation of betweenness centrality using the
multiprocessing module from Python Standard Library.

The function betweenness centrality accepts a bunch of nodes and computes
the contribution of those nodes to the betweenness centrality of the whole
network. Here we divide the network in chunks of nodes and we compute their
contribution to the betweenness centrality of the whole network.

Note: The example output below shows that the non-parallel implementation is
faster. This is a limitation of our CI/CD pipeline running on a single core.

Depending on your setup, you will likely observe a speedup.
"""
from multiprocessing import Pool
import time
import itertools

import matplotlib.pyplot as plt
import networkx as nx


def chunks(l, n):
    """Divide a list of nodes `l` in `n` chunks"""
    l_c = iter(l)
    while 1:
        x = tuple(itertools.islice(l_c, n))
        if not x:
            return
        yield x


def betweenness_centrality_parallel(G, processes=None):
    """Parallel betweenness centrality  function"""
    p = Pool(processes=processes)
    node_divisor = len(p._pool) * 4
    node_chunks = list(chunks(G.nodes(), G.order() // node_divisor))
    num_chunks = len(node_chunks)
    bt_sc = p.starmap(
        nx.betweenness_centrality_subset,
        zip(
            [G] * num_chunks,
            node_chunks,
            [list(G)] * num_chunks,
            [True] * num_chunks,
            [None] * num_chunks,
        ),
    )

    # Reduce the partial solutions
    bt_c = bt_sc[0]
    for bt in bt_sc[1:]:
        for n in bt:
            bt_c[n] += bt[n]
    return bt_c


In [13]:
# Això triga com 1000 minuts a correr!!!
computar = False
if computar:
    d_grau = dict(Glab.degree())
    graus = [grau for grau in d_grau.keys()]
    print("Getting degrees - done!")
    degree_centralities = [dc for dc in nx.degree_centrality(Glab).keys()]
    print("Getting degree centrality - done!")
    betweenness_centralities = [bc for bc in betweenness_centrality_parallel(Glab).keys()]
    print("Getting betweenness centrality - done!")
    eigenvector_centralities = [ec for ec in nx.eigenvector_centrality(Glab).keys()]
    print("Getting eigenvector centrality - done!")
    closeness_centralities = [cc for cc in nx.closeness_centrality(Glab).keys()]
    print("Getting closeness centrality - done!")
    clustering_coefficients = [cc for cc in nx.clustering(Glab).keys()]
    print("Getting the clustering coefficient - done!")

    df_extended = df_labeled.copy()
    df_extended['degree'] = graus
    df_extended['degree_centrality'] = degree_centralities
    df_extended['betweenness_centrality'] = betweenness_centralities
    df_extended['eigenvector_centrality'] = eigenvector_centralities
    df_extended['closeness_centrality'] = closeness_centralities
    df_extended['clustering_coefficient'] = clustering_coefficients
    print("All done!")

    # Modify this to your 
    df_extended.to_csv('./dades_marti/labeled_extended.csv')

else:
    df_extended = pd.read_csv('./dades_marti/labeled_extended.csv')

### Logistic regression using sklearn

In [14]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

In [15]:
len(df_extended)

46564

In [16]:
# Standardize the features (important for PCA)
scaler = StandardScaler()

# df_pca = df_extended.drop(columns=['txId', 'Time step', 'class'])
df_pca = df_extended.drop(columns=['txId',  'class'])
scaled_data = scaler.fit_transform(df_pca)

# Apply PCA
pca = PCA(n_components=69)  # You can choose the number of components you want to keep
principal_components = pca.fit_transform(scaled_data)

# Create a DataFrame for the principal components
columns = [f"PC{i+1}" for i in range(principal_components.shape[1])]
principal_df = pd.DataFrame(data=principal_components, columns=columns)


explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.sum()

print(f"Explained variance ratio: {cumulative_variance_ratio}")
print(f"Data reduction, from shape {df_pca.shape} to {principal_df.shape}")


Explained variance ratio: 0.9806481763024362
Data reduction, from shape (46564, 173) to (46564, 69)


Logistic regression

In [17]:
#afegir dues columnes per poder aplicar logistic regression
principal_df['txId'] = df_pca.index
principal_df['class'] = [1 if classe == 1 else 0 for classe in df_extended['class']]

X = principal_df.drop(columns=['txId', 'class'])
y = principal_df['class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
precision = precision_score(y_test, y_pred)
print("Precision: {:.2f}%".format(precision * 100))
recall = recall_score(y_test, y_pred)
print("Recall: {:.2f}%".format(recall * 100))
f1score = f1_score(y_test, y_pred)
print("F1 Score: {:.2f}%".format(f1score * 100))

Accuracy: 95.33%
Precision: 81.44%
Recall: 66.85%
F1 Score: 73.43%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Trying node embeddings with node2vec

In [18]:
# from node2vec import Node2Vec
# import tqdm as notebook_tqdm

# node2vec = Node2Vec(Glab, dimensions=64, walk_length=20, num_walks=200, p=2, q=1, workers=1)
# model = node2vec.fit(window=10, min_count=1, batch_words=4)

# model.save('dades_marti/node2vec_labelled.bin')

Node Embeddings with DGL

In [25]:
import dgl
import torch

GlabDGL = dgl.from_networkx(Glab)

nodeId = torch.Tensor(0, dtype=torch.int64)
p = 1
q = 1
walk_length = 20
traces = dgl.sampling.node2vec_random_walk(GlabDGL, nodeId, p, q, walk_length)


TypeError: new() received an invalid combination of arguments - got (int, dtype=torch.dtype), but expected one of:
 * (*, torch.device device)
      didn't match because some of the keywords were incorrect: dtype
 * (torch.Storage storage)
 * (Tensor other)
 * (tuple of ints size, *, torch.device device)
 * (object data, *, torch.device device)
