# Graph Anomaly Detection


### Processing and analyzing training data

## Load data

In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pickle as pkl
import time

In [2]:
# Read files
path = "C:/Users/marti/Desktop/WAP/6e_semestre/SPII/GraphAnomaly/dades_marti/"
df_classes = pd.read_csv(path + "elliptic_txs_classes.csv") # Nodes' labels
df_edges = pd.read_csv(path + "elliptic_txs_edgelist.csv") # Edges
df_features = pd.read_csv(path + "elliptic_txs_features.csv", header=None) # Nodes' features

In [3]:
# Change column names of df_features
colNames1 = {'0': 'txId', 1: "Time step"}
colNames2 = {str(ii+2): "Local_feature_" + str(ii+1) for ii in range(93)}
colNames3 = {str(ii+95): "Aggregate_feature_" + str(ii+1) for ii in range(72)}

colNames = dict(colNames1, **colNames2, **colNames3 )
colNames = {int(jj): item_kk for jj,item_kk in colNames.items()}

df_features = df_features.rename(columns=colNames)

In [4]:
# Pass unknown to number 3
df_classes.loc[df_classes['class'] == 'unknown', 'class'] = 3
print('Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.\n')
print('Shape of classes', df_classes.shape)
print('Shape of edges', df_edges.shape)
print('Shape of features', df_features.shape)

Label 1 belongs to illicit transactions, label 2 to licit transactions and label 3 to unknown transactions.

Shape of classes (203769, 2)
Shape of edges (234355, 2)
Shape of features (203769, 167)


## Data visualization

In [5]:
df_classes.groupby(['class']).count()

Unnamed: 0_level_0,txId
class,Unnamed: 1_level_1
3,157205
1,4545
2,42019


In [6]:
df_features.shape,df_classes.shape

((203769, 167), (203769, 2))

In [7]:
# Merge the DataFrames on the column 'source', assuming it's the same name in both DataFrames
df_merged = pd.merge(df_features, df_classes, on='txId', how='left')
df_merged.head()

Unnamed: 0,txId,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,...,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72,class
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,3
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,3
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,3
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,3


In [8]:
generate_graph = True
if generate_graph:
    # Create an empty graph
    G = nx.Graph()

    for _, row in df_features.iterrows():
        # Extract node ID and attributes
        node_id = row['txId']
        node_attributes = row.drop('txId').to_dict()
        # Add node to the graph with its attributes
        G.add_node(node_id, **node_attributes)

    # Add edges to the graph
    for _, row in df_edges.iterrows():
        G.add_edge(row['txId1'], row['txId2'])

In [9]:

# # Save the graph as a pickle file
with open("./dades_marti/elipticData_graph.pkl", "wb") as f:
    pkl.dump(G, f)


# Specify the path to your pickle file
pickle_file_path = path + 'elipticData_graph.pkl'

# Open the pickle file in binary mode
with open(pickle_file_path, 'rb') as f:
    # Load the data from the pickle file
    G = pkl.load(f)


In [10]:
# Get the number of nodes
num_nodes = nx.number_of_nodes(G)

# Get the number of edges
num_edges = nx.number_of_edges(G)

print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

Number of nodes: 203769
Number of edges: 234355


Creating subgraphs for each time step

In [11]:
# Specify the feature name and the desired feature value
time_step = 'Time step'

for value in  range(max(df_features['Time step'])):
    # Create a list of nodes that have the desired value in the specified feature
    desired_nodes = [node for node, data in G.nodes(data=True) if data.get(time_step) == value+1]
    sub_G = G.subgraph(desired_nodes)

    num_nodes = nx.number_of_nodes(sub_G)

    # Get the number of edges
    num_edges = nx.number_of_edges(sub_G)


### Adding common metrics as features

In [58]:
# Això triga com 10 minuts a correr!!!
computar = True
if computar:
    d_grau = dict(G.degree())
    graus = [grau for grau in d_grau.keys()]
    print("Getting degrees - done!")
    degree_centralities = [dc for dc in nx.degree_centrality(G).keys()]
    print("Getting degree centrality - done!")
    betweenness_centralities = [bc for bc in nx.betweenness_centrality(G).keys()]
    print("Getting betweenness centrality - done!")
    eigenvector_centralities = [ec for ec in nx.eigenvector_centrality(G).keys()]
    print("Getting eigenvector centrality - done!")
    closeness_centralities = [cc for cc in nx.closeness_centrality(G).keys()]
    print("Getting closeness centrality - done!")
    clustering_coefficients = [cc for cc in nx.clustering(G).keys()]
    print("Getting the clustering coefficient - done!")

    df_extended = df_merged.copy()
    df_extended['degree'] = graus
    df_extended['degree_centrality'] = degree_centralities
    df_extended['betweenness_centrality'] = betweenness_centralities
    df_extended['eigenvector_centrality'] = eigenvector_centralities
    df_extended['closeness_centrality'] = closeness_centralities
    df_extended['clustering_coefficient'] = clustering_coefficients
    print("All done!")

    # Modify this to your 
    df_extended.to_csv('./dades_marti/extended.csv')

else:
    df_extended = pd.read_csv('./dades_marti/extended.csv')

KeyboardInterrupt: 

### Logistic regression using sklearn

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

In [None]:
# Standardize the features (important for PCA)
scaler = StandardScaler()

df_pca = df_extended.drop(columns=['txid', 'timestep', 'class'])
scaled_data = scaler.fit_transform(df_pca)

# Apply PCA
pca = PCA(n_components=100)  # You can choose the number of components you want to keep
principal_components = pca.fit_transform(scaled_data)

# Create a DataFrame for the principal components
columns = [f"PC{i+1}" for i in range(principal_components.shape[1])]
principal_df = pd.DataFrame(data=principal_components, columns=columns)


explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.sum()

print(f"Explained variance ratio: {cumulative_variance_ratio}")
print(f"Data reduction, from shape {df_pca.shape} to {principal_df.shape}")


Explained variance ratio: 0.9985460497241709
Data reduction, from shape (203769, 166) to (203769, 100)


### Trying node embeddings with node2vec

In [12]:
# from node2vec import Node2Vec



In [21]:
G.nodes[]

KeyError: 0