# Data Processing


## Common Operations

### Load data

In [1]:
# Read files
import pandas as pd
# Folder that holds the Elliptic CSV files. Later cells build file names with
# `path + '<file name>'`, so the trailing slash is required.
# NOTE(review): this is an absolute, machine-specific location; the notebook
# cannot be re-run elsewhere without editing it by hand.
path = 'C:/Users/User/Desktop/Assignatures/Synthesis project/GraphAnomaly/dades_arnau/'

In [2]:
# Data labels: read the class file and rename the id column to 'node'
df_classes = (
    pd.read_csv(path + 'elliptic_txs_classes.csv')
    .rename(columns={'txId': 'node'})
)

# Recode the raw string labels as integers, one label at a time.
# NOTE(review): the column is never cast explicitly afterwards, so its dtype
# is whatever pandas keeps after these in-place assignments.
label_codes = {
    '1': 0,        # Illicit
    '2': 1,        # Licit
    'unknown': 2,  # Unknown
}
for raw_label, code in label_codes.items():
    df_classes.loc[df_classes['class'] == raw_label, 'class'] = code

# Show information
print('Shape of classes', df_classes.shape)

Shape of classes (203769, 2)


In [3]:
# Edges: read the edge list and rename both endpoint columns
df_edges = (
    pd.read_csv(path + 'elliptic_txs_edgelist.csv')
    .rename(columns={'txId1': 'node1', 'txId2': 'node2'})
)

# Show information
print('Shape of edges', df_edges.shape)

Shape of edges (234355, 2)


In [4]:
# Names for the positional columns of the header-less features file:
#   0        -> 'node'
#   1        -> 'time step'
#   2..94    -> 'local_feature_1' .. 'local_feature_93'
#   95..166  -> 'aggregate_feature_1' .. 'aggregate_feature_72'
colNames = {0: 'node', 1: 'time step'}
colNames.update({col: 'local_feature_' + str(col - 1) for col in range(2, 95)})
colNames.update({col: 'aggregate_feature_' + str(col - 94) for col in range(95, 167)})

# Data features (header=None, so the columns arrive labelled 0, 1, 2, ...)
df_features = (
    pd.read_csv(path + 'elliptic_txs_features.csv', header=None)
    .rename(columns=colNames)
)

# Show information
print('Shape of features', df_features.shape)

Shape of features (203769, 167)


In [5]:
# Merge class and features: inner join on 'node', the only column the two
# frames have in common
df_class_feature = df_classes.merge(df_features, on='node', how='inner')

### Visualize

In [6]:
# Preview the merged class/feature table
df_class_feature

Unnamed: 0,node,class,time step,local_feature_1,local_feature_2,local_feature_3,local_feature_4,local_feature_5,local_feature_6,local_feature_7,...,aggregate_feature_63,aggregate_feature_64,aggregate_feature_65,aggregate_feature_66,aggregate_feature_67,aggregate_feature_68,aggregate_feature_69,aggregate_feature_70,aggregate_feature_71,aggregate_feature_72
0,230425980,2,1,-0.171469,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,-0.562153,-0.600999,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,2,1,-0.171484,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,2,1,-0.172107,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,2,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.293750,0.178136,0.179117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203764,173077460,2,49,-0.145771,-0.163752,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203765,158577750,2,49,-0.165920,-0.123607,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,0.162722,0.010822,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984
203766,158375402,0,49,-0.172014,-0.078182,1.018602,0.028105,-0.043875,0.054722,-0.061584,...,1.261246,1.985050,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203767,158654197,2,49,-0.172842,-0.176622,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,-0.397749,-0.411776,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399


In [7]:
# Preview the edge list with its renamed 'node1' / 'node2' columns
df_edges

Unnamed: 0,node1,node2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206
...,...,...
234350,158365409,157930723
234351,188708874,188708879
234352,157659064,157659046
234353,87414554,106877725


## Split data

In [8]:
# Separate dataframe by time steps into a dictionary:
# key = time step as a plain int, value = the rows belonging to that time step
time_steps = df_class_feature['time step']
df_dict = {
    int(step): df_class_feature[time_steps == step]
    for step in time_steps.unique()
}

In [65]:
from Helper import merge_timesteps

# Sequential version
# The three values are time steps, not node ids: the printed splits cover
# time steps 1-35 for the train set and 36-49 for the test set.
section = [1, 35, 49] # Starting time step, split time step, ending time step
train_seq, test_seq = merge_timesteps(df_dict, 'sequential', section)

# Balanced version
# The two values are counts of time steps (35 + 14 = 49), not counts of nodes.
# NOTE(review): the printed time steps of this split are in shuffled order, so
# merge_timesteps presumably samples them at random; no seed is set in this
# cell -- confirm the split is reproducible.
section = [35, 14] # Amount of time steps in train set, amount of time steps in test set
train_bal, test_bal = merge_timesteps(df_dict, 'balanced', section)

#### Visualize

In [73]:
def visualize(df):
    """Print a short summary of a split.

    Two things are printed: the distinct values of the 'time step' column,
    and a table with one row per 'class' value holding the number of
    non-null 'node' entries for that class (column 'count').

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'time step', 'class' and 'node'.
    """
    steps_present = df['time step'].unique()
    print(steps_present)

    class_counts = (
        df.groupby('class')['node']
        .count()
        .reset_index()
        .rename(columns={'node': 'count'})
    )
    print(class_counts)

In [74]:
# Sequential split, train part: time steps present and rows per class
visualize(train_seq)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35]
   class   count
0      0    3644
1      1   27591
2      2  110537


In [75]:
# Sequential split, test part: time steps present and rows per class
visualize(test_seq)

[36 37 38 39 40 41 42 43 44 45 46 47 48 49]
   class  count
0      0    901
1      1  14428
2      2  46668


In [76]:
# Balanced split, train part: time steps present and rows per class
visualize(train_bal)

[32 13  9 35 15 11 25 40 31 21 26 30 19 49 18 37 48  4 43 33  2  1  3  6
 46 14 34 36 27 44 47 10 12  5 45]
   class   count
0      0    2580
1      1   28616
2      2  112705


In [77]:
# Balanced split, test part: time steps present and rows per class
visualize(test_bal)

[29 20 42 22 24 16 41 38  7 17 28 39  8 23]
   class  count
0      0   1965
1      1  13403
2      2  44500


## Save and Load

In [78]:
# Save files
import pickle
# Output folder for the pickles and graph files written below; file names are
# appended with `path + '<file name>'`, so the trailing slash is required.
# NOTE(review): this repeats the same absolute, machine-specific literal used
# by the loading cell at the top of the notebook; keep the two in sync.
path = 'C:/Users/User/Desktop/Assignatures/Synthesis project/GraphAnomaly/dades_arnau/'

### Dataframes

In [248]:
# Save the whole dataset: merged node table first, then the edge list
for file_name, frame in (('nodes.pkl', df_class_feature), ('edges.pkl', df_edges)):
    with open(path + file_name, 'wb') as out_file:
        pickle.dump(frame, out_file)

In [249]:
# Save the dictionary of timesteps (time step -> rows of that time step)
with open(path + 'timesteps.pkl', 'wb') as out_file:
    pickle.dump(df_dict, out_file)

In [16]:
# Save the sequential split: train part first, then test part.
# NOTE(review): train_bal / test_bal are not written by this cell.
for file_name, split in (('sequential_train.pkl', train_seq), ('sequential_test.pkl', test_seq)):
    with open(path + file_name, 'wb') as out_file:
        pickle.dump(split, out_file)

### Graphs

In [17]:
import networkx as nx

# Function to create a graph from any dataframe of nodes
def create_graph(nodes, name='sample.graphml', edges=None):
    """Build an undirected graph from a dataframe of nodes and save it as GraphML.

    Every row of ``nodes`` becomes a graph node keyed by its 'node' value; all
    other columns of that row are stored as node attributes. An edge is added
    for every row of ``edges`` whose 'node1' value appears in ``nodes['node']``.

    Parameters
    ----------
    nodes : pandas.DataFrame
        Must contain a 'node' column. The remaining column names must be
        strings, because they are passed to ``Graph.add_node`` as keyword
        arguments.
    name : str, optional
        File name of the GraphML file; it is appended to the notebook-level
        ``path`` string.
    edges : pandas.DataFrame, optional
        Edge list with 'node1' and 'node2' columns. When omitted, the
        notebook-level ``df_edges`` is used, as before.

    Returns
    -------
    None
        The graph is only written to ``path + name``.
    """
    if edges is None:
        edges = df_edges

    # Initialize variables
    G = nx.Graph()

    # Add nodes. Rows are walked as plain per-column tuples instead of with
    # ``iterrows``: ``iterrows`` packs each row into a single Series, which
    # turns integer ids/attributes into floats whenever every column is numeric.
    node_ids = nodes['node'].tolist()
    attributes = nodes.drop(columns='node')
    attribute_names = list(attributes.columns)
    for node_id, values in zip(node_ids, attributes.itertuples(index=False, name=None)):
        G.add_node(node_id, **dict(zip(attribute_names, values)))

    # Add edges. The membership test is done once for the whole column rather
    # than recomputing ``nodes['node'].unique()`` and scanning it per edge row.
    # NOTE(review): only 'node1' is checked, so an edge whose 'node2' is not in
    # ``nodes`` is still added and creates a node without attributes -- confirm
    # this is intended.
    keep = edges['node1'].isin(node_ids)
    sources = edges.loc[keep, 'node1'].tolist()
    targets = edges.loc[keep, 'node2'].tolist()
    G.add_edges_from(zip(sources, targets))

    # Save the graph as a graphml file
    nx.write_graphml_lxml(G, path + name)