# Data Processing


## Common Operations

In [224]:
# Import packages
import pandas as pd
import networkx as nx
import pickle

### Load data

In [225]:
# Read files
# NOTE(review): absolute, machine-specific directory. The read_csv calls below build
# each file name as `path + '<file>.csv'`, so the value must end with a path separator.
path = 'C:/Users/User/Desktop/Assignatures/Synthesis project/GraphAnomaly/dades_arnau/'

In [226]:
# Data labels
# Load the label table; its id column is renamed from 'txId' to 'node' below.
df_classes = pd.read_csv(path + 'elliptic_txs_classes.csv') 

# Set classes' values 
# The raw labels are compared as the strings '1', '2' and 'unknown'; every match is
# overwritten in place with an integer code (0 = illicit, 1 = licit, 2 = unknown).
# NOTE(review): presumably the column keeps object dtype after these assignments,
# since it started out holding strings — confirm before relying on a numeric dtype.
df_classes = df_classes.rename(columns={'txId': 'node'})
df_classes.loc[df_classes['class'] == '1', 'class'] = 0 # Illicit
df_classes.loc[df_classes['class'] == '2', 'class'] = 1 # Licit
df_classes.loc[df_classes['class'] == 'unknown', 'class'] = 2 # Unknown

# Show information
print('Shape of classes', df_classes.shape)

Shape of classes (203769, 2)


In [227]:
# Edges: load the edge list and rename its two id columns to node1 / node2
df_edges = (
    pd.read_csv(path + 'elliptic_txs_edgelist.csv')
    .rename(columns={'txId1': 'node1', 'txId2': 'node2'})
)

# Report the size of the edge table
print('Shape of edges', df_edges.shape)

Shape of edges (234355, 2)


In [228]:
# Data features (read without a header row, so the columns start out as integer positions)
df_features = pd.read_csv(path + 'elliptic_txs_features.csv', header=None)

# Build the positional -> readable column-name map:
#   0 -> 'node', 1 -> 'time step',
#   2..94   -> 'local_feature_1' .. 'local_feature_93',
#   95..166 -> 'aggregate_feature_1' .. 'aggregate_feature_72'
colNames = {0: 'node', 1: 'time step'}
colNames.update({pos: 'local_feature_' + str(pos - 1) for pos in range(2, 95)})
colNames.update({pos: 'aggregate_feature_' + str(pos - 94) for pos in range(95, 167)})
df_features = df_features.rename(columns=colNames)

# Report the size of the feature table
print('Shape of features', df_features.shape)

Shape of features (203769, 167)


In [229]:
# Attach the class label to every feature row; 'node' is the only column the two
# tables share, so it is named explicitly as the join key (inner join, as before)
df_class_feature = df_classes.merge(df_features, on='node')

### Visualize

In [230]:
df_class_feature

Unnamed: 0,node,class,time step,local_feature_1,local_feature_2,local_feature_3,local_feature_4,local_feature_5,local_feature_6,local_feature_7,...,aggregate_feature_63,aggregate_feature_64,aggregate_feature_65,aggregate_feature_66,aggregate_feature_67,aggregate_feature_68,aggregate_feature_69,aggregate_feature_70,aggregate_feature_71,aggregate_feature_72
0,230425980,2,1,-0.171469,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,-0.562153,-0.600999,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,2,1,-0.171484,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,2,1,-0.172107,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,2,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.293750,0.178136,0.179117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203764,173077460,2,49,-0.145771,-0.163752,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203765,158577750,2,49,-0.165920,-0.123607,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,0.162722,0.010822,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984
203766,158375402,0,49,-0.172014,-0.078182,1.018602,0.028105,-0.043875,0.054722,-0.061584,...,1.261246,1.985050,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203767,158654197,2,49,-0.172842,-0.176622,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,-0.397749,-0.411776,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399


In [231]:
df_edges

Unnamed: 0,node1,node2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206
...,...,...
234350,158365409,157930723
234351,188708874,188708879
234352,157659064,157659046
234353,87414554,106877725


## Splits Management

In [232]:
# Build {time step (int): rows of that time step}; sort=False keeps the keys in
# order of first appearance, i.e. the order `unique()` would return them
df_dict = {
    int(step): step_rows
    for step, step_rows in df_class_feature.groupby('time step', sort=False)
}

### Balanced split

In [233]:
# Function to split the dataset for a given size
def balanced_split(df_dict, TSIZE):
    """Split the per-time-step frames into train and test sets by whole time steps.

    Time steps are ranked by their number of illicit rows (``class == 0``),
    highest first (ties keep the order of ``df_dict``). Walking down that
    ranking, every even-ranked time step is assigned to the test set until the
    test set holds at least ``TSIZE`` rows; every other time step goes to the
    training set.

    Parameters
    ----------
    df_dict : dict
        Maps a time-step key to a DataFrame that has a ``class`` column.
    TSIZE : int or float
        Minimum number of rows wanted in the test set. If the even-ranked
        time steps cannot reach it, the test set simply contains all of them.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``, each re-indexed from 0. A side that received
        no time step is an empty DataFrame.
    """
    # Illicit rows per time step; a time step without any illicit row counts as 0
    # instead of raising.
    ilicit_count = [
        [key, int((frame['class'] == 0).sum())] for key, frame in df_dict.items()
    ]
    ilicit_count.sort(key=lambda row: row[1], reverse=True)

    # Collect the frames first and concatenate once at the end (linear instead of
    # re-copying the accumulated frame on every iteration).
    train_parts = []
    test_parts = []
    length = 0
    for rank, (key, _) in enumerate(ilicit_count):
        frame = df_dict[key]
        if length < TSIZE and rank % 2 == 0:
            test_parts.append(frame)
            length += frame.shape[0]
        else:
            # Odd ranks, and everything once the test set is large enough.
            train_parts.append(frame)

    train_df = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
    test_df = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()
    return train_df, test_df

In [234]:
# Fraction of all rows that the balanced test split should reach at minimum
TEST_SIZE = 0.2
balanced_train, balanced_test = balanced_split(
    df_dict,
    len(df_class_feature) * TEST_SIZE,
)

#### Visualize

In [235]:
balanced_train['time step'].unique()

array([29, 20, 42, 22, 24, 16, 41, 38,  7, 17, 28, 30, 39, 19,  8, 49, 23,
       18, 14, 37, 34, 48, 36,  4, 27, 43, 44, 33, 47,  2, 10,  1, 12,  3,
        5,  6, 45, 46], dtype=int64)

In [236]:
# Rows per class label in the balanced training split
temp = balanced_train.groupby('class')['node'].count().reset_index(name='count')
temp

Unnamed: 0,class,count
0,0,2672
1,1,34654
2,2,123281


In [237]:
balanced_test['time step'].unique()

array([32, 13,  9, 35, 15, 11, 25, 40, 31, 21, 26], dtype=int64)

In [238]:
# Rows per class label in the balanced test split
temp = balanced_test.groupby('class')['node'].count().reset_index(name='count')
temp

Unnamed: 0,class,count
0,0,1873
1,1,7365
2,2,33924


### Sequential split

In [239]:
def sequential_split(df_dict, VSIZE, TSIZE):
    """Split the per-time-step frames chronologically into train / validation / test.

    Time steps are visited from the highest key to the lowest. Whole time steps
    go to the test set until it holds at least ``TSIZE`` rows, then to the
    validation set until test and validation together hold at least
    ``VSIZE + TSIZE`` rows; all remaining (earliest) time steps form the
    training set.

    Parameters
    ----------
    df_dict : dict
        Maps a sortable time-step key to the DataFrame of that time step.
    VSIZE : int or float
        Minimum number of rows wanted in the validation set.
    TSIZE : int or float
        Minimum number of rows wanted in the test set.

    Returns
    -------
    tuple of three pandas.DataFrame
        ``(train_df, validation_df, test_df)``, each re-indexed from 0 with
        time steps in ascending order. Rows inside one time step appear in
        reverse of their order in ``df_dict`` (the frames are stacked
        newest-first and then flipped as a whole). A split that received no
        time step is an empty DataFrame.
    """
    train_parts = []
    validation_parts = []
    test_parts = []
    length = 0

    # Use the keys actually present instead of assuming time steps 49..1; running
    # out of time steps just leaves the later splits smaller (or empty).
    for ts in sorted(df_dict, reverse=True):
        frame = df_dict[ts]
        if length < TSIZE:
            test_parts.append(frame)
            length += frame.shape[0]
        elif length < (VSIZE + TSIZE):
            validation_parts.append(frame)
            length += frame.shape[0]
        else:
            train_parts.append(frame)

    def _assemble(parts):
        # One concat per split, then flip so the time steps end up ascending.
        if not parts:
            return pd.DataFrame()
        return pd.concat(parts, ignore_index=True)[::-1].reset_index(drop=True)

    return _assemble(train_parts), _assemble(validation_parts), _assemble(test_parts)

In [240]:
# Fractions of all rows that the validation and test splits should reach at minimum
VALIDATION_SIZE = 0.2
TEST_SIZE = 0.2
sequential_train, sequential_validation, sequential_test = sequential_split(
    df_dict,
    len(df_class_feature) * VALIDATION_SIZE,
    len(df_class_feature) * TEST_SIZE,
)

#### Visualize

In [241]:
sequential_train['time step'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], dtype=int64)

In [242]:
# Rows per class label in the sequential training split
temp = sequential_train.groupby('class')['node'].count().reset_index(name='count')
temp

Unnamed: 0,class,count
0,0,2871
1,1,23510
2,2,94423


In [243]:
sequential_validation['time step'].unique()

array([30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], dtype=int64)

In [244]:
# Rows per class label in the sequential validation split
temp = sequential_validation.groupby('class')['node'].count().reset_index(name='count')
temp

Unnamed: 0,class,count
0,0,1150
1,1,9060
2,2,30589


In [245]:
sequential_test['time step'].unique()

array([41, 42, 43, 44, 45, 46, 47, 48, 49], dtype=int64)

In [246]:
# Rows per class label in the sequential test split
temp = sequential_test.groupby('class')['node'].count().reset_index(name='count')
temp

Unnamed: 0,class,count
0,0,524
1,1,9449
2,2,32193


## Save and Load

In [247]:
# Save files
# Output directory for every pickle / GraphML file written below (file names are
# appended with `path + name`, so the trailing separator is required).
# NOTE(review): absolute, machine-specific path, re-assigned here with the same
# value as the input directory.
path = 'C:/Users/User/Desktop/Assignatures/Synthesis project/GraphAnomaly/dades_arnau/'

### Dataframes

In [248]:
# Save the whole dataset: the merged node table and the edge table, one pickle each
for file_name, table in (('nodes.pkl', df_class_feature), ('edges.pkl', df_edges)):
    with open(path + file_name, 'wb') as f:
        pickle.dump(table, f)

In [249]:
# Persist the {time step: dataframe} dictionary as a single pickle
with open(path + 'timesteps.pkl', mode='wb') as f:
    pickle.dump(df_dict, f)

In [251]:
# Save the balanced split: one pickle per part
for file_name, table in (
    ('balanced_train.pkl', balanced_train),
    ('balanced_test.pkl', balanced_test),
):
    with open(path + file_name, 'wb') as f:
        pickle.dump(table, f)

In [252]:
# Save the sequential split: one pickle per part
for file_name, table in (
    ('sequential_train.pkl', sequential_train),
    ('sequential_validation.pkl', sequential_validation),
    ('sequential_test.pkl', sequential_test),
):
    with open(path + file_name, 'wb') as f:
        pickle.dump(table, f)

### Graphs

In [253]:
def create_graph(nodes, name='sample.graphml'):
    """Build an undirected graph from a node table and write it to a GraphML file.

    Every row of ``nodes`` becomes a graph node keyed by its ``node`` value,
    with all remaining columns stored as node attributes. Edges are taken from
    the notebook-level ``df_edges`` table: an edge is kept when its ``node1``
    endpoint appears in ``nodes['node']``. ``node2`` is not checked, so an
    endpoint missing from ``nodes`` is added by networkx as a node without
    attributes.

    Parameters
    ----------
    nodes : pandas.DataFrame
        Node table with a ``node`` column (any of the dataframes built above).
    name : str, optional
        File name of the output, appended to the notebook-level ``path``.

    Returns
    -------
    None
        The graph is only written to disk.
    """
    # Initialize variables
    G = nx.Graph()

    # Add nodes
    for _, row in nodes.iterrows():
        node_id = row['node']
        node_attributes = row.drop('node').to_dict()
        G.add_node(node_id, **node_attributes)

    # Add edges: filter once with a vectorised membership test instead of
    # recomputing nodes['node'].unique() and scanning it for every edge row.
    selected_edges = df_edges[df_edges['node1'].isin(nodes['node'])]
    G.add_edges_from(
        zip(selected_edges['node1'].tolist(), selected_edges['node2'].tolist())
    )

    # Save the graph as a graphml file
    nx.write_graphml_lxml(G, path + name)