In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx
import warnings
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool, global_add_pool, global_max_pool
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import torch
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.utils import from_networkx
warnings.filterwarnings("ignore")

In [3]:
# Load the cleaned table from the working directory and preview its first rows.
# NOTE(review): presumably one row per transaction (see the from/to/contract_address
# columns in the preview) - confirm against the data preparation step.
complete_dataset = pd.read_csv('all_cleaned_data.csv')
complete_dataset.head()

Unnamed: 0,from,to,contract_address,cumulativeGasUsed,gasUsed,Date,Value_usd,Transaction_cost_usd,isError,isinternal,...,Date_born,age_days,age_minutes,user,cluster_0.0,cluster_1.0,cluster_2.0,cluster_3.0,cluster_4.0,cluster_5.0
0,0xd2e6b3bfe990fdede2380885d9d83ca9364e717e,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,7123739.0,1515366,2018-04-10 17:55:17,0.0,1.945962,0,0,...,2018-04-10 17:55:17,0,0.0,0xd2e6b3bfe990fdede2380885d9d83ca9364e717e,0,0,0,0,1,0
1,0x20c945800de43394f70d789874a4dac9cfa57451,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,3069896.0,21000,2018-04-10 18:14:01,2.858277e-14,0.008699,1,0,...,2018-04-10 17:55:17,0,18.733333,0x20c945800de43394f70d789874a4dac9cfa57451,0,0,0,0,1,0
2,0x0668dea6b5ec94d7ce3c43fe477888eee2fc1b2c,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,243354.0,21110,2018-04-10 18:39:40,1.739821e-13,0.008745,1,0,...,2018-04-10 17:55:17,0,44.383333,0x0668dea6b5ec94d7ce3c43fe477888eee2fc1b2c,0,0,0,0,1,0
3,0xd2e6b3bfe990fdede2380885d9d83ca9364e717e,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,179688.0,127688,2018-04-10 21:17:03,124.2729,0.42844,0,0,...,2018-04-10 17:55:17,0,201.766667,0xd2e6b3bfe990fdede2380885d9d83ca9364e717e,0,0,0,0,1,0
4,0xc951d3463ebba4e9ec8ddfe1f42bc5895c46ec8f,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,0x00efd61b0d94ccd82f3922d26efdd3ed9859081a,3752942.0,27046,2018-04-10 21:18:04,1367.002,0.090525,1,0,...,2018-04-10 17:55:17,0,202.783333,0xc951d3463ebba4e9ec8ddfe1f42bc5895c46ec8f,0,0,0,0,1,0


### Required Data Preparation Functions

In [4]:
def transaction_values_sent(df):
  """Summarise ``Value_usd`` per sending address.

  Groups ``df`` by ``from`` and computes count, sum, mean, std, median, max
  and min of ``Value_usd``. Undefined statistics (e.g. the std of a one-row
  group) become 0 and the grouping column is renamed to ``identifier``.
  """
  stats = ['count', 'sum', 'mean', 'std', 'median', 'max', 'min']
  per_sender = df.groupby('from')[['Value_usd']].agg(stats)
  per_sender = per_sender.reset_index().fillna(0)
  return per_sender.rename({'from':'identifier'}, axis=1)

def transaction_values_received(df):
  """Summarise ``Value_usd`` per receiving address.

  Groups ``df`` by ``to`` and computes count, sum, mean, std, median, max
  and min of ``Value_usd``. Undefined statistics become 0 and the grouping
  column is renamed to ``identifier``.
  """
  stats = ['count', 'sum', 'mean', 'std', 'median', 'max', 'min']
  per_receiver = df.groupby('to')[['Value_usd']].agg(stats)
  per_receiver = per_receiver.reset_index().fillna(0)
  return per_receiver.rename({'to':'identifier'}, axis=1)

def errors_sent(df):
  """Summarise the ``isError`` flag per sending address.

  Groups ``df`` by ``from`` and computes sum, mean, std and median of
  ``isError``. Undefined statistics become 0 and the grouping column is
  renamed to ``identifier``.
  """
  stats = ['sum', 'mean', 'std', 'median']
  per_sender = df.groupby('from')[['isError']].agg(stats)
  per_sender = per_sender.reset_index().fillna(0)
  return per_sender.rename({'from':'identifier'}, axis=1)

def errors_received(df):
  """Summarise the ``isError`` flag per receiving address.

  Groups ``df`` by ``to`` and computes sum, mean, std and median of
  ``isError``. Undefined statistics become 0 and the grouping column is
  renamed to ``identifier``.
  """
  stats = ['sum', 'mean', 'std', 'median']
  per_receiver = df.groupby('to')[['isError']].agg(stats)
  per_receiver = per_receiver.reset_index().fillna(0)
  return per_receiver.rename({'to':'identifier'}, axis=1)

def lifetimes_sent(df):
  """Summarise ``age_minutes`` per sending address.

  Groups ``df`` by ``from`` and computes mean, std and median of
  ``age_minutes``. Undefined statistics become 0 and the grouping column is
  renamed to ``identifier``.
  """
  stats = ['mean', 'std', 'median']
  per_sender = df.groupby('from')[['age_minutes']].agg(stats)
  per_sender = per_sender.reset_index().fillna(0)
  return per_sender.rename({'from':'identifier'}, axis=1)

def lifetimes_received(df):
  """Summarise ``age_minutes`` per receiving address.

  Groups ``df`` by ``to`` and computes mean, std and median of
  ``age_minutes``. Undefined statistics become 0 and the grouping column is
  renamed to ``identifier``.
  """
  stats = ['mean', 'std', 'median']
  per_receiver = df.groupby('to')[['age_minutes']].agg(stats)
  per_receiver = per_receiver.reset_index().fillna(0)
  return per_receiver.rename({'to':'identifier'}, axis=1)

def transaction_costs_sent(df):
  """Summarise ``Transaction_cost_usd`` per sending address.

  Groups ``df`` by ``from`` and computes sum, mean, std, median, max and min
  of ``Transaction_cost_usd``. Undefined statistics become 0 and the
  grouping column is renamed to ``identifier``.
  """
  stats = ['sum', 'mean', 'std', 'median', 'max', 'min']
  per_sender = df.groupby('from')[['Transaction_cost_usd']].agg(stats)
  per_sender = per_sender.reset_index().fillna(0)
  return per_sender.rename({'from':'identifier'}, axis=1)
def degree_calculator(df):
    """Return the number of rows in ``df`` (its first ``shape`` entry)."""
    row_count = df.shape[0]
    return row_count

def number_transacts_user(df):
    """Describe how many rows each ``user`` contributes to ``df``.

    Counts the non-null ``to`` entries per ``user`` and returns, over those
    per-user counts: mean, std, median, min, first quartile, third quartile
    and max. An empty ``df`` yields seven zeros.
    """
    if len(df) == 0:
        return 0,0,0,0,0,0,0
    per_user = df.groupby('user')['to'].count()
    lower_q = np.quantile(per_user, q=0.25)
    upper_q = np.quantile(per_user, q=0.75)
    return (per_user.mean(), per_user.std(), per_user.median(),
            per_user.min(), lower_q, upper_q, per_user.max())

def errors_calculator(df):
    """Describe the ``isError`` column of ``df``.

    Returns sum, mean, std, median, first quartile and third quartile of
    ``isError``; an empty ``df`` yields six zeros.
    """
    if len(df) == 0:
        return 0,0,0,0,0,0
    flags = df['isError']
    lower_q = np.quantile(flags, q=0.25)
    upper_q = np.quantile(flags, q=0.75)
    return flags.sum(), flags.mean(), flags.std(), flags.median(), lower_q, upper_q

def distinct_users(df):
    """Return the number of distinct values in ``df['user']`` (0 for an empty frame)."""
    has_rows = df.shape[0] != 0
    return df['user'].nunique() if has_rows else 0

def transaction_value(df):
    """Describe the ``Value_usd`` column of ``df``.

    Returns sum, mean, std, median, min, first quartile, third quartile and
    max of ``Value_usd``; an empty ``df`` yields eight zeros.
    """
    if len(df) == 0:
        return 0,0,0,0,0,0,0,0
    values = df['Value_usd']
    lower_q = np.quantile(values, q=0.25)
    upper_q = np.quantile(values, q=0.75)
    return (values.sum(), values.mean(), values.std(), values.median(),
            values.min(), lower_q, upper_q, values.max())

def lifetime_calculator(df):
    """Describe the ``age_minutes`` column of ``df``.

    Returns mean, std, median, first quartile, third quartile and max of
    ``age_minutes``. An empty ``df`` yields six zeros, matching the other
    statistic helpers in this notebook; without the guard the result would
    be NaNs or an error from ``np.quantile``, depending on the NumPy version.
    """
    if df.shape[0]==0:
        return 0,0,0,0,0,0
    ages = df['age_minutes']
    return ages.mean(), ages.std(), ages.median(), np.quantile(ages, q=0.25), np.quantile(ages, q=0.75), ages.max()

def sign_calculator(row):
    """Return ``row['Value_usd']``, negated when the row's ``from`` equals its ``user``."""
    sent_by_user = row['from']==row['user']
    return -row['Value_usd'] if sent_by_user else row['Value_usd']

def inflows_from_users(df):
    """Describe the per-user totals of the negative ``value_correct_sign`` rows.

    Keeps the rows whose ``value_correct_sign`` is below zero, sums them per
    ``user`` and returns mean, std, median, min, first quartile, third
    quartile and max of those sums. Seven zeros are returned when no row
    qualifies.
    """
    negative_rows = df[df['value_correct_sign']<0]
    if len(negative_rows) == 0:
        return 0,0,0,0,0,0,0
    per_user = negative_rows.groupby('user')['value_correct_sign'].sum()
    lower_q = np.quantile(per_user, q=0.25)
    upper_q = np.quantile(per_user, q=0.75)
    return (per_user.mean(), per_user.std(), per_user.median(),
            per_user.min(), lower_q, upper_q, per_user.max())

def outflows_to_users(df):
    """Describe the per-user totals of the positive ``value_correct_sign`` rows.

    Keeps the rows whose ``value_correct_sign`` is above zero, sums them per
    ``user`` and returns mean, std, median, min, first quartile, third
    quartile and max of those sums. Seven zeros are returned when no row
    qualifies.
    """
    positive_rows = df[df['value_correct_sign']>0]
    if len(positive_rows) == 0:
        return 0,0,0,0,0,0,0
    per_user = positive_rows.groupby('user')['value_correct_sign'].sum()
    lower_q = np.quantile(per_user, q=0.25)
    upper_q = np.quantile(per_user, q=0.75)
    return (per_user.mean(), per_user.std(), per_user.median(),
            per_user.min(), lower_q, upper_q, per_user.max())

def transaction_costs_contract(df):
    """Describe the ``Transaction_cost_usd`` column of ``df``.

    Returns sum, mean, std, median, min, first quartile, third quartile and
    max of ``Transaction_cost_usd``; an empty ``df`` yields eight zeros.
    """
    if len(df) == 0:
        return 0,0,0,0,0,0,0,0
    costs = df['Transaction_cost_usd']
    lower_q = np.quantile(costs, q=0.25)
    upper_q = np.quantile(costs, q=0.75)
    return (costs.sum(), costs.mean(), costs.std(), costs.median(),
            costs.min(), lower_q, upper_q, costs.max())

def transaction_costs_users(df):
    """Describe the per-user totals of ``Transaction_cost_usd``.

    Sums ``Transaction_cost_usd`` per ``user`` and returns mean, std, median,
    min, first quartile, third quartile and max of those sums; an empty
    ``df`` yields seven zeros.
    """
    if len(df) == 0:
        return 0,0,0,0,0,0,0
    per_user = df.groupby('user')['Transaction_cost_usd'].sum()
    lower_q = np.quantile(per_user, q=0.25)
    upper_q = np.quantile(per_user, q=0.75)
    return (per_user.mean(), per_user.std(), per_user.median(),
            per_user.min(), lower_q, upper_q, per_user.max())


In [5]:
def aggregate_edges(temp_df):
  """Collapse the rows of one contract into a single net edge per counterparty.

  For every address other than the contract (taken from the first row's
  ``contract_address``) the net flow ``received - sent`` of ``Value_usd`` is
  computed. Addresses with a negative net flow get an edge address -> contract
  carrying the absolute amount; all others get an edge contract -> address
  carrying the net amount plus 1e-10. The returned frame lists the
  address -> contract edges first and has columns
  ``from, sent, received, value, to``.
  """
  contract = temp_df['contract_address'].iloc[0]
  sent = temp_df.groupby('from')[['Value_usd']].sum().reset_index()
  sent = sent.rename({'from':'idx', 'Value_usd':'sent'}, axis=1)
  received = temp_df.groupby('to')[['Value_usd']].sum().reset_index()
  received = received.rename({'to':'idx', 'Value_usd':'received'}, axis=1)

  balances = pd.merge(sent, received, on='idx', how='outer').fillna(0)
  balances = balances[balances['idx']!=contract]
  balances['value'] = balances['received']-balances['sent']

  # net payers: edge points from the address to the contract
  payers = balances[balances['value']<0].copy()
  payers = payers.assign(to=contract, value=payers['value']*-1)
  payers = payers.rename({'idx':'from'}, axis=1)

  # everyone else: edge points from the contract to the address
  receivers = balances[balances['value']>=0].copy()
  receivers = receivers.assign(**{'from': contract}, value=receivers['value']+1e-10)
  receivers = receivers.rename({'idx':'to'}, axis=1)

  return pd.concat((payers, receivers)).reset_index(drop=True)

In [6]:
# Replace every missing value in the transaction table with 0 (rebinds the
# module-level name that create_data reads).
complete_dataset=complete_dataset.fillna(0)

def create_data(step):
  """Build one directed graph per contract from its first ``10*step`` rows.

  Reads the module-level ``complete_dataset``. Contracts with fewer than
  ``10*step`` rows are skipped. For each remaining contract:

  * nodes carry ``x``: 41 per-address statistics (values sent/received,
    balance, errors, lifetimes, costs) plus the six cluster flags taken from
    the last six columns of the contract's first row;
  * edges carry ``edge_attr``: the net value produced by ``aggregate_edges``;
  * the graph carries ``contract_features``: 80 contract-level statistics.

  Returns:
    (graphs_data, labels_data, graph_idx): parallel lists holding the
    ``networkx.DiGraph`` objects, the ``is_ponzi`` label of each contract
    and the contract addresses.
  """
  graphs_data = []
  labels_data = []
  graph_idx = []
  n_rows = 10*step
  # unique() keeps first-appearance order, so the order of the returned graphs
  # (and any positional train/test split made from it) is the same on every
  # run; iterating a set of strings is not stable across interpreter sessions.
  for contract in complete_dataset['contract_address'].unique():
    temp_df = complete_dataset[complete_dataset['contract_address']==contract].reset_index(drop=True).head(n_rows)
    if temp_df.shape[0]<n_rows:
      continue
    G=nx.DiGraph()

    # ---- node features -------------------------------------------------
    node_attributes = pd.merge(transaction_values_sent(temp_df), transaction_values_received(temp_df), on='identifier', how='outer')
    node_attributes['account_balance']=node_attributes.loc[:, ('Value_usd_x', 'sum')] - node_attributes.loc[:, ('Value_usd_y', 'sum')]
    node_attributes = pd.merge(node_attributes, errors_sent(temp_df), on='identifier', how='outer')
    node_attributes = pd.merge(node_attributes, errors_received(temp_df), on='identifier', how='outer')
    node_attributes = pd.merge(node_attributes, lifetimes_sent(temp_df), on='identifier', how='outer')
    node_attributes = pd.merge(node_attributes, lifetimes_received(temp_df), on='identifier', how='outer')
    node_attributes = pd.merge(node_attributes, transaction_costs_sent(temp_df), on='identifier', how='outer')
    # NOTE(review): assumes the last six columns of the table are the
    # cluster_0..cluster_5 flags - confirm if the CSV layout changes.
    cluster_data = temp_df.iloc[0, -6:].tolist()
    node_attributes['cluster_0']=cluster_data[0]
    node_attributes['cluster_1']=cluster_data[1]
    node_attributes['cluster_2']=cluster_data[2]
    node_attributes['cluster_3']=cluster_data[3]
    node_attributes['cluster_4']=cluster_data[4]
    node_attributes['cluster_5']=cluster_data[5]
    node_attributes=node_attributes.fillna(0)

    # ---- edges with their features --------------------------------------
    aggregated_edges = aggregate_edges(temp_df)
    for j in range(aggregated_edges.shape[0]):
      from_node = aggregated_edges.loc[j, 'from']
      to_node = aggregated_edges.loc[j, 'to']
      edge_characts = aggregated_edges.loc[j, 'value']
      G.add_edge(from_node, to_node, edge_attr=edge_characts)

    # attach the feature vector (everything but the identifier) to each node
    for idx in node_attributes['identifier']:
      attrs = {idx: {'x':node_attributes[node_attributes['identifier']==idx].iloc[0, :].tolist()[1:]}}
      nx.set_node_attributes(G, attrs)

    # ---- contract-level features ----------------------------------------
    temp_df['value_correct_sign'] = temp_df.apply(sign_calculator, axis=1)
    temp_df_internal = temp_df[temp_df['isinternal']==1]
    temp_df_external = temp_df[temp_df['isinternal']==0]
    graph_attributes = []
    # transactions per user (external rows, then internal rows): 7 + 7
    graph_attributes += number_transacts_user(temp_df_external)
    graph_attributes += number_transacts_user(temp_df_internal)
    # error statistics (external, internal): 6 + 6
    graph_attributes += errors_calculator(temp_df_external)
    graph_attributes += errors_calculator(temp_df_internal)
    # distinct users (external, internal): 2
    graph_attributes += [distinct_users(temp_df_external), distinct_users(temp_df_internal)]
    # transaction values (external, internal): 8 + 8
    graph_attributes += transaction_value(temp_df_external)
    graph_attributes += transaction_value(temp_df_internal)
    # age_minutes over all rows: 6
    graph_attributes += lifetime_calculator(temp_df)
    # per-user signed flows: 7 + 7
    graph_attributes += inflows_from_users(temp_df_external)
    graph_attributes += outflows_to_users(temp_df_internal)
    # transaction costs of the internal rows: 8
    graph_attributes += transaction_costs_contract(temp_df_internal)
    # transaction costs of the external rows: total + 7 per-user statistics.
    # The total used to be a stale value left over from
    # transaction_costs_contract above (a duplicated feature); it is now the
    # total cost of the external rows, keeping the vector length at 80.
    users_total_cost = temp_df_external['Transaction_cost_usd'].sum()
    graph_attributes += [users_total_cost, *transaction_costs_users(temp_df_external)]
    G.graph.update({'contract_features':graph_attributes})

    label = temp_df.is_ponzi.iloc[0]
    graphs_data.append(G)
    labels_data.append(label)
    graph_idx.append(contract)
  return graphs_data, labels_data, graph_idx

In [7]:
class FraudDataset(InMemoryDataset):
    """Dataset view over parallel lists of networkx graphs, labels and ids.

    Each item is converted with ``from_networkx`` at access time (nothing is
    cached) and gets two extra entries: ``label`` and ``idx``.
    """

    def __init__(self, graphs, labels, idxs):
        # Only the three lists are stored; the parent initialiser is not called.
        self.graphs, self.labels, self.idxs = graphs, labels, idxs

    def __len__(self):
        """Number of graphs held by the dataset."""
        return len(self.graphs)

    def __getitem__(self, index):
        """Convert the graph at ``index`` and attach its label and identifier."""
        graph, label = self.graphs[index], self.labels[index]
        data = from_networkx(graph)
        data['label'] = label
        data['idx'] = self.idxs[index]
        return data

    def get_labels(self):
        """Return the list of labels passed to the constructor."""
        return self.labels

In [8]:
def data_splitter(graphs_data, labels_data, graph_idx):
  """Split the graphs 75/25 by position and wrap both parts in DataLoaders.

  The split is drawn with ``train_test_split(..., random_state=42)`` over
  the positions of ``labels_data``. Returns ``(train_dataloader,
  test_dataloader)``; the train loader shuffles with batch size 500, the
  test loader keeps order with batch size 100.
  """
  positions = pd.DataFrame({'label':labels_data}).reset_index()
  train, test = train_test_split(positions, test_size=0.25, random_state=42)

  def build_dataset(rows):
    # gather graphs, labels and ids at the selected positions
    picked = rows['index'].tolist()
    return FraudDataset(
        [graphs_data[i] for i in picked],
        [labels_data[i] for i in picked],
        [graph_idx[i] for i in picked],
    )

  dataset_train = build_dataset(train)
  dataset_test = build_dataset(test)
  train_dataloader = DataLoader(dataset_train, batch_size=500, shuffle=True)
  test_dataloader = DataLoader(dataset_test, batch_size=100, shuffle=False)
  return train_dataloader, test_dataloader

### GCN
* mean aggregation captures the distribution (or proportions) of elements
* max aggregation proves to be advantageous to identify representative elements
* sum aggregation enables the learning of structural graph properties

In [12]:
class GCN_mean(torch.nn.Module):
    """One-layer GCN with mean pooling, combined with contract-level features.

    Node features go through a single ``GCNConv`` + ReLU and are mean-pooled
    per graph; the pooled vector is concatenated with the graph's
    ``contract_features`` and passed through two linear layers (with dropout
    in between) to produce log-probabilities for two classes.

    Args:
        hidden_channels: width of the GCN layer and of the first linear layer.
        dropout_frac: dropout probability applied before the output layer.
        num_node_features: length of each node's ``x`` vector (default 41).
        num_contract_features: length of each graph's ``contract_features``
            vector (default 80).
    """
    def __init__(self, hidden_channels, dropout_frac=0.5, num_node_features=41, num_contract_features=80):
        super(GCN_mean, self).__init__()
        # Fixed seed so every instance starts from the same initial weights.
        torch.manual_seed(42)
        self.num_contract_features = num_contract_features
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.lin1 = Linear(hidden_channels+num_contract_features, hidden_channels)
        self.lin = Linear(hidden_channels, 2) #2=number of classes
        self.dropout_frac=dropout_frac

    def forward(self, data):
        """Return per-graph log-probabilities of shape (number of graphs, 2)."""
        x=data.x.to(torch.float32)
        x = self.conv1(x, data.edge_index)
        x = x.relu()
        x = global_mean_pool(x, data.batch)
        contract_feats=data.contract_features.to(torch.float32)
        # one row of contract-level features per graph in the batch
        contract_feats = contract_feats.reshape(len(data.label), self.num_contract_features)
        # undefined statistics (e.g. std of a single value) arrive as NaN
        contract_feats[torch.isnan(contract_feats)] = 0
        x = torch.cat((x, contract_feats), dim=1)
        x =self.lin1(x)
        x = F.dropout(x, p=self.dropout_frac, training=self.training)
        x = self.lin(x)
        return F.log_softmax(x, dim=1)

def train(model, train_dataloader, device, optimizer):
    """Run one epoch of training over ``train_dataloader``.

    Puts ``model`` in training mode and, for every batch, moves it to
    ``device``, computes the negative log-likelihood loss between the model
    output and ``batch.label``, back-propagates and takes one optimizer
    step. Returns nothing.
    """
    model.train()

    for batch in train_dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()  # drop gradients left over from the previous batch
        log_probs = model(batch)
        targets = torch.tensor(batch.label).to(device)
        batch_loss = F.nll_loss(log_probs, targets)
        batch_loss.backward()
        optimizer.step()


def test(model, loader, device):
  """Evaluate ``model`` on ``loader``.

  Returns ``(recall, precision, roc_auc, f1)``. The hard predictions are the
  arg-max of the model output; the ROC-AUC is computed from the class-1
  column of the output (log-probabilities, which rank like probabilities).
  """
  model.eval()
  preds = []
  outs = []
  true_y=[]
  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    for data in loader:
      data = data.to(device)
      out = model(data)
      preds += out.argmax(dim=1).tolist()
      # labels may be collated as a list or as a tensor; normalise to ints
      true_y += [int(label) for label in data.label]
      outs += out.tolist()
  outs=np.array(outs)
  return recall_score(true_y, preds), precision_score(true_y, preds), roc_auc_score(true_y,outs[:,1]), f1_score(true_y, preds)

In [10]:
def do_step_mean(train_dataloader, test_dataloader, step, metrics_df=None):
  """Grid-search ``GCN_mean`` for one step and record the best test metrics.

  Trains one model for 500 epochs per (dropout, hidden size) combination,
  keeps the metrics of the combination with the highest test F1 and writes
  the accumulated table to ``Final_metrics_10_runs/metrics_GCN_mean.csv``.

  Note: the configuration is selected on the test set itself, so the
  reported metrics are an optimistic estimate.

  Args:
    train_dataloader, test_dataloader: loaders for this step.
    step: value stored in the ``Step`` column.
    metrics_df: table returned by the previous call; ``None`` or an empty
      frame starts a new table.

  Returns:
    The metrics table including the row for this step.
  """
  import os  # local import: only needed to create the output directory

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  drop_rates = [0.1,0.2,0.3,0.4]
  layer_dims=[16,32,64,128]
  best_f1 = 0
  best_recall = 0
  best_precision = 0
  best_auroc = 0

  for drop_rate in drop_rates:
    for layer in layer_dims:
      model_mean = GCN_mean(hidden_channels=layer, dropout_frac=drop_rate).to(device)
      optimizer = torch.optim.Adam(model_mean.parameters(), lr=0.01, weight_decay=1e-5)
      for _ in range(1, 501):
          train(model_mean, train_dataloader, device, optimizer)

      test_recall, test_precision, test_auroc, test_f1 = test(model_mean, test_dataloader, device)
      if test_f1>best_f1:
        best_f1=test_f1
        best_recall=test_recall
        best_precision=test_precision
        best_auroc=test_auroc

  current_metrics_df = pd.DataFrame({
      'Step': [step],
      'Recall': [best_recall],
      'Precision': [best_precision],
      'Auroc': [best_auroc],
      'F1': [best_f1],
  })
  if metrics_df is None or metrics_df.shape[0]==0:
    result_df = current_metrics_df
  else:
    result_df = pd.concat([metrics_df, current_metrics_df])

  out_path = 'Final_metrics_10_runs/metrics_GCN_mean.csv'
  # to_csv does not create missing folders; make sure the target exists
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
  result_df.to_csv(out_path)
  return result_df

In [13]:
# Run the experiment for steps 1..10, threading the accumulated metrics table
# through do_step_mean (an empty table makes the first call start a new one).
metrics_df_mean = pd.DataFrame()
for step in tqdm(range(1,11)):
    graphs_data, labels_data, graph_idx = create_data(step)
    train_dataloader, test_dataloader = data_splitter(graphs_data, labels_data, graph_idx)
    metrics_df_mean = do_step_mean(train_dataloader, test_dataloader, step, metrics_df_mean)

100%|██████████| 10/10 [26:05:14<00:00, 9391.47s/it]  


### Teaug

In [24]:
# Number of rows recorded for every contract.
td = complete_dataset.groupby('contract_address')[['from']].count().reset_index()


def _contracts_in_bucket(lower, upper=None):
    """Addresses of contracts with ``lower <= row count < upper`` (no upper bound if None)."""
    mask = td['from']>=lower
    if upper is not None:
        mask = mask & (td['from']<upper)
    return td[mask]['contract_address']


def _labelled_addresses(bucket):
    """One shuffled (contract_address, is_ponzi) row per contract in ``bucket``.

    The shuffle is seeded so the split below is reproducible across runs.
    """
    rows = complete_dataset[complete_dataset['contract_address'].isin(bucket)][['contract_address', 'is_ponzi']]
    rows = rows.drop_duplicates('contract_address').reset_index(drop=True)
    return rows.sample(frac=1, random_state=42).reset_index()


def _stratified_split(addresses):
    """75/25 train/test split of ``addresses``, stratified on ``is_ponzi``."""
    return train_test_split(addresses, test_size=0.25, random_state=42, stratify=addresses['is_ponzi'])


# Contracts bucketed by row count: [10,20), [20,30), ..., [90,100), >=100.
td1 = _contracts_in_bucket(10, 20)
td2 = _contracts_in_bucket(20, 30)
td3 = _contracts_in_bucket(30, 40)
td4 = _contracts_in_bucket(40, 50)
td5 = _contracts_in_bucket(50, 60)
td6 = _contracts_in_bucket(60, 70)
td7 = _contracts_in_bucket(70, 80)
td8 = _contracts_in_bucket(80, 90)
td9 = _contracts_in_bucket(90, 100)
td10 = _contracts_in_bucket(100)

# Each bucket is split independently, so a contract stays on the same side
# (train or test) at every step.
addresses_step1 = _labelled_addresses(td1)
train_df1, test_df1 = _stratified_split(addresses_step1)
addresses_step2 = _labelled_addresses(td2)
train_df2, test_df2 = _stratified_split(addresses_step2)
addresses_step3 = _labelled_addresses(td3)
train_df3, test_df3 = _stratified_split(addresses_step3)
addresses_step4 = _labelled_addresses(td4)
train_df4, test_df4 = _stratified_split(addresses_step4)
addresses_step5 = _labelled_addresses(td5)
train_df5, test_df5 = _stratified_split(addresses_step5)
addresses_step6 = _labelled_addresses(td6)
train_df6, test_df6 = _stratified_split(addresses_step6)
addresses_step7 = _labelled_addresses(td7)
train_df7, test_df7 = _stratified_split(addresses_step7)
addresses_step8 = _labelled_addresses(td8)
train_df8, test_df8 = _stratified_split(addresses_step8)
addresses_step9 = _labelled_addresses(td9)
train_df9, test_df9 = _stratified_split(addresses_step9)
addresses_step10 = _labelled_addresses(td10)
train_df10, test_df10 = _stratified_split(addresses_step10)

In [25]:
def train_test_df_calculator(step):
    """Return the train and test address tables that apply at ``step``.

    For steps 1 to 9 the tables of bucket ``step`` and every later bucket
    are concatenated row-wise. For any other value the tables of bucket 10
    are returned as they are.
    """
    train_parts = [train_df1, train_df2, train_df3, train_df4, train_df5,
                   train_df6, train_df7, train_df8, train_df9, train_df10]
    test_parts = [test_df1, test_df2, test_df3, test_df4, test_df5,
                  test_df6, test_df7, test_df8, test_df9, test_df10]
    if step in range(1, 10):
        first = int(step) - 1  # bucket k lives at list position k-1
        train_dfs = pd.concat(train_parts[first:], axis=0)
        test_dfs = pd.concat(test_parts[first:], axis=0)
        return train_dfs, test_dfs
    return train_df10, test_df10

In [31]:
# Replace every missing value in the transaction table with 0 (rebinds the
# module-level name that create_data_teaug reads).
# NOTE(review): the same statement appears earlier in the notebook; this one
# has no further effect if that cell already ran.
complete_dataset=complete_dataset.fillna(0)

def create_data_teaug(step,data_togen):
  """Build one directed graph per contract listed in ``data_togen``.

  Works like ``create_data`` but only for the contracts found in
  ``data_togen['contract_address']``; the rows themselves are read from the
  module-level ``complete_dataset`` and truncated to the first ``10*step``.
  Contracts with fewer than ``10*step`` rows are skipped. For each
  remaining contract:

  * nodes carry ``x``: 41 per-address statistics (values sent/received,
    balance, errors, lifetimes, costs) plus the six cluster flags taken from
    the last six columns of the contract's first row;
  * edges carry ``edge_attr``: the net value produced by ``aggregate_edges``;
  * the graph carries ``contract_features``: 80 contract-level statistics.

  Returns:
    (graphs_data, labels_data, graph_idx): parallel lists holding the
    ``networkx.DiGraph`` objects, the ``is_ponzi`` label of each contract
    and the contract addresses.
  """
  graphs_data = []
  labels_data = []
  graph_idx = []
  n_rows = 10*step
  # unique() keeps first-appearance order, so the order of the returned graphs
  # is the same on every run; iterating a set of strings is not stable across
  # interpreter sessions.
  for contract in data_togen['contract_address'].unique():
    temp_df = complete_dataset[complete_dataset['contract_address']==contract].reset_index(drop=True).head(n_rows)
    if temp_df.shape[0]<n_rows:
      continue
    G=nx.DiGraph()

    # ---- node features -------------------------------------------------
    node_attributes = pd.merge(transaction_values_sent(temp_df), transaction_values_received(temp_df), on='identifier', how='outer')
    node_attributes['account_balance']=node_attributes.loc[:, ('Value_usd_x', 'sum')] - node_attributes.loc[:, ('Value_usd_y', 'sum')]
    node_attributes = pd.merge(node_attributes, errors_sent(temp_df), on='identifier', how='outer')
    node_attributes = pd.merge(node_attributes, errors_received(temp_df), on='identifier', how='outer')
    node_attributes = pd.merge(node_attributes, lifetimes_sent(temp_df), on='identifier', how='outer')
    node_attributes = pd.merge(node_attributes, lifetimes_received(temp_df), on='identifier', how='outer')
    node_attributes = pd.merge(node_attributes, transaction_costs_sent(temp_df), on='identifier', how='outer')
    # NOTE(review): assumes the last six columns of the table are the
    # cluster_0..cluster_5 flags - confirm if the CSV layout changes.
    cluster_data = temp_df.iloc[0, -6:].tolist()
    node_attributes['cluster_0']=cluster_data[0]
    node_attributes['cluster_1']=cluster_data[1]
    node_attributes['cluster_2']=cluster_data[2]
    node_attributes['cluster_3']=cluster_data[3]
    node_attributes['cluster_4']=cluster_data[4]
    node_attributes['cluster_5']=cluster_data[5]
    node_attributes=node_attributes.fillna(0)

    # ---- edges with their features --------------------------------------
    aggregated_edges = aggregate_edges(temp_df)
    for j in range(aggregated_edges.shape[0]):
      from_node = aggregated_edges.loc[j, 'from']
      to_node = aggregated_edges.loc[j, 'to']
      edge_characts = aggregated_edges.loc[j, 'value']
      G.add_edge(from_node, to_node, edge_attr=edge_characts)

    # attach the feature vector (everything but the identifier) to each node
    for idx in node_attributes['identifier']:
      attrs = {idx: {'x':node_attributes[node_attributes['identifier']==idx].iloc[0, :].tolist()[1:]}}
      nx.set_node_attributes(G, attrs)

    # ---- contract-level features ----------------------------------------
    temp_df['value_correct_sign'] = temp_df.apply(sign_calculator, axis=1)
    temp_df_internal = temp_df[temp_df['isinternal']==1]
    temp_df_external = temp_df[temp_df['isinternal']==0]
    graph_attributes = []
    # transactions per user (external rows, then internal rows): 7 + 7
    graph_attributes += number_transacts_user(temp_df_external)
    graph_attributes += number_transacts_user(temp_df_internal)
    # error statistics (external, internal): 6 + 6
    graph_attributes += errors_calculator(temp_df_external)
    graph_attributes += errors_calculator(temp_df_internal)
    # distinct users (external, internal): 2
    graph_attributes += [distinct_users(temp_df_external), distinct_users(temp_df_internal)]
    # transaction values (external, internal): 8 + 8
    graph_attributes += transaction_value(temp_df_external)
    graph_attributes += transaction_value(temp_df_internal)
    # age_minutes over all rows: 6
    graph_attributes += lifetime_calculator(temp_df)
    # per-user signed flows: 7 + 7
    graph_attributes += inflows_from_users(temp_df_external)
    graph_attributes += outflows_to_users(temp_df_internal)
    # transaction costs of the internal rows: 8
    graph_attributes += transaction_costs_contract(temp_df_internal)
    # transaction costs of the external rows: total + 7 per-user statistics.
    # The total used to be a stale value left over from
    # transaction_costs_contract above (a duplicated feature); it is now the
    # total cost of the external rows, keeping the vector length at 80.
    users_total_cost = temp_df_external['Transaction_cost_usd'].sum()
    graph_attributes += [users_total_cost, *transaction_costs_users(temp_df_external)]
    G.graph.update({'contract_features':graph_attributes})

    label = temp_df.is_ponzi.iloc[0]
    graphs_data.append(G)
    labels_data.append(label)
    graph_idx.append(contract)
  return graphs_data, labels_data, graph_idx

In [32]:
def data_splitter_teaug(step, prev_train_data_graph=list(), prev_train_data_label=None, prev_train_data_idx=None):
  """Build train/test loaders for ``step``, folding earlier training data into the train side.

  The step's train and test frames come from ``train_test_df_calculator`` and
  are each converted into graphs, labels and ids by ``create_data_teaug``.
  When ``prev_train_data_graph`` is non-empty, the three ``prev_*`` sequences
  are appended to this step's training lists before the datasets are built.

  Returns:
    (train_dataloader, test_dataloader, train_data_graph, train_data_label,
    train_data_idx). The train loader uses batch size 500 with shuffling, the
    test loader batch size 100 without. The last three items are the combined
    training lists, returned so the caller can pass them back in on the next
    step.
  """
  train_df, test_df = train_test_df_calculator(step)
  train_data_graph, train_data_label, train_data_idx = create_data_teaug(step, train_df)
  test_graphs, test_labels, test_ids = create_data_teaug(step, test_df)

  # Only merge when the caller actually supplied earlier training data.
  has_history = len(prev_train_data_graph) > 0
  if has_history:
    train_data_graph += prev_train_data_graph
    train_data_label += prev_train_data_label
    train_data_idx += prev_train_data_idx

  train_set = FraudDataset(train_data_graph, train_data_label, train_data_idx)
  test_set = FraudDataset(test_graphs, test_labels, test_ids)
  loaders = (
    DataLoader(train_set, batch_size=500, shuffle=True),
    DataLoader(test_set, batch_size=100, shuffle=False),
  )
  return loaders[0], loaders[1], train_data_graph, train_data_label, train_data_idx

In [33]:
def do_step_mean_teaug(train_dataloader, test_dataloader, step, metrics_df = pd.DataFrame()):
  """Grid-search ``GCN_mean`` for one step and log the best test-set metrics.

  Each (dropout, hidden size) pair from a fixed 4x4 grid is trained from
  scratch for 500 passes of ``train`` and scored once with ``test``. The
  scores of the configuration with the highest F1 become this step's row.

  NOTE(review): the winning configuration is chosen on ``test_dataloader``
  itself, so the logged numbers are likely optimistic; a held-out validation
  split would be needed for an unbiased estimate.

  Args:
    train_dataloader: batches passed to ``train``.
    test_dataloader: batches passed to ``test``.
    step: value stored in the ``Step`` column of the new row.
    metrics_df: metrics accumulated over earlier steps. When it has no rows,
      the result contains only this step.

  Returns:
    DataFrame with columns Step, Recall, Precision, Auroc, F1: the rows of
    ``metrics_df`` followed by this step's row. The same frame is written to
    'Final_metrics_10_runs/metrics_GCN_mean_TEAUG.csv'.
  """
  import os  # function-scope import: only needed for the output-folder check

  out_path = 'Final_metrics_10_runs/metrics_GCN_mean_TEAUG.csv'
  # Create the output folder before the (very long) grid search so a missing
  # directory cannot discard the results at the final to_csv call.
  os.makedirs(os.path.dirname(out_path), exist_ok=True)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  drop_rates = [0.1,0.2,0.3,0.4]
  layer_dims=[16,32,64,128]
  # Start below any attainable F1 so the first configuration is always
  # recorded. Starting at 0 with a strict '>' reported AUROC as 0 whenever
  # every configuration scored F1 == 0.
  best_f1 = float('-inf')
  best_recall = 0
  best_precision = 0
  best_auroc = 0

  for drop_rate in drop_rates:
    for layer in layer_dims:
      model_mean = GCN_mean(hidden_channels=layer, dropout_frac=drop_rate).to(device)
      optimizer = torch.optim.Adam(model_mean.parameters(), lr=0.01, weight_decay=1e-5)
      for _ in range(1, 501):
        train(model_mean, train_dataloader, device, optimizer)

      test_recall, test_precision, test_auroc, test_f1 = test(model_mean, test_dataloader, device)
      if test_f1>best_f1:
        best_f1=test_f1
        best_recall=test_recall
        best_precision=test_precision
        best_auroc=test_auroc

  current_metrics_df = pd.DataFrame({
    'Step': [step],
    'Recall': [best_recall],
    'Precision': [best_precision],
    'Auroc': [best_auroc],
    'F1': [best_f1],
  })
  # First step: the log is just this row; afterwards append to the history.
  if metrics_df.shape[0]==0:
    result_df = current_metrics_df
  else:
    result_df = pd.concat([metrics_df, current_metrics_df])
  result_df.to_csv(out_path)
  return result_df

In [41]:
# Run the ten TEAUG steps for the GCN model. The training lists returned by one
# step are fed back into the next, and the metrics log is extended each time.
for step in tqdm(range(1, 11)):
    if step != 1:
        # Later steps: pass along the training data and metrics gathered so far.
        (train_dataloader, test_dataloader,
         train_data_graph, train_data_label, train_data_idx) = data_splitter_teaug(
            step, train_data_graph, train_data_label, train_data_idx)
        metrics_df_mean = do_step_mean_teaug(train_dataloader, test_dataloader, step, metrics_df_mean)
        continue

    # First step: nothing to carry over, so start both the data and the log fresh.
    (train_dataloader, test_dataloader,
     train_data_graph, train_data_label, train_data_idx) = data_splitter_teaug(step)
    metrics_df_mean = do_step_mean_teaug(train_dataloader, test_dataloader, step)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [129:32:41<00:00, 46636.16s/it]  


## GATConv

In [17]:
from torch_geometric.nn import GATConv
class GAT_mean(torch.nn.Module):
    """Graph classifier: one GAT layer, mean pooling, then a two-layer linear head.

    The pooled node embedding of each graph is concatenated with that graph's
    contract-level feature vector before the linear layers. The output is
    log-probabilities over 2 classes.

    Args:
        hidden_channels: output width of the GAT layer and of the first
            linear layer.
        dropout_frac: dropout probability applied between the two linear
            layers (active only in training mode).
        num_node_features: length of each node's feature vector. Defaults to
            41, the value previously hard-coded.
        num_contract_features: number of graph-level contract features per
            graph. Defaults to 80, the value previously hard-coded.
    """

    def __init__(self, hidden_channels, dropout_frac=0.5, num_node_features=41, num_contract_features=80):
        super().__init__()
        # Re-seeding here resets the global torch RNG, so two models built with
        # the same sizes start from the same initial weights.
        torch.manual_seed(42)
        self.num_contract_features = num_contract_features
        self.conv1 = GATConv(num_node_features, hidden_channels, heads=1)
        self.lin1 = Linear(hidden_channels + num_contract_features, hidden_channels)
        self.lin = Linear(hidden_channels, 2)  # 2 = number of classes
        self.dropout_frac = dropout_frac

    def forward(self, data):
        """Return per-graph log-probabilities for a batch.

        Reads ``data.x``, ``data.edge_index``, ``data.batch``,
        ``data.contract_features`` and ``data.label``. ``len(data.label)`` is
        used as the number of graphs when reshaping the contract features.
        """
        x = data.x.to(torch.float32)
        x = self.conv1(x, data.edge_index)
        x = x.relu()
        x = global_mean_pool(x, data.batch)

        contract_feats = data.contract_features.to(torch.float32)
        contract_feats = contract_feats.reshape(len(data.label), self.num_contract_features)
        # Replace NaNs with 0 out of place, so the tensor stored on the batch is
        # left untouched (the previous indexed assignment could write into it).
        contract_feats = contract_feats.masked_fill(torch.isnan(contract_feats), 0.0)

        x = torch.cat((x, contract_feats), dim=1)
        x = self.lin1(x)
        x = F.dropout(x, p=self.dropout_frac, training=self.training)
        x = self.lin(x)
        return F.log_softmax(x, dim=1)

def train(model, train_dataloader, device, optimizer):
    """Run one pass over ``train_dataloader``, stepping the optimizer after every batch.

    The model is switched to training mode. Each batch is moved to ``device``,
    the model output is scored against ``batch.label`` with the negative
    log-likelihood loss, and the optimizer is stepped. Nothing is returned.
    """
    model.train()

    for batch in train_dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        log_probs = model(batch)
        # Labels are read from the batch and turned into a tensor on the target device.
        targets = torch.tensor(batch.label).to(device)
        F.nll_loss(log_probs, targets).backward()
        optimizer.step()


def test(model, loader, device):
  """Evaluate ``model`` on ``loader`` and return (recall, precision, AUROC, F1).

  The model is put in eval mode and run over every batch. The predicted class
  is the argmax of the model output, and column 1 of the output is used as the
  score for AUROC. True labels are read from each batch's ``label`` attribute.

  Args:
    model: module called as ``model(batch)``, returning one row of two
      class scores per graph.
    loader: iterable of batches exposing ``to(device)`` and ``label``.
    device: device the batches are moved to.

  Returns:
    Tuple ``(recall, precision, auroc, f1)`` computed over all batches.
  """
  model.eval()
  preds = []
  outs = []
  true_y = []
  # Inference only: disabling autograd avoids building a graph for every
  # batch, which saves memory and time without changing the outputs.
  with torch.no_grad():
    for data in loader:
      data = data.to(device)
      out = model(data)
      preds += out.argmax(dim=1).tolist()
      true_y += data.label
      outs += out.tolist()
  outs = np.array(outs)
  return recall_score(true_y, preds), precision_score(true_y, preds), roc_auc_score(true_y,outs[:,1]), f1_score(true_y, preds)

In [18]:
def do_step_gat_mean(train_dataloader, test_dataloader, step, metrics_df = pd.DataFrame()):
  """Grid-search ``GAT_mean`` for one step and log the best test-set metrics.

  Each (dropout, hidden size) pair from a fixed 4x4 grid is trained from
  scratch for 500 passes of ``train`` and scored once with ``test``. The
  scores of the configuration with the highest F1 become this step's row.

  NOTE(review): the winning configuration is chosen on ``test_dataloader``
  itself, so the logged numbers are likely optimistic; a held-out validation
  split would be needed for an unbiased estimate.

  Args:
    train_dataloader: batches passed to ``train``.
    test_dataloader: batches passed to ``test``.
    step: value stored in the ``Step`` column of the new row.
    metrics_df: metrics accumulated over earlier steps. When it has no rows,
      the result contains only this step.

  Returns:
    DataFrame with columns Step, Recall, Precision, Auroc, F1: the rows of
    ``metrics_df`` followed by this step's row. The same frame is written to
    'Final_metrics_10_runs/metrics_GAT_mean.csv'.
  """
  import os  # function-scope import: only needed for the output-folder check

  out_path = 'Final_metrics_10_runs/metrics_GAT_mean.csv'
  # Create the output folder before the (very long) grid search so a missing
  # directory cannot discard the results at the final to_csv call.
  os.makedirs(os.path.dirname(out_path), exist_ok=True)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  drop_rates = [0.1,0.2,0.3,0.4]
  layer_dims=[16,32,64,128]
  # Start below any attainable F1 so the first configuration is always
  # recorded. Starting at 0 with a strict '>' reported AUROC as 0 whenever
  # every configuration scored F1 == 0.
  best_f1 = float('-inf')
  best_recall = 0
  best_precision = 0
  best_auroc = 0

  for drop_rate in drop_rates:
    for layer in layer_dims:
      model_mean = GAT_mean(hidden_channels=layer, dropout_frac=drop_rate).to(device)
      optimizer = torch.optim.Adam(model_mean.parameters(), lr=0.01, weight_decay=1e-5)
      for _ in range(1, 501):
        train(model_mean, train_dataloader, device, optimizer)

      test_recall, test_precision, test_auroc, test_f1 = test(model_mean, test_dataloader, device)
      if test_f1>best_f1:
        best_f1=test_f1
        best_recall=test_recall
        best_precision=test_precision
        best_auroc=test_auroc

  current_metrics_df = pd.DataFrame({
    'Step': [step],
    'Recall': [best_recall],
    'Precision': [best_precision],
    'Auroc': [best_auroc],
    'F1': [best_f1],
  })
  # First step: the log is just this row; afterwards append to the history.
  if metrics_df.shape[0]==0:
    result_df = current_metrics_df
  else:
    result_df = pd.concat([metrics_df, current_metrics_df])
  result_df.to_csv(out_path)
  return result_df

In [19]:
# Run the ten steps for the GAT model. Each step builds its own graphs and
# train/test split; only the metrics log is carried from one step to the next.
for step in tqdm(range(1, 11)):
    graphs_data, labels_data, graph_idx = create_data(step)
    train_dataloader, test_dataloader = data_splitter(graphs_data, labels_data, graph_idx)
    if step != 1:
        # Extend the log produced by the previous iteration.
        metrics_df_mean = do_step_gat_mean(train_dataloader, test_dataloader, step, metrics_df_mean)
    else:
        # First step starts a fresh log.
        metrics_df_mean = do_step_gat_mean(train_dataloader, test_dataloader, step)

100%|██████████| 10/10 [28:52:48<00:00, 10396.83s/it] 


In [42]:
def do_step_gat_mean_teaug(train_dataloader, test_dataloader, step, metrics_df = pd.DataFrame()):
  """Grid-search ``GAT_mean`` for one TEAUG step and log the best test-set metrics.

  Each (dropout, hidden size) pair from a fixed 4x4 grid is trained from
  scratch for 500 passes of ``train`` and scored once with ``test``. The
  scores of the configuration with the highest F1 become this step's row.

  NOTE(review): the winning configuration is chosen on ``test_dataloader``
  itself, so the logged numbers are likely optimistic; a held-out validation
  split would be needed for an unbiased estimate.

  Args:
    train_dataloader: batches passed to ``train``.
    test_dataloader: batches passed to ``test``.
    step: value stored in the ``Step`` column of the new row.
    metrics_df: metrics accumulated over earlier steps. When it has no rows,
      the result contains only this step.

  Returns:
    DataFrame with columns Step, Recall, Precision, Auroc, F1: the rows of
    ``metrics_df`` followed by this step's row. The same frame is written to
    'Final_metrics_10_runs/metrics_GAT_mean_TEAUG.csv'.
  """
  import os  # function-scope import: only needed for the output-folder check

  out_path = 'Final_metrics_10_runs/metrics_GAT_mean_TEAUG.csv'
  # Create the output folder before the (very long) grid search so a missing
  # directory cannot discard the results at the final to_csv call.
  os.makedirs(os.path.dirname(out_path), exist_ok=True)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  drop_rates = [0.1,0.2,0.3,0.4]
  layer_dims=[16,32,64,128]
  # Start below any attainable F1 so the first configuration is always
  # recorded. Starting at 0 with a strict '>' reported AUROC as 0 whenever
  # every configuration scored F1 == 0.
  best_f1 = float('-inf')
  best_recall = 0
  best_precision = 0
  best_auroc = 0

  for drop_rate in drop_rates:
    for layer in layer_dims:
      model_mean = GAT_mean(hidden_channels=layer, dropout_frac=drop_rate).to(device)
      optimizer = torch.optim.Adam(model_mean.parameters(), lr=0.01, weight_decay=1e-5)
      for _ in range(1, 501):
        train(model_mean, train_dataloader, device, optimizer)

      test_recall, test_precision, test_auroc, test_f1 = test(model_mean, test_dataloader, device)
      if test_f1>best_f1:
        best_f1=test_f1
        best_recall=test_recall
        best_precision=test_precision
        best_auroc=test_auroc

  current_metrics_df = pd.DataFrame({
    'Step': [step],
    'Recall': [best_recall],
    'Precision': [best_precision],
    'Auroc': [best_auroc],
    'F1': [best_f1],
  })
  # First step: the log is just this row; afterwards append to the history.
  if metrics_df.shape[0]==0:
    result_df = current_metrics_df
  else:
    result_df = pd.concat([metrics_df, current_metrics_df])
  result_df.to_csv(out_path)
  return result_df

In [43]:
# Run the ten TEAUG steps for the GAT model. The training lists returned by one
# step are fed back into the next, and the metrics log is extended each time.
for step in tqdm(range(1, 11)):
    if step != 1:
        # Later steps: pass along the training data and metrics gathered so far.
        (train_dataloader, test_dataloader,
         train_data_graph, train_data_label, train_data_idx) = data_splitter_teaug(
            step, train_data_graph, train_data_label, train_data_idx)
        metrics_df_mean = do_step_gat_mean_teaug(train_dataloader, test_dataloader, step, metrics_df_mean)
        continue

    # First step: nothing to carry over, so start both the data and the log fresh.
    (train_dataloader, test_dataloader,
     train_data_graph, train_data_label, train_data_idx) = data_splitter_teaug(step)
    metrics_df_mean = do_step_gat_mean_teaug(train_dataloader, test_dataloader, step)

100%|██████████| 10/10 [128:33:40<00:00, 46282.06s/it]  
