<a href="https://colab.research.google.com/github/AlessandroPaparella/BR-transaction-classifier/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd

In [None]:
#Get access to gdrive space to import datasets...
#Mounts Google Drive at /content/drive; the dataset CSVs are copied out of that mount in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Import training and validation datasets obtained with previous split (with preprocessing.py)
#The source paths are relative, so the working directory must be the one that contains the
#"drive" mount point (Drive is mounted at /content/drive).
!cp drive/MyDrive/training.csv ./training.csv
!cp drive/MyDrive/validation.csv ./validation.csv

#Hard-coded dataset sizes, used by the training cell to derive the number of batches per epoch.
#NOTE(review): presumably the number of data rows in the two CSVs above - confirm they match the files.
TRAINING_LEN = 738390
VALIDATION_LEN = 82044

In [None]:
#Dataset to 3D tensor conversion ([Samples, Time, Features])
#Samples: one transaction (one dataset row) per sample
#Time: the sequence of calls in the transaction trace
#Features: the type, gas, input, output, etc. of each single call

import json, ast, textwrap
import numpy as np

#Number of columns each feature of a single call takes in a feature row
TYPE_LEN = 1
INPUT_LEN = 32
OUTPUT_LEN = 32
GAS_LEN = 1
GASUSED_LEN = 1
CALLS_LEN = 1

#Width of one complete feature row: the sum of all the per-feature widths above
TOTAL_LEN = sum((TYPE_LEN, INPUT_LEN, OUTPUT_LEN, GAS_LEN, GASUSED_LEN, CALLS_LEN))

#Max calls per sample
MAX_CALLS = 150

#Numeric code of every known call type, numbered from 1 in the order listed.
#Types missing from this table are not encoded: DFS() indexes the table directly,
#so an unlisted type raises KeyError there.
calls = {
    name: code
    for code, name in enumerate(
        ("CALL", "STATICCALL", "DELEGATECALL", "CALLCODE", "CREATE", "SELFDESTRUCT"),
        start=1,
    )
}

#Horizontal padding for time dimension
def pad(length=68):
  """Return a fresh all-zero feature row used to fill unused time steps.

  Args:
    length: number of zeros in the row. The default (68) equals TOTAL_LEN,
      i.e. 1 + 32 + 32 + 1 + 1 + 1, the width of a real feature row.

  Returns:
    A new list of `length` zeros. A new list is built on every call so that
    padding rows never alias each other.
  """
  return [0] * length

#Split a large hex number "n" (string format) into "p" 64bit token
def split(n, p):
  """Split the hex string `n` into exactly `p` integer tokens of up to 64 bits.

  Only the first `p * 16` hex digits of `n` are read; anything after them is
  ignored without being parsed. The digits are cut left to right into groups
  of 16 (the last group may be shorter) and each group is parsed as one
  integer. If `n` yields fewer than `p` groups the result is right-padded
  with zeros.

  Args:
    n: hex digits without the "0x" prefix (may be empty).
    p: number of tokens to return.

  Returns:
    A list of `p` ints.

  Raises:
    ValueError: if one of the groups that is actually read is not valid hex.
  """
  digits_per_token = 16  #16 hex digits == 64 bits
  #Slice first so a very long payload is never chunked/parsed beyond what is returned
  head = n[:p * digits_per_token]
  #A 16-hex-digit group is at most 2**64 - 1, so no reduction modulo 2**64 is needed
  tokens = [
      int("0x" + head[start:start + digits_per_token], 0)
      for start in range(0, len(head), digits_per_token)
  ]
  #pad to passed length "p"
  tokens.extend([0] * (p - len(tokens)))
  return tokens

#Join array "tok" into "row"
def insertInRow(tok, row):
  """Append every element of `tok`, in order, to the end of `row`.

  `row` is modified in place and the same list object is returned.
  """
  row.extend(tok)
  return row

#Feature rows collected by DFS() for the trace currently being converted
total_data = []

#Explore trace with DFS and put all calls into a 2d matrix (time x calls)
def DFS(df):
    """Flatten one call-trace tree into feature rows, in depth-first pre-order.

    One row is appended to the module-level `total_data` list for the call
    `df`, then the function recurses into every entry of `df["calls"]` in
    order. Row layout: type code, gas, INPUT_LEN input tokens, OUTPUT_LEN
    output tokens, gasUsed, has-nested-calls flag.

    Args:
        df: dict describing one call. "type" is required; "gas", "input",
            "output", "gasUsed" and "calls" are optional and are encoded as
            zeros (or flag 0) when missing.

    Returns:
        None; the result is the side effect on `total_data`.

    Raises:
        KeyError: if `df["type"]` is not a key of the `calls` table.
    """
    global total_data
    row = [calls[df['type']]]
    row.append(int(df['gas'], 0) if "gas" in df else 0)
    #[2:] drops the leading "0x" of the hex payloads before tokenising them
    row.extend(split(df['input'][2:], INPUT_LEN) if "input" in df else split("0", INPUT_LEN))
    row.extend(split(df['output'][2:], OUTPUT_LEN) if "output" in df else split("0", OUTPUT_LEN))
    #gasUsed is guarded by its own key. Guarding it with "output" zeroed gasUsed for
    #calls without output and raised KeyError when output was present but gasUsed was not.
    row.append(int(df['gasUsed'], 0) if "gasUsed" in df else 0)
    #Bool flag that report if there are other nested calls or not
    has_nested_calls = "calls" in df
    row.append(1 if has_nested_calls else 0)
    total_data.append(row)
    if has_nested_calls:
        for nested in df["calls"]:
            DFS(nested)


def calls_to_tensor(df):
  """Convert every trace in the first column of `df` into one uint64 tensor.

  Each cell of the first column is parsed with ast.literal_eval into a trace
  tree, flattened by DFS() into feature rows, and then padded with pad() rows
  or truncated so that every sample has exactly MAX_CALLS rows.

  Args:
    df: DataFrame whose first column holds the trace as a Python-literal string.

  Returns:
    A tf.uint64 tensor built from the list of samples; for a non-empty frame
    it has one entry per DataFrame row, each with MAX_CALLS feature rows.
  """
  global total_data
  examples=[]
  #itertuples() puts the index at position 0, so d[1] is the first column
  for d in df.itertuples():
    #Get txTrace column into a tree
    txTrace=ast.literal_eval(d[1])
    #DFS() appends into the module-level list, so it is reset for every sample
    total_data=[]
    DFS(txTrace)
    #Pad to MAX_CALLS (no-op when the trace already has MAX_CALLS rows or more)
    missing = MAX_CALLS-len(total_data)
    total_data.extend(pad() for _ in range(missing))
    #Longer traces are truncated to their first MAX_CALLS rows
    examples.append(total_data[:MAX_CALLS])
  return tf.convert_to_tensor(np.array(examples), dtype=tf.uint64)

In [None]:
#Load data for training in batch mode

def load_data(Train_df, idx, batch_size):
    """Read one batch from a CSV file and convert it to model inputs.

    Args:
        Train_df: path of the CSV file to read (passed straight to pd.read_csv).
        idx: zero-based batch index.
        batch_size: maximum number of rows in the batch.

    Returns:
        A tuple (x, y): x is the tensor built by calls_to_tensor() from the
        batch, y is the 'Label0' column as a tf.uint64 tensor. 'Label1' is
        named but not used.
    """
    #The first idx*batch_size lines of the file are skipped. read_csv keeps its default
    #header handling, so the first remaining line is consumed as the header row and is
    #not part of the batch; the names it yields are replaced right below.
    lines_to_skip = idx*batch_size
    batch = pd.read_csv(Train_df, skiprows=lines_to_skip, nrows=batch_size)
    batch.columns = ['txTrace', 'Label0', 'Label1']
    features = calls_to_tensor(batch)
    labels = tf.convert_to_tensor(batch['Label0'], dtype=tf.uint64)
    return (features, labels)

In [None]:
#Connect to cluster TPU and get strategy distribution

import os
#Build a resolver for the TPU whose gRPC address is read from the COLAB_TPU_ADDR
#environment variable (KeyError if the variable is not set).
resolver = tf.distribute.cluster_resolver.TPUClusterResolver('grpc://' + os.environ['COLAB_TPU_ADDR'])
#Connect to the cluster and initialise the TPU system before the strategy is created.
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
#Distribution strategy; the model is created and compiled inside strategy.scope().
#NOTE(review): tf.distribute.experimental.TPUStrategy is a deprecated alias of
#tf.distribute.TPUStrategy in recent TF 2.x releases - confirm against the installed version.
strategy = tf.distribute.experimental.TPUStrategy(resolver) 

In [None]:
#Define model for classification

from keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Input

def create_model():
  """Build the uncompiled binary classifier.

  Architecture: an input of MAX_CALLS time steps with TOTAL_LEN features each,
  a single LSTM layer with 4096 units, and one sigmoid output unit.

  Returns:
    The keras Sequential model (not compiled).
  """
  model = Sequential()
  #Use the shared constants rather than a literal so the model input always matches
  #the (MAX_CALLS, TOTAL_LEN) samples produced by the preprocessing code.
  model.add(Input((MAX_CALLS,TOTAL_LEN)))
  model.add(LSTM(units=4096))
  model.add(Dense(1, activation='sigmoid'))

  return model

In [None]:
#Create model and compile
#Both steps run inside the distribution strategy scope created earlier.
with strategy.scope():
    classification_model = create_model()
    classification_model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=3e-5,
            epsilon=1e-08,
            clipnorm=1.0,
        ),
        #AUC is the only metric, so train_on_batch/test_on_batch return [loss, auc]
        metrics=[tf.keras.metrics.AUC()],
    )

In [None]:
import shutil
#Training
EPOCHS = 6
BATCH_SIZE = 2048

#Number of batches needed to cover each dataset once (the last batch may be partial).
#Stored as int so they can be used in range() and print without a trailing ".0".
steps_per_epoch=int(np.ceil(TRAINING_LEN/BATCH_SIZE))
validation_steps=int(np.ceil(VALIDATION_LEN/BATCH_SIZE))

for e in range(EPOCHS):
  print("Epoch "+str(e+1))
  #Running sums restart at every epoch: they are divided by the step index of the
  #current epoch, so carrying them over would inflate the averages from epoch 2 on.
  t_loss = 0
  t_auc = 0
  for i in range(steps_per_epoch):
    train, y_train = load_data("training.csv", i, BATCH_SIZE)
    #Drop the rows left in the module-level buffer by the conversion of this batch
    total_data = []
    loss, auc = classification_model.train_on_batch(train, y_train)
    t_loss+=loss
    t_auc+=auc
    #Mean loss/AUC over the batches seen so far in this epoch; i+1 is the number of completed steps
    print("\rLoss: "+str(t_loss/(i+1))+" Auc: "+str(t_auc/(i+1))+" Steps: "+str(i+1)+"/"+str(steps_per_epoch),end=' ')
  #Save weights when epoch ends and backup on gdrive space, e.g: class_epoch_1.h5 etc...
  file_name = "./class_epoch_"+str(e+1)+".h5"
  classification_model.save_weights(file_name)
  #Source is absolute and destination is relative: both assume the working directory is /content
  shutil.copy("/content/"+file_name, "drive/MyDrive/"+file_name)
  #Perform validation
  results = []
  for i in range(validation_steps):
    test, y_test = load_data("validation.csv", i, BATCH_SIZE)
    total_data = []
    loss, auc = classification_model.test_on_batch(test, y_test)
    results.append((loss, auc))
  #Unweighted mean over the validation batches (a smaller last batch counts as much as a full one)
  val_loss = sum(batch_loss for batch_loss, _ in results)/len(results)
  val_auc = sum(batch_auc for _, batch_auc in results)/len(results)
  print("val_loss: "+str(val_loss)+" val_auc: "+str(val_auc))

