In [40]:
import seaborn as sns
sns.set(rc={'axes.facecolor': 'dimgrey', 'grid.color': 'lightgrey'})
import pandas as pd
import torch
import matplotlib.pyplot as plt
from torch_geometric.nn import Node2Vec
from sklearn.ensemble import RandomForestClassifier
import torch
from torch_geometric.datasets import Planetoid
from tqdm.notebook import tqdm
from pygsp import graphs
from sklearn.metrics import classification_report

In [41]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [42]:
%run utils.ipynb

In [43]:
# class 1 is illicit, 2 is licit, unknown is unknown
labels = pd.read_csv("data/elliptic_bitcoin_dataset/elliptic_txs_classes.csv")
edges = pd.read_csv("data/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv")
nodes = pd.read_csv("data/elliptic_bitcoin_dataset/elliptic_txs_features.csv", header=None)
print('done')

done


In [44]:
indexes = labels.index.tolist()
labels_new = labels
labels_new = labels_new.replace('unknown', 3)
nodes_new =  nodes
edges_new = edges.loc[edges["txId1"].isin(labels_new["txId"])].loc[edges["txId2"].isin(labels_new["txId"])]
nodes_new.shape, labels_new.shape, edges_new.shape

((203769, 167), (203769, 2), (234355, 2))

In [45]:
data, graph_info = time_step_split(nodes_new, edges_new, labels_new, device)
for key in data:
    print(key, len(data[key]))

train 31
val 5
test 13


In [46]:
data, graph_info = create_graph(nodes_new, edges_new, labels_new, device)

In [47]:
# generate test and train masks
train_p = 0.8
def generateMasks(train_p=train_p):
    # indeces of the illicit and licit transaction nodes
    illicit_idx = (data.y == 0).nonzero(as_tuple=True)[0]
    # Generate random permutation of indices
    illicit_train, illicit_test = torch.utils.data.random_split(illicit_idx, lengths=[train_p, (1-train_p)])

    # the same for the licit nodes
    licit_idx   = (data.y == 1).nonzero(as_tuple=True)[0]
    licit_train, licit_test = torch.utils.data.random_split(licit_idx, lengths=[train_p, (1-train_p)])
    # obtain the training and test "masks", i.e. indices for the train and test sets
    train_mask = torch.cat((torch.tensor(illicit_train), torch.tensor(licit_train)))
    test_mask  = torch.cat((torch.tensor(illicit_test), torch.tensor(licit_test)))
    return train_mask, test_mask



In [51]:
for i in range(10):
    # apply the training and test masks
    train_mask, test_mask = generateMasks()
    train_emb = data.x[train_mask]
    train_y   = data.y[train_mask]

    test_emb  = data.x[test_mask]
    test_y    = data.y[test_mask]

    # Create Decision Tree classifer object
    clf = RandomForestClassifier()
    # Train Decision Tree Classifer
    clf = clf.fit(train_emb, train_y)
    #Predict the response for test dataset
    y_pred = clf.predict(test_emb)

    print(classification_report(y_pred, test_y))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94       807
           1       1.00      0.99      0.99      8504

    accuracy                           0.99      9311
   macro avg       0.94      0.99      0.97      9311
weighted avg       0.99      0.99      0.99      9311

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       809
           1       1.00      0.99      0.99      8502

    accuracy                           0.99      9311
   macro avg       0.94      0.99      0.96      9311
weighted avg       0.99      0.99      0.99      9311

              precision    recall  f1-score   support

           0       0.88      1.00      0.94       802
           1       1.00      0.99      0.99      8509

    accuracy                           0.99      9311
   macro avg       0.94      0.99      0.96      9311
weighted avg       0.99      0.99      0.99      9311

              preci

In [10]:
# Define your Node2Vec model here
def defineNode2VecModel(edge_index):
    n2v = Node2Vec(edge_index=edge_index, embedding_dim=3, walk_length=20, context_size=10,
                   walks_per_node=10, num_negative_samples=1,
                   p=1, q=1, sparse=False)
    return n2v
# print(type(torch.tensor(data.edge_index)))
n2v = defineNode2VecModel(data.edge_index)

In [11]:
# training and test loops
# define the training function for the Node2Vec model here
def train(model, loader, optimizer):
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)
#########################################################################

#########################################################################
#define the test function for the Node2Vec model here
@torch.no_grad()
def test(model):
    model.eval()
    z = model()
    acc = model.test(z[train_mask], data.y[train_mask],
                   z[test_mask], data.y[test_mask],
                   max_iter=150)
    return acc
#########################################################################

In [12]:
#########################################################################
# train your Node2Vec model
train_epochs = 30
# Define a loader here
loader = n2v.loader(batch_size=128, shuffle=True)
# define an optimizer here
optimizer = torch.optim.Adam(list(n2v.parameters()), lr=0.01)

for epoch in tqdm(range(train_epochs)):
    loss = train(n2v, loader, optimizer)
    acc = test(n2v)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}')
#########################################################################

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 000, Loss: 1.0179, Acc: 0.9025
Epoch: 001, Loss: 0.8527, Acc: 0.9025
Epoch: 002, Loss: 0.8294, Acc: 0.9025
Epoch: 003, Loss: 0.8249, Acc: 0.9025
Epoch: 004, Loss: 0.8236, Acc: 0.9025
Epoch: 005, Loss: 0.8230, Acc: 0.9025
Epoch: 006, Loss: 0.8226, Acc: 0.9025
Epoch: 007, Loss: 0.8223, Acc: 0.9025
Epoch: 008, Loss: 0.8218, Acc: 0.9025
Epoch: 009, Loss: 0.8216, Acc: 0.9025
Epoch: 010, Loss: 0.8215, Acc: 0.9025
Epoch: 011, Loss: 0.8213, Acc: 0.9025
Epoch: 012, Loss: 0.8210, Acc: 0.9025
Epoch: 013, Loss: 0.8210, Acc: 0.9025
Epoch: 014, Loss: 0.8209, Acc: 0.9025
Epoch: 015, Loss: 0.8208, Acc: 0.9025
Epoch: 016, Loss: 0.8208, Acc: 0.9025
Epoch: 017, Loss: 0.8206, Acc: 0.9025
Epoch: 018, Loss: 0.8205, Acc: 0.9025
Epoch: 019, Loss: 0.8204, Acc: 0.9025
Epoch: 020, Loss: 0.8203, Acc: 0.9025
Epoch: 021, Loss: 0.8202, Acc: 0.9025
Epoch: 022, Loss: 0.8200, Acc: 0.9025
Epoch: 023, Loss: 0.8200, Acc: 0.9025
Epoch: 024, Loss: 0.8198, Acc: 0.9025
Epoch: 025, Loss: 0.8198, Acc: 0.9025
Epoch: 026, 

In [23]:
#########################################################################
# produce embedding using the trained model
n2v.eval()
emb = n2v(torch.arange(data.num_nodes))
# visualize the features here
emb = emb.detach().numpy()
print(len(emb))
#########################################################################

203769


In [24]:
# apply the training and test masks
train_emb = emb[train_mask]
train_y   = data.y[train_mask]

test_emb  = emb[test_mask]
test_y    = data.y[test_mask]

print(train_emb)
print(test_y)

[[ 0.2813215   0.2716136   0.31932402]
 [ 0.5437697   0.18883954 -0.37878326]
 [-1.1976874  -1.0985088   0.81091505]
 ...
 [ 0.54584605  0.29015455 -0.5372091 ]
 [-0.10954577  0.32879618 -0.03851182]
 [-0.03797907 -0.42184383 -0.2959392 ]]
tensor([0, 0, 0,  ..., 1, 1, 1])


In [25]:
# Now train the DT classifier
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [26]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(train_emb, train_y)

#Predict the response for test dataset
y_pred = clf.predict(test_emb)

In [27]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(test_y, y_pred))

Accuracy: 0.8264418429814199


In [24]:
torch.save(n2v, 'BaselineModel.pt')