In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import datetime

In [None]:
# Pin DGL to the release this notebook was developed against (see the
# `dgl.__version__` cell below); `%pip` installs into the running kernel's environment.
%pip install dgl==0.6.1

Collecting dgl
  Downloading dgl-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.2 MB/s 
Installing collected packages: dgl
Successfully installed dgl-0.6.1


In [None]:
import dgl
from dgl.data import DGLDataset
import torch

Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


DGL backend not selected or invalid.  Assuming PyTorch for now.
Using backend: pytorch


In [None]:
# Location of the pre-processed CSV exports on the mounted Drive.
DATA_DIR = "/content/drive/MyDrive/data"
# Edge list: one row per transaction, grouped into graphs by the `idx` column.
GRAPH_PATHS = f"{DATA_DIR}/processed_graphs.csv"
# Graph-level labels: columns `idx` and `isPhi`.
LABELS_PATHS = f"{DATA_DIR}/processed_graphs_labels.csv"

In [None]:
graphs = pd.read_csv(GRAPH_PATHS)
graphs.head()

Unnamed: 0.2,idx,date,Unnamed: 0,from,to,amount,timestamp,fromIsPhi,toIsPhi,isPhi,Unnamed: 0.1
0,0,2016-11-30,87613,644996311924151884824924215840682271362850202643,1249665516472213179549841288377450362199154772633,0.984401,1480464000.0,0,0,0,
1,0,2016-11-30,87614,644996311924151884824924215840682271362850202643,119639879959584517570242490761449175088357594858,98.431564,1480465000.0,0,0,0,
2,0,2016-11-30,87615,644996311924151884824924215840682271362850202643,154744709618196328765779089700513530202844453134,11.650149,1480466000.0,0,0,0,
3,0,2016-11-30,87616,644996311924151884824924215840682271362850202643,132541851783283011242317528707846741855867577893,0.5,1480466000.0,0,0,0,
4,0,2016-11-30,87617,644996311924151884824924215840682271362850202643,218017970891763324453226269432084353002068817149,1.025745,1480469000.0,0,0,0,


In [None]:
labels_df = pd.read_csv(LABELS_PATHS)

In [None]:
labels_df.to_dict().keys()

dict_keys(['idx', 'isPhi'])

In [None]:
graphs.shape

(13108066, 11)

In [None]:
graphs["idx"].min(), graphs["idx"].max()

(0, 677)

In [None]:
graphs[graphs["idx"]==564]

Unnamed: 0.2,idx,date,Unnamed: 0,from,to,amount,timestamp,fromIsPhi,toIsPhi,isPhi,Unnamed: 0.1
13093106,564,2018-05-18,4418,417132657103527686872755167341514236553709721932,1089823903414201205164122060798283676514672446537,12.922234,1.526631e+09,1,0,1,1526543.0
13093107,564,2018-05-18,10119,831717449580711281399849925820013600398097604408,1371495397093811549783842019461889399503387454944,0.004000,1.526676e+09,1,0,1,2304559.0
13093108,564,2018-05-18,11766,1409663306667885651423262808042740245817942691496,602592554680780429221369705499595817778080677226,3.500000,1.526653e+09,1,0,1,2378410.0
13093109,564,2018-05-18,11767,1409663306667885651423262808042740245817942691496,118115836559587621209836734596617560084568453541,0.166548,1.526675e+09,1,0,1,2378411.0
13093110,564,2018-05-18,16759,267571195878157707859385628392535586552873081469,1365260248428791526954242325594194917400680954374,0.000000,1.526612e+09,1,0,1,2998820.0
...,...,...,...,...,...,...,...,...,...,...,...
13093324,564,2018-05-19,58702,924616232574883135933849564716800706399164234688,53794421113296276641178500177478558257267273898,0.500000,1.526764e+09,0,1,1,11593317.0
13093325,564,2018-05-19,58730,1155306758554226815179165349248141155506604406060,600927757527250112437739358851513177152226224959,0.965986,1.526706e+09,0,1,1,11640576.0
13093326,564,2018-05-19,59485,1358676735433481847001354959305686146876633934119,600927757527250112437739358851513177152226224959,0.566804,1.526700e+09,0,1,1,12301905.0
13093327,564,2018-05-19,59561,1367062735712770626846668508814824715522883882245,912733475627262696937153949743516851310396616647,0.500000,1.526715e+09,0,1,1,12367485.0


In [None]:
def get_after_date(graphs, dt, format="%Y-%m-%d"):
  """Return a copy of the rows of `graphs` dated strictly after `dt`.

  Args:
    graphs: DataFrame holding the dates either as an index level named
      ``date`` or as a regular ``date`` column.
    dt: cut-off date as a string.
    format: ``strptime`` format used to parse `dt`.

  Returns:
    A new DataFrame (copy) with only the rows whose date is later than `dt`.
  """
  cutoff = datetime.datetime.strptime(dt, format).date()
  if "date" in graphs.index.names:
    # Original behaviour: compare the index level values directly.
    dates = graphs.index.get_level_values('date')
  else:
    # The frame read from CSV keeps `date` as a plain (string) column, so
    # parse it into date objects before comparing against the cut-off.
    dates = pd.to_datetime(graphs["date"]).dt.date
  return graphs[dates > cutoff].copy()

In [None]:
# Extract a single window (all rows sharing one `idx`) as a flat dataframe.
def get_id_drop_idx(graphs, id):
  """Return the rows of `graphs` whose ``idx`` equals `id`, without the ``idx`` column.

  The previous row labels are kept as a leading ``index`` column because the
  index is reset without dropping it. The input frame is not modified.
  """
  window = graphs.loc[graphs['idx'] == id].copy()
  window = window.reset_index()
  return window.drop(columns="idx")

In [None]:
# convert this edgelist dataframe to graph and set node attributes
def convert_df_graph(df):
  """Build a directed multigraph from an edge-list dataframe.

  Args:
    df: DataFrame with at least the columns ``from``, ``to``, ``date``,
      ``fromIsPhi`` and ``toIsPhi``. Every column other than ``from``/``to``
      becomes an edge attribute. The frame is left unmodified.

  Returns:
    A ``networkx.MultiDiGraph`` with one edge per row and two attributes on
    every node: ``addr`` (the node key itself) and ``isPhi`` (bool).
  """
  # Derive a new frame instead of assigning into `df`, so the caller's
  # dataframe is not mutated as a side effect.
  edges = df.assign(date=df["date"].astype(str))
  G = nx.from_pandas_edgelist(edges, "from", "to", edge_attr=True, create_using=nx.MultiDiGraph())

  node_attrs = {node: {"addr": node} for node in G.nodes()}

  # Each edge carries the phishing flag of both endpoints; when a node appears
  # on several edges the flag seen last wins (same as iterating the edges in order).
  for src, dest, data in G.edges(data=True):
    node_attrs[src]["isPhi"] = bool(data["fromIsPhi"])
    node_attrs[dest]["isPhi"] = bool(data['toIsPhi'])

  nx.set_node_attributes(G, node_attrs)
  return G

In [None]:
id = 10
graph_10 = get_id_drop_idx(graphs,id)

In [None]:
G = convert_df_graph(graph_10.copy())

In [None]:
nx.get_node_attributes(G, 'isPhi')

{'1000713045940827972690001710540886700369053993047': False,
 '1000942403950909833863517645991900250160766152210': False,
 '1009154706589712343351112407244798585226051800340': False,
 '1011140311372880001940842502745670651899775745998': False,
 '101165142238800537507615282012156079311532853537': False,
 '1012838553379146707749668524196639160452340866043': False,
 '1013194169978669867566164354562631207348600282042': False,
 '1014721464434919204620673947316488048405051184628': False,
 '1016506913799606664037288661527254174353594632706': False,
 '1017411154125381859838983182247550759290648779318': False,
 '1018812289164589108701056071119852059302300671311': False,
 '1021697379985255570917970927342199804082204085525': False,
 '1022593106643289925588707084106261178675110122891': False,
 '1024374239713693074145213316179862254974921274218': False,
 '1024455675954027554701317792781399242412980355935': False,
 '1028460396741816097272679544414273535271704563312': False,
 '10344363011447860185908

In [None]:
len(G.nodes()), len(G.edges)

(637, 8184)

In [None]:
G_dg = dgl.from_networkx(G, node_attrs=["isPhi"], edge_attrs=['timestamp','amount'])

In [None]:
G_dg.ndata["isPhi"]

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, 

In [None]:
import random
import math
from torch.utils.data.sampler import SubsetRandomSampler

In [None]:
class GraphDataset(DGLDataset):
    """Graph-classification dataset built from the transaction CSV exports.

    Every distinct ``idx`` value in the graphs CSV becomes one DGL graph with
    the node feature ``isPhi`` and the edge features ``timestamp`` and
    ``amount``; its label is the ``isPhi`` value recorded for that ``idx`` in
    the labels CSV.

    Args:
        graphs_path: CSV with the edge list; defaults to ``GRAPH_PATHS``.
        labels_path: CSV with the ``idx``/``isPhi`` labels; defaults to
            ``LABELS_PATHS``.
    """

    def __init__(self, graphs_path=None, labels_path=None):
        # The base-class constructor runs process(), so the paths have to be
        # stored before calling it.
        self._graphs_csv = GRAPH_PATHS if graphs_path is None else graphs_path
        self._labels_csv = LABELS_PATHS if labels_path is None else labels_path
        super().__init__(name='ethereum_tx')

    def process(self):
        """Read both CSV files and populate ``self.graphs`` / ``self.labels``."""
        graphs_df = pd.read_csv(self._graphs_csv)
        labels_df = pd.read_csv(self._labels_csv)

        # Look labels up by the graph id itself rather than by row position.
        label_by_idx = labels_df.set_index("idx")["isPhi"]

        # Use the ids that are actually present: this includes the largest id
        # (a half-open range(min, max) skipped it) and never yields an empty graph.
        graph_ids = graphs_df["idx"].unique().tolist()
        # Unseeded shuffle: the storage order differs from run to run.
        random.shuffle(graph_ids)

        self.graphs = []
        labels = []
        for graph_id in graph_ids:
            window = get_id_drop_idx(graphs_df, graph_id)
            G = convert_df_graph(window)
            g = dgl.from_networkx(G, node_attrs=["isPhi"], edge_attrs=['timestamp', 'amount'])
            g = dgl.add_self_loop(g)
            self.graphs.append(g)
            labels.append(int(label_by_idx.loc[graph_id]))

        self.labels = torch.LongTensor(labels)

    def __getitem__(self, i):
        """Return the ``(graph, label)`` pair stored at position `i`."""
        return self.graphs[i], self.labels[i]

    def __len__(self):
        """Number of graphs in the dataset."""
        return len(self.graphs)

In [None]:
dataset = GraphDataset()

In [None]:
import dgl.nn.pytorch as dglnn
import torch.nn as nn

class Classifier(nn.Module):
    """Two GraphConv layers, a mean-node readout and a linear classification head.

    Args:
        in_dim: size of the input node features.
        hidden_dim: width of both graph-convolution layers.
        n_classes: number of output logits per graph.
    """

    def __init__(self, in_dim, hidden_dim, n_classes):
        super().__init__()
        self.conv1 = dglnn.GraphConv(in_dim, hidden_dim)
        self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g, h):
        """Return one row of class logits per graph in the (batched) graph `g`.

        `h` holds the node features, one row per node.
        """
        # Node features may arrive as bool/int flags; convolve in floating point
        # explicitly instead of relying on implicit type promotion.
        if not torch.is_floating_point(h):
            h = h.float()
        # Apply graph convolution and activation. `nn.functional` is used so the
        # class does not depend on an `F` alias imported in a later cell.
        h = nn.functional.relu(self.conv1(g, h))
        h = nn.functional.relu(self.conv2(g, h))
        with g.local_scope():
            g.ndata['h'] = h
            # Calculate graph representation by average readout.
            hg = dgl.mean_nodes(g, 'h')
            return self.classify(hg)

In [None]:
# Hold-out split settings.
batch_size = 32
split_ratio = 0.8

# Shuffle the dataset positions with a fixed NumPy seed.
num_entries = len(dataset)
indices = list(range(num_entries))
np.random.seed(0)
np.random.shuffle(indices)

# The first `split` shuffled positions are used for training, the rest for validation.
split = math.floor(split_ratio * num_entries)
train_idx = indices[:split]
valid_idx = indices[split:]

In [None]:
from dgl.dataloading import GraphDataLoader
# One sampler per split, each restricted to that split's dataset positions.
train_sampler, valid_sampler = SubsetRandomSampler(train_idx), SubsetRandomSampler(valid_idx)

# Both loaders wrap the same dataset; only the sampler differs.
# Note that `test_loader` serves the validation split.
train_loader = GraphDataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
test_loader = GraphDataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)

In [None]:
# Walk through the training loader once and print every batch:
# data[0] is the batched graph, data[1] the tensor of graph labels.
for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:', '=======', sep='\n')
    print(f'Number of graphs in the current batch: {len(data[1])}')
    print(data, end='\n\n')

Step 1:
Number of graphs in the current batch: 32
[Graph(num_nodes=239695, num_edges=904428,
      ndata_schemes={'isPhi': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'timestamp': Scheme(shape=(), dtype=torch.float32), 'amount': Scheme(shape=(), dtype=torch.float32)}), tensor([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 0, 1, 0, 0, 0, 0])]

Step 2:
Number of graphs in the current batch: 32
[Graph(num_nodes=188693, num_edges=689676,
      ndata_schemes={'isPhi': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'timestamp': Scheme(shape=(), dtype=torch.float32), 'amount': Scheme(shape=(), dtype=torch.float32)}), tensor([1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
        0, 0, 1, 0, 1, 0, 0, 1])]

Step 3:
Number of graphs in the current batch: 32
[Graph(num_nodes=216436, num_edges=779731,
      ndata_schemes={'isPhi': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'timestamp': Scheme(shap

In [None]:
def _evaluate_loader(net, loader):
    """Run `net` over `loader`; return (correct predictions, graphs seen, summed loss)."""
    correct, total, loss_sum = 0, 0, 0.0
    for batched_graph, labels in loader:
        # Add a trailing feature dimension: (num_nodes,) -> (num_nodes, 1).
        feats = torch.unsqueeze(batched_graph.ndata['isPhi'], dim=-1)
        outputs = net(batched_graph, feats)
        # Sum (not mean) per batch so a partial last batch is weighted correctly.
        loss_sum += torch.nn.functional.cross_entropy(outputs, labels, reduction='sum').item()
        correct += (outputs.argmax(dim=1) == labels).sum().item()
        total += labels.size(0)
    return correct, total, loss_sum


def accuracy():
    """Evaluate the global `model` on the global `test_loader` and `train_loader`.

    Returns:
        ``(train_accuracy, train_loss, test_accuracy, test_loss)`` where the
        accuracies are percentages and each loss is the mean cross-entropy per
        graph, so the train and test values are directly comparable. A loader
        that yields nothing produces ``nan`` for its two values.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            test_correct, test_total, test_loss_sum = _evaluate_loader(model, test_loader)
            train_correct, train_total, train_loss_sum = _evaluate_loader(model, train_loader)
    finally:
        # Restore the previous mode so a surrounding training loop keeps training.
        model.train(was_training)

    def _summarise(correct, total, loss_sum):
        if total == 0:
            return float('nan'), float('nan')
        return 100 * correct / total, loss_sum / total

    test_accuracy, test_loss = _summarise(test_correct, test_total, test_loss_sum)
    train_accuracy, train_loss = _summarise(train_correct, train_total, train_loss_sum)
    return (train_accuracy, train_loss, test_accuracy, test_loss)

In [None]:
from tqdm import tqdm

In [None]:
import torch.nn.functional as F

# NOTE(review): the only node feature fed to the model is the per-node `isPhi`
# flag. If the graph-level `isPhi` label is derived from those flags, the
# near-perfect accuracy is label leakage rather than learning — confirm.
model = Classifier(1, 64 , 2)
opt = torch.optim.Adam(model.parameters())
epochs = 30
for epoch in range(epochs):
  # accuracy() switches the model to eval mode, so training mode is
  # re-enabled explicitly at the start of every epoch.
  model.train()
  # Wrap the loader directly so tqdm can use its length when available.
  for batched_graph, labels in tqdm(train_loader):
    feats = batched_graph.ndata['isPhi']
    feats = torch.unsqueeze(feats, dim=-1) # add an empty dimension to make the tensor (no_of_nodes, no_of_features)
    logits = model(batched_graph, feats)
    loss = F.cross_entropy(logits, labels)
    opt.zero_grad()
    loss.backward()
    opt.step()
  train_accuracy, train_loss, test_accuracy, test_loss = accuracy()
  # Report epochs as 1..epochs instead of 0..epochs-1.
  print(f"loss:{train_loss} - acc:{train_accuracy} - val_loss:{test_loss} - val_acc:{test_accuracy} Epoch {epoch + 1}/{epochs}")

17it [00:10,  1.61it/s]


loss:11.334281206130981 - acc:62.84658040665435 - val_loss:3.3168553709983826 - val_acc:66.91176470588235 Epoch 0/30


17it [00:10,  1.65it/s]


loss:10.970967710018158 - acc:73.19778188539742 - val_loss:3.1932153701782227 - val_acc:75.73529411764706 Epoch 1/30


17it [00:10,  1.64it/s]


loss:10.3985196352005 - acc:83.73382624768946 - val_loss:3.0399585962295532 - val_acc:86.02941176470588 Epoch 2/30


17it [00:10,  1.63it/s]


loss:9.599045157432556 - acc:91.86691312384472 - val_loss:2.8447569012641907 - val_acc:89.70588235294117 Epoch 3/30


17it [00:10,  1.62it/s]


loss:8.589718908071518 - acc:94.63955637707949 - val_loss:2.5162572264671326 - val_acc:91.91176470588235 Epoch 4/30


17it [00:10,  1.63it/s]


loss:7.4718466103076935 - acc:95.93345656192237 - val_loss:2.318570077419281 - val_acc:94.11764705882354 Epoch 5/30


17it [00:10,  1.62it/s]


loss:6.343351304531097 - acc:96.30314232902033 - val_loss:1.9093029499053955 - val_acc:94.11764705882354 Epoch 6/30


17it [00:10,  1.62it/s]


loss:5.278871834278107 - acc:96.6728280961183 - val_loss:1.6215864419937134 - val_acc:94.11764705882354 Epoch 7/30


17it [00:10,  1.62it/s]


loss:4.331027567386627 - acc:96.85767097966728 - val_loss:1.293643444776535 - val_acc:94.11764705882354 Epoch 8/30


17it [00:10,  1.61it/s]


loss:3.5459582060575485 - acc:97.04251386321627 - val_loss:1.1094834506511688 - val_acc:95.58823529411765 Epoch 9/30


17it [00:10,  1.61it/s]


loss:2.9416330456733704 - acc:97.22735674676525 - val_loss:0.8788270056247711 - val_acc:95.58823529411765 Epoch 10/30


17it [00:10,  1.62it/s]


loss:2.45973452180624 - acc:97.7818853974122 - val_loss:0.7480950504541397 - val_acc:97.05882352941177 Epoch 11/30


17it [00:10,  1.62it/s]


loss:2.069731943309307 - acc:97.59704251386322 - val_loss:0.7094275429844856 - val_acc:97.05882352941177 Epoch 12/30


17it [00:10,  1.62it/s]


loss:1.7786326259374619 - acc:98.52125693160814 - val_loss:0.6294552683830261 - val_acc:97.05882352941177 Epoch 13/30


17it [00:10,  1.61it/s]


loss:1.5562224052846432 - acc:98.52125693160814 - val_loss:0.5385141298174858 - val_acc:97.05882352941177 Epoch 14/30


17it [00:10,  1.62it/s]


loss:1.4047459326684475 - acc:98.70609981515712 - val_loss:0.449204184114933 - val_acc:98.52941176470588 Epoch 15/30


17it [00:10,  1.63it/s]


loss:1.2383583206683397 - acc:98.52125693160814 - val_loss:0.4308410957455635 - val_acc:97.79411764705883 Epoch 16/30


17it [00:10,  1.64it/s]


loss:1.117695052176714 - acc:98.70609981515712 - val_loss:0.4213591329753399 - val_acc:98.52941176470588 Epoch 17/30


17it [00:10,  1.62it/s]


loss:1.0264892391860485 - acc:98.70609981515712 - val_loss:0.31786374375224113 - val_acc:98.52941176470588 Epoch 18/30


17it [00:12,  1.41it/s]


loss:0.9497781582176685 - acc:98.8909426987061 - val_loss:0.3499366082251072 - val_acc:98.52941176470588 Epoch 19/30


17it [00:10,  1.64it/s]


loss:0.8725339658558369 - acc:98.8909426987061 - val_loss:0.2610916420817375 - val_acc:98.52941176470588 Epoch 20/30


17it [00:10,  1.64it/s]


loss:0.8249136470258236 - acc:98.8909426987061 - val_loss:0.23675587959587574 - val_acc:98.52941176470588 Epoch 21/30


17it [00:10,  1.64it/s]


loss:0.7645007576793432 - acc:98.8909426987061 - val_loss:0.24318293668329716 - val_acc:98.52941176470588 Epoch 22/30


17it [00:10,  1.63it/s]


loss:0.7230782974511385 - acc:99.07578558225508 - val_loss:0.2182357832789421 - val_acc:99.26470588235294 Epoch 23/30


17it [00:10,  1.63it/s]


loss:0.6844300981611013 - acc:99.07578558225508 - val_loss:0.2025303728878498 - val_acc:99.26470588235294 Epoch 24/30


17it [00:10,  1.63it/s]


loss:0.6522929044440389 - acc:99.07578558225508 - val_loss:0.1819078903645277 - val_acc:99.26470588235294 Epoch 25/30


17it [00:10,  1.62it/s]


loss:0.6198496473953128 - acc:99.26062846580406 - val_loss:0.17335398122668266 - val_acc:99.26470588235294 Epoch 26/30


17it [00:10,  1.63it/s]


loss:0.593674186617136 - acc:99.26062846580406 - val_loss:0.1791835855692625 - val_acc:99.26470588235294 Epoch 27/30


17it [00:10,  1.63it/s]


loss:0.5718496413901448 - acc:99.26062846580406 - val_loss:0.15356101095676422 - val_acc:99.26470588235294 Epoch 28/30


17it [00:10,  1.64it/s]


loss:0.5447479914873838 - acc:99.26062846580406 - val_loss:0.14561724942177534 - val_acc:99.26470588235294 Epoch 29/30


In [None]:
accuracy()

(99.26062846580406, 0.5447808410972357, 99.26470588235294, 0.16586731560528278)

In [None]:
print(dgl.__version__)

0.6.1


### Trial GNN

In [None]:
import dgl.data
dataset_tag = dgl.data.GINDataset('MUTAG', False)

Downloading /root/.dgl/GINDataset.zip from https://raw.githubusercontent.com/weihua916/powerful-gnns/master/dataset.zip...
Extracting file to /root/.dgl/GINDataset


In [None]:
import dgl.nn.pytorch as dglnn
import torch.nn as nn

class Trial_Classifier(nn.Module):
    """Two GraphConv layers, a mean-node readout and a linear classification head.

    Same architecture as `Classifier`, kept separate for the trial run below.

    Args:
        in_dim: size of the input node features.
        hidden_dim: width of both graph-convolution layers.
        n_classes: number of output logits per graph.
    """

    def __init__(self, in_dim, hidden_dim, n_classes):
        super().__init__()
        self.conv1 = dglnn.GraphConv(in_dim, hidden_dim)
        self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g, h):
        """Return one row of class logits per graph in the (batched) graph `g`.

        `h` holds the node features, one row per node.
        """
        # Convolve in floating point even if the features are bool/int flags.
        if not torch.is_floating_point(h):
            h = h.float()
        # Apply graph convolution and activation. `nn.functional` is used so the
        # class does not depend on an `F` alias imported in a later cell.
        h = nn.functional.relu(self.conv1(g, h))
        h = nn.functional.relu(self.conv2(g, h))
        with g.local_scope():
            g.ndata['h'] = h
            # Calculate graph representation by average readout.
            hg = dgl.mean_nodes(g, 'h')
            return self.classify(hg)

In [None]:
from dgl.dataloading import GraphDataLoader
# Batches of up to 1024 graphs, reshuffled on every pass; the final partial batch is kept.
trial_loader_options = dict(batch_size=1024, drop_last=False, shuffle=True)
trial_dataloader = GraphDataLoader(dataset_tag, **trial_loader_options)

In [None]:
import torch.nn.functional as F

# Only an example, 7 is the input feature size
# NOTE(review): n_classes=5 is carried over from the example; confirm it
# against the number of label values in the dataset.
model = Trial_Classifier(7, 32, 5)
opt = torch.optim.Adam(model.parameters())

# Inspect a single batch up front. Previously this print was followed by
# `break` statements inside the loops, which made the whole optimisation
# step unreachable and left the model untrained.
batched_graph, labels = next(iter(trial_dataloader))
print(batched_graph.ndata['attr'].shape, batched_graph)

for epoch in range(20):
    for batched_graph, labels in trial_dataloader:
        feats = batched_graph.ndata['attr']
        logits = model(batched_graph, feats)
        loss = F.cross_entropy(logits, labels)
        opt.zero_grad()
        loss.backward()
        opt.step()

torch.Size([3371, 7]) Graph(num_nodes=3371, num_edges=7442,
      ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'attr': Scheme(shape=(7,), dtype=torch.float32)}
      edata_schemes={})
