In [1]:
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Dataset, Data, InMemoryDataset
import torch_geometric.nn as nn
from torch_geometric.loader import DataLoader

In [2]:
import os
# Semantic label names that can be attached to a node. They occupy the trailing
# columns of the node feature vector (see `node_attrs` below).
labels = ['addr_contract', 'caller', 'msgvalue', 'balance', 'call_data', 'blk', 'mdata', 'sdata', 'create', 'call', 'callcode', 'delegatecall', 'create2', 'staticcall', 'cal_res', 'comp_res', 'bit_res', 'size', 'code', 'gas', 'return', 'coinbase', 'gasremain', 'revert', 'selfdestruct', 'memory', 'storage', 'flowcontrol']
# Node type names; they occupy the leading columns of the node feature vector.
# These appear to be EVM opcode mnemonics plus the generic 'PUSH'/'DUP'/'SWAP' entries.
# NOTE(review): 'LOGO' (letter O) looks like a typo for the opcode 'LOG0' -- confirm
# against the raw .ver files before changing it, because lookups are by exact string.
node_types = ['ADDRESS', 'ORIGIN', 'CALLER', 'CALLVALUE', 'BALANCE', 'SELFBALANCE', 'CALLDATALOAD', 'CALLDATACOPY', 'BLOCKHASH', 'TIMESTAMP', 'NUMBER', 'DIFFICULTY', 'BASEFEE', 'MLOAD', 'SLOAD', 'CREATE', 'CALL', 'CALLCODE', 'DELEGATECALL', 'CREATE2', 'STATICCALL', 'ADD', 'MUL', 'SUB', 'EXP', 'LT', 'GT', 'SLT', 'SGT', 'EQ', 'ISZERO', 'AND', 'OR', 'XOR', 'NOT', 'SHL', 'CALLDATASIZE', 'CODESIZE', 'EXTCODESIZE', 'RETURNDATASIZE', 'MSIZE', 'CODECOPY', 'EXTCODECOPY', 'EXTCODEHASH', 'GASPRICE', 'GASLIMIT', 'RETURNDATACOPY', 'RETURN', 'COINBASE', 'GAS', 'REVERT', 'SELFDESTRUCT', 'MSTORE', 'MSTORE8', 'SSTORE', 'JUMP', 'JUMPI', 'JUMPDEST', 'STOP', 'DIV', 'SDIV', 'MOD', 'SMOD', 'ADDMOD', 'SIGNEXTEND', 'BYTE', 'SHR', 'SAR', 'SHA3', 'CHAINID', 'POP', 'PC', 'PUSH1', 'PUSH2', 'PUSH3', 'PUSH4', 'PUSH5', 'PUSH6', 'PUSH7', 'PUSH8', 'PUSH9', 'PUSH10', 'PUSH11', 'PUSH12', 'PUSH13', 'PUSH14', 'PUSH15', 'PUSH16', 'PUSH17', 'PUSH18', 'PUSH19', 'PUSH20', 'PUSH21', 'PUSH22', 'PUSH23', 'PUSH24', 'PUSH25', 'PUSH26', 'PUSH27', 'PUSH28', 'PUSH29', 'PUSH30', 'PUSH31', 'PUSH32', 'DUP1', 'DUP2', 'DUP3', 'DUP4', 'DUP5', 'DUP6', 'DUP7', 'DUP8', 'DUP9', 'DUP10', 'DUP11', 'DUP12', 'DUP13', 'DUP14', 'DUP15', 'DUP16', 'SWAP1', 'SWAP2', 'SWAP3', 'SWAP4', 'SWAP5', 'SWAP6', 'SWAP7', 'SWAP8', 'SWAP9', 'SWAP10', 'SWAP11', 'SWAP12', 'SWAP13', 'SWAP14', 'SWAP15', 'SWAP16', 'LOGO', 'LOG1', 'LOG2', 'LOG3', 'LOG4', 'PUSH', 'DUP', 'SWAP']
# Full vocabulary of the multi-hot node encoding: the position of a name in this
# list is the feature column that MyOwnDataset sets to 1 for that name.
node_attrs = node_types + labels
class MyOwnDataset(Dataset):
    """Graph-classification dataset built from per-graph text files under ``root``.

    Graph ``i`` is described by three raw files in ``self.raw_dir``:

    * ``i.ver``  -- one node per line; the second comma-separated field is the
      node type and the bracketed list holds label names (the first field is
      not used by the parser).
    * ``i.edg``  -- one edge per line: ``<src>, <dst>, <kind>, <int>``.
    * ``i.type`` -- the integer class label of the graph.

    Each graph is converted once into a ``Data`` object and stored as
    ``i.grap`` in ``self.processed_dir``; ``get`` loads it back from disk.

    Args:
        root: Dataset root directory, forwarded to the base class.
        transform: Forwarded to the base class.
        pre_transform: Forwarded to the base class.
        ngraph: Number of graphs (files ``0`` .. ``ngraph - 1``). Defaults to 65.
    """

    def __init__(self, root, transform=None, pre_transform=None, ngraph=65):
        # Set before calling the base constructor: the file-name properties
        # below depend on it and the base class may query them during init.
        self.Ngraph = ngraph
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """Names of all raw files: every ``.ver``, then ``.edg``, then ``.type``."""
        ngraph = self.Ngraph
        vers = [f'{idx}.ver' for idx in range(ngraph)]
        edgs = [f'{idx}.edg' for idx in range(ngraph)]
        bugs = [f'{idx}.type' for idx in range(ngraph)]
        return vers + edgs + bugs

    @property
    def processed_file_names(self):
        """Names of the processed files, one ``.grap`` per graph."""
        return [f'{idx}.grap' for idx in range(self.Ngraph)]

    def download(self):
        """No-op: raw files are expected to be placed in ``self.raw_dir`` by hand."""
        print("in download")

    def process(self):
        """Convert every raw graph that does not have a processed file yet."""
        # Only the regular files directly inside processed_dir count as done.
        _, _, files = next(os.walk(self.processed_dir), (None, [], []))
        self.exist_processed_file_names = files
        already_done = set(files)

        for fname in self.processed_file_names:
            if fname in already_done:
                continue
            print(f"process new file {fname}")
            data = self._process_per_graph(fname)
            torch.save(data, os.path.join(self.processed_dir, fname))

    def _process_per_graph(self, f):
        """Build the ``Data`` object for the processed file name ``f`` (e.g. ``'3.grap'``)."""
        stem = os.path.splitext(f)[0]
        ver_path = os.path.join(self.raw_dir, stem + '.ver')
        edg_path = os.path.join(self.raw_dir, stem + '.edg')
        bug_path = os.path.join(self.raw_dir, stem + '.type')

        edge_index, edge_attr = self._read_edges(edg_path)
        x = self._read_nodes(ver_path)
        y = self._read_label(bug_path)
        return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

    @staticmethod
    def _read_edges(path):
        """Parse an ``.edg`` file into ``(edge_index [2, E], edge_attr [E, 2])``."""
        links, attrs = [], []
        with open(path, "r") as fh:
            for raw in fh:
                line = raw.strip().replace(' ', '')
                if not line:
                    continue  # tolerate blank / trailing lines
                fields = line.split(',')
                links.append([int(fields[0]), int(fields[1])])
                # First attribute flags the edge kind: 0 for 'exec', 1 for anything else.
                attrs.append([0 if fields[2] == 'exec' else 1, int(fields[3])])
        # reshape(-1, 2) keeps the expected rank even when the file has no edges.
        edge_index = torch.tensor(links, dtype=torch.long).reshape(-1, 2).t().contiguous()
        edge_attr = torch.tensor(attrs, dtype=torch.float).reshape(-1, 2)
        return edge_index, edge_attr

    @staticmethod
    def _read_nodes(path):
        """Parse a ``.ver`` file into a multi-hot feature matrix ``[N, len(node_attrs)]``.

        Raises:
            ValueError: if a line has no ``[`` or names an attribute that is
                not in ``node_attrs``.
        """
        rows = []
        with open(path, "r") as fh:
            for raw in fh:
                line = raw.strip().replace(' ', '').replace('\'', '')
                if not line:
                    continue  # tolerate blank / trailing lines
                bracket = line.index('[')
                node_type = line[:bracket].split(',')[1]
                # The line is expected to end with ']', hence the [:-1] slice.
                names = [name for name in line[bracket + 1:-1].split(',') if name]
                names.append(node_type)

                row = [0] * len(node_attrs)
                for name in names:
                    try:
                        row[node_attrs.index(name)] = 1
                    except ValueError:
                        raise ValueError(
                            f"unknown node attribute {name!r} in {path}") from None
                rows.append(row)
        return torch.tensor(rows, dtype=torch.float).reshape(-1, len(node_attrs))

    @staticmethod
    def _read_label(path):
        """Read the graph's class label: the last non-blank line of a ``.type`` file.

        Raises:
            ValueError: if the file contains no label or the label is not an integer.
        """
        with open(path, "r") as fh:
            values = [ln.strip() for ln in fh if ln.strip()]
        if not values:
            raise ValueError(f"no class label found in {path}")
        return torch.tensor(int(values[-1]), dtype=torch.long)

    def len(self):
        """Number of graphs in the dataset."""
        return len(self.processed_file_names)

    def getitem(self, idx):
        """Load the processed graph number ``idx`` from disk."""
        # NOTE(review): recent PyTorch releases default torch.load to
        # weights_only=True, which may reject pickled Data objects -- confirm
        # against the installed torch / torch_geometric versions.
        return torch.load(os.path.join(self.processed_dir, self.processed_file_names[idx]))

    def get(self, idx):
        """Base-class hook; delegates to :meth:`getitem`."""
        return self.getitem(idx)

In [43]:
class MulLayerGNN(torch.nn.Module):
    """Stack of GINEConv layers that turns node features into node embeddings.

    Every layer is ``GINEConv(MLP) -> BatchNorm1d -> [ReLU] -> dropout``; the
    ReLU is skipped on the last layer. With ``residual=True`` the input of
    each layer except the first is added back to its output.

    Args:
        edge_dim: Width of the edge feature vectors.
        in_ch: Width of the input node features.
        hidden_ch: Width of the node embeddings produced by every layer.
        n_layers: Number of message-passing layers applied in ``forward``.
        drop_rate: Dropout probability applied after every layer.
        JK: How layer outputs are combined: ``"last"`` returns the final
            layer's output, ``"sum"`` returns the sum over layers.
        residual: Whether to add residual connections (from layer 2 on).

    Raises:
        ValueError: if ``JK`` is neither ``"last"`` nor ``"sum"``.
    """

    def __init__(self, edge_dim=2, in_ch=172, hidden_ch=200, n_layers=2, drop_rate=0.5, JK="last", residual=False):
        super(MulLayerGNN, self).__init__()
        # Fail at construction time instead of with an unbound local in forward().
        if JK not in ("last", "sum"):
            raise ValueError(f"argument 'JK': expected 'last' or 'sum', got {JK!r}.")

        self.n_layers = n_layers
        self.drop_rate = drop_rate
        self.JK = JK
        self.residual = residual
        self.convs = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        # The first layer maps in_ch -> hidden_ch, the others hidden_ch -> hidden_ch.
        # Modules are created conv, norm, conv, norm, ... so that seeded
        # parameter initialisation matches the previous construction order.
        for width_in in [in_ch] + [hidden_ch] * (n_layers - 1):
            self.convs.append(nn.GINEConv(
                torch.nn.Sequential(
                    nn.Linear(width_in, hidden_ch),
                    torch.nn.ReLU(),
                    nn.Linear(hidden_ch, hidden_ch)),
                edge_dim=edge_dim))
            self.batch_norms.append(torch.nn.BatchNorm1d(hidden_ch))

    def forward(self, data):
        """Compute node embeddings for ``data`` (reads ``x``, ``edge_index``, ``edge_attr``).

        Returns:
            The node representation selected by ``self.JK``.
        """
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        h_list = [x]
        last = self.n_layers - 1
        for layer in range(self.n_layers):
            h = self.convs[layer](x=h_list[layer], edge_index=edge_index, edge_attr=edge_attr)
            h = self.batch_norms[layer](h)
            if layer != last:
                h = F.relu(h)  # no ReLU on the last layer
            h = F.dropout(h, self.drop_rate, training=self.training)
            if self.residual and layer != 0:
                # Out-of-place on purpose: when dropout is a no-op, `h` can be
                # the ReLU output itself, which autograd needs unmodified.
                h = h + h_list[layer]
            h_list.append(h)

        if self.JK == "last":
            return h_list[-1]
        if self.JK == "sum":
            # The raw input can only join the sum when it is as wide as the
            # embeddings; otherwise only the layer outputs are summed.
            start = 0 if x.size(-1) == h_list[-1].size(-1) else 1
            node_representation = 0
            for h in h_list[start:]:
                node_representation = node_representation + h
            return node_representation
        raise ValueError(f"argument 'JK': expected 'last' or 'sum', got {self.JK!r}.")

In [4]:
class VulNet(torch.nn.Module):
    """Graph-level model: MulLayerGNN node encoder -> graph pooling -> linear head."""

    def __init__(self, grah_dim=10, n_layers=3, edge_dim=2, in_ch=172, hidden_ch=200, residual=False, drop_rate=0, JK="last", graph_pooling="sum"):
        """
        Args:
          grah_dim       width of the graph-level output vector; for classification
                         this is the number of classes
          n_layers       number of node-embedding layers in the GNN body (at least 2)
          edge_dim       width of the edge feature vectors
          in_ch          width of the initial node feature vectors
          hidden_ch      width of the node representation vectors
          residual       whether the GNN body adds residual connections. Defaults to False.
          drop_rate      dropout rate used by the GNN body. Defaults to 0.
          JK             layer-combination mode forwarded to the GNN body
          graph_pooling  one of "sum", "mean", "max", "attention", "set2set"

        Raises:
          ValueError     if n_layers < 2 or graph_pooling is not recognised
        """
        # Resets the global torch RNG so parameter initialisation is repeatable.
        torch.manual_seed(12345)
        super(VulNet, self).__init__()

        self.grah_dim = grah_dim
        self.n_layers = n_layers
        self.hidden_ch = hidden_ch
        self.residual = residual
        self.drop_rate = drop_rate
        self.JK = JK

        if self.n_layers < 2:
            raise ValueError("argument 'n_layers':  Number of GNN layers must be greater than 1.")

        self.gnn_body = MulLayerGNN(edge_dim, in_ch, hidden_ch, n_layers, drop_rate, JK, residual)

        # Parameter-free poolings are plain functions and can be looked up directly.
        functional_pools = {
            "sum": nn.global_add_pool,
            "mean": nn.global_mean_pool,
            "max": nn.global_max_pool,
        }
        head_in = self.hidden_ch
        if graph_pooling in functional_pools:
            self.pool = functional_pools[graph_pooling]
        elif graph_pooling == "attention":
            gate = torch.nn.Sequential(
                nn.Linear(hidden_ch, hidden_ch),
                torch.nn.BatchNorm1d(hidden_ch),
                torch.nn.ReLU(),
                nn.Linear(hidden_ch, 1))
            self.pool = nn.GlobalAttention(gate_nn=gate)
        elif graph_pooling == "set2set":
            self.pool = nn.Set2Set(hidden_ch, processing_steps=2)
            # The head after set2set pooling takes a vector twice as wide.
            head_in = 2 * self.hidden_ch
        else:
            raise ValueError("Invalid graph pooling type.")

        self.graph_linear = nn.Linear(head_in, self.grah_dim)

    def forward(self, data):
        """Return one ``grah_dim``-wide output row per graph in the batch ``data``."""
        node_embeddings = self.gnn_body(data)
        graph_embeddings = self.pool(node_embeddings, data.batch)
        return self.graph_linear(graph_embeddings)


In [72]:
# Seed the global torch RNG before shuffling: the shuffle order decides which
# graphs land in the train/test split below, so without a fixed seed the split
# (and every reported accuracy) changes on each kernel restart.
torch.manual_seed(12345)
dst = MyOwnDataset("./data/smartbugs").shuffle()

In [73]:
# 70/30 split of the shuffled dataset: the leading part is used for training,
# the remainder for evaluation.
train_n = int(len(dst) * 0.7)
train_dst, test_dst = dst[:train_n], dst[train_n:]
print(f'train samples:{len(train_dst)} | test samples:{len(test_dst)}')

train samples:45 | test samples:20


In [57]:
# Read the feature widths from one sample instead of hard-coding them
# (the sample is loaded from disk once and reused).
first_graph = train_dst[0]
in_ch = first_graph.x.size(1)
out_ch = 3  # number of output classes passed to the model
edge_dim = first_graph.edge_attr.size(1)

train_loader = DataLoader(train_dst, batch_size=32, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order makes the evaluation
# cells repeatable and stops evaluation from consuming global RNG state between
# training epochs.
test_loader = DataLoader(test_dst, batch_size=32, shuffle=False)

In [58]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [59]:
# Two-layer residual model with sum pooling, moved to the selected device.
net = VulNet(
    grah_dim=out_ch,
    n_layers=2,
    edge_dim=edge_dim,
    in_ch=in_ch,
    hidden_ch=200,
    residual=True,
    drop_rate=0.1,
    JK="last",
    graph_pooling="sum",
)
net = net.to(device)

# Multi-class classification objective and optimizer.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)

# Number of passes over the training set.
Epoch = 300

In [None]:
import time
import numpy as np  # not used by this loop; the evaluation cells below rely on `np`

begin = time.time()
for epoch in range(Epoch):
    epoch_start_time = time.time()
    train_acc = 0.0
    train_loss = 0.0
    test_acc = 0.0
    test_loss = 0.0

    # Number of samples actually seen in this epoch.
    train_n = 0
    test_n = 0

    net.train()  # training mode (enables dropout etc.)
    for data in train_loader:
        data = data.to(device)  # moves features and labels together
        optimizer.zero_grad()  # reset the gradients of all model parameters
        train_pred = net(data)  # forward pass: class scores per graph
        batch_loss = criterion(train_pred, data.y)
        batch_loss.backward()  # back-propagate to get the gradients
        optimizer.step()  # update the parameters

        batch_n = train_pred.size(0)
        train_acc += (train_pred.argmax(dim=1) == data.y).sum().item()
        # The criterion returns the batch MEAN; weight it by the batch size so
        # that dividing by the sample count gives the true per-sample average.
        train_loss += batch_loss.item() * batch_n
        train_n += batch_n

    net.eval()
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            test_pred = net(data)
            batch_loss = criterion(test_pred, data.y)

            batch_n = test_pred.size(0)
            test_acc += (test_pred.argmax(dim=1) == data.y).sum().item()
            test_loss += batch_loss.item() * batch_n
            test_n += batch_n

    # max(..., 1) avoids a ZeroDivisionError when a loader is empty.
    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % \
        (epoch + 1, Epoch, time.time()-epoch_start_time, \
         train_acc/max(train_n, 1), train_loss/max(train_n, 1), \
         test_acc/max(test_n, 1), test_loss/max(test_n, 1)))
print("total %2.2f sec(s)" % (time.time()-begin))


In [69]:
# Run the first test batch through the model. Use the configured `device`
# (a hard-coded .cuda() fails on CPU-only machines), and make inference mode
# explicit so the cell does not depend on the state the training loop left behind.
data = next(iter(test_loader))
net.eval()
with torch.no_grad():
    pred = net(data.to(device)).cpu()

In [70]:
# Number of correct predictions in the batch, and the resulting accuracy.
pred_labels = pred.cpu().data.numpy().argmax(axis=1)
true_labels = data.cpu().y.numpy()
acc = np.sum(pred_labels == true_labels)
acc, acc / pred.size(0)

(19, 0.95)

In [71]:
# Predicted class per graph next to the ground-truth labels of the same batch.
(pred.cpu().data.numpy().argmax(axis=1), data.cpu().y.numpy())

(array([2, 2, 1, 0, 2, 1, 1, 0, 2, 1, 2, 2, 2, 1, 0, 2, 1, 0, 2, 2],
       dtype=int64),
 array([2, 2, 1, 0, 0, 1, 1, 0, 2, 1, 2, 2, 2, 1, 0, 2, 1, 0, 2, 2],
       dtype=int64))