This notebook shows the steps of extracting ASTs, building the dataset on Pytorch Geometric and then applying GNN model for graph embedding as well as predicting

In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
import torch
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm
from torch_geometric.data import Data
from sklearn.metrics import confusion_matrix, f1_score, \
    accuracy_score, precision_score, recall_score
from torch_geometric.data import Dataset

In [None]:
import clang.cindex
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font',**{'family':'serif','serif':['Palatino'], 'size'   : 24})
rc('text', usetex=True)

In [None]:
import matplotlib
import matplotlib.font_manager as fm

Uploading a processed CWE type:

In [None]:
vdisc = pd.read_csv("/content/vdisc_CWE_469.csv.gz")
vdisc["bug"] = vdisc["bug"].astype(int)


FileNotFoundError: [Errno 2] No such file or directory: '/content/vdisc_CWE_469.csv.gz'

In [None]:
vdisc.info()

AttributeError: 'DatasetDict' object has no attribute 'info'

Extracting AST

In [None]:
def save_ast(node):

    node.children = list(node.get_children())

    for child in node.children:
        counter = save_ast(child)

In [None]:
def numbering_ast_nodes(node, counter=1):

    node.identifier = counter
    counter += 1

    node.children = list(node.get_children())
    for child in node.children:
        counter = numbering_ast_nodes(child, counter)

    return counter

In [None]:
def generate_edgelist(ast_root):

    edges = [[],[]]

    def walk_tree_and_add_edges(node):
        for child in node.children:
            # edges.append([node.identifier, child.identifier])
            # walk_tree_and_add_edges(child)
            edg_0 = (node.identifier)-1
            edg_1 = (child.identifier)-1
            # edges[0].append(node.identifier)
            # edges[1].append(child.identifier)
            edges[0].append(edg_0)
            edges[1].append(edg_1)
            walk_tree_and_add_edges(child)

    walk_tree_and_add_edges(ast_root)
    return  torch.tensor(edges, dtype=torch.long)

In [None]:
def generate_features(ast_root):

    features = []

    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        #in_degree = 1
        #degree = out_degree + in_degree
        degree = out_degree
        node_id = node.identifier
        features.append([node_id, degree])

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    features_array = np.asarray(features)
    # nodes_tensor = torch.from_numpy(features_array).float()
    nodes_tensor = torch.tensor(features_array, dtype=torch.float)
    # nodes_tensor = torch.LongTensor(features).unsqueeze(1)
    return nodes_tensor

In [None]:
def clang_process(testcase, **kwargs):

    parse_list = [
        (testcase.filename, testcase.code)

    ]

    # source_file= get_source_file(testcase)

    # Parsing the source code and extracting AST using clang
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=testcase.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    save_ast(ast_root)
    numbering_ast_nodes(ast_root)

    graphs_embedding = generate_edgelist(ast_root)

    nodes_embedding = generate_features(ast_root)


    y = torch.tensor([testcase.bug], dtype=torch.int64)



    # delete clang objects
    del translation_unit
    del ast_root
    del index

    return Data(x=nodes_embedding, edge_index=graphs_embedding, y=y)

Building the dataset on Pytorch geometrics

In [None]:
class MyOwnDataset(Dataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        return []

    @property
    def processed_file_names(self):
        return 'not_implemented.pt'

    def download(self):
        # Download to `self.raw_dir`.
        pass

    def process(self):
        self.data = pd.read_csv("/content/vdisc_CWE_469.csv.gz")
        for index, vuln in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            data = clang_process(vuln)
            torch.save(data, os.path.join(self.processed_dir, f'data_{index}.pt'))

    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        data = torch.load(os.path.join(self.processed_dir,
                                 f'data_{idx}.pt'))
        return data

In [None]:
dataset = MyOwnDataset(root='/content/')

Processing...
100%|██████████| 5250/5250 [00:32<00:00, 161.87it/s]
Done!


In [None]:
len(dataset)

5250

In [None]:
print(f'Number of features: {dataset.num_features}')

Number of features: 2


  data = torch.load(os.path.join(self.processed_dir,


In [None]:
data1 = dataset[2]  # Get the first graph object.
print(f'Number of nodes: {data1.num_nodes}')
print(f'Number of edges: {data1.num_edges}')

Number of nodes: 115
Number of edges: 114


  data = torch.load(os.path.join(self.processed_dir,


In [None]:
print(dataset[2].edge_index.t())

tensor([[  0,   1],
        [  1,   2],
        [  2,   3],
        [  1,   4],
        [  4,   5],
        [  1,   6],
        [  6,   7],
        [  7,   8],
        [  7,   9],
        [  9,  10],
        [  6,  11],
        [ 11,  12],
        [ 11,  13],
        [ 13,  14],
        [ 14,  15],
        [ 15,  16],
        [  6,  17],
        [ 17,  18],
        [ 18,  19],
        [ 19,  20],
        [ 19,  21],
        [ 21,  22],
        [ 19,  23],
        [ 18,  24],
        [ 17,  25],
        [  6,  26],
        [ 26,  27],
        [ 27,  28],
        [ 26,  29],
        [ 29,  30],
        [ 29,  31],
        [ 31,  32],
        [  6,  33],
        [ 33,  34],
        [ 33,  35],
        [ 35,  36],
        [ 36,  37],
        [ 36,  38],
        [ 36,  39],
        [ 36,  40],
        [ 40,  41],
        [  6,  42],
        [ 42,  43],
        [ 43,  44],
        [ 44,  45],
        [ 44,  46],
        [ 46,  47],
        [ 44,  48],
        [ 43,  49],
        [ 42,  50],


  data = torch.load(os.path.join(self.processed_dir,


In [None]:
print(dataset[2].x)

tensor([[  1.,   1.],
        [  2.,   3.],
        [  3.,   1.],
        [  4.,   0.],
        [  5.,   1.],
        [  6.,   0.],
        [  7.,   6.],
        [  8.,   2.],
        [  9.,   0.],
        [ 10.,   1.],
        [ 11.,   0.],
        [ 12.,   2.],
        [ 13.,   0.],
        [ 14.,   1.],
        [ 15.,   1.],
        [ 16.,   1.],
        [ 17.,   0.],
        [ 18.,   2.],
        [ 19.,   2.],
        [ 20.,   3.],
        [ 21.,   0.],
        [ 22.,   1.],
        [ 23.,   0.],
        [ 24.,   0.],
        [ 25.,   0.],
        [ 26.,   0.],
        [ 27.,   2.],
        [ 28.,   1.],
        [ 29.,   0.],
        [ 30.,   2.],
        [ 31.,   0.],
        [ 32.,   1.],
        [ 33.,   0.],
        [ 34.,   2.],
        [ 35.,   0.],
        [ 36.,   1.],
        [ 37.,   4.],
        [ 38.,   0.],
        [ 39.,   0.],
        [ 40.,   0.],
        [ 41.,   1.],
        [ 42.,   0.],
        [ 43.,   3.],
        [ 44.,   2.],
        [ 45.,   3.],
        [ 

  data = torch.load(os.path.join(self.processed_dir,


In [None]:
print(dataset[2].y)

tensor([1])


  data = torch.load(os.path.join(self.processed_dir,


Split up the dataset

In [None]:
dataset = dataset.shuffle()
#one_tenth_length = int(len(dataset) * 0.1)
one_tenth_length = int(len(dataset) * 0.1)
train_dataset = dataset[:one_tenth_length * 8]
val_dataset = dataset[one_tenth_length*8:one_tenth_length * 9]
test_dataset = dataset[one_tenth_length*9:]
#test_dataset = dataset[one_tenth_length*8:one_tenth_length * 10]
len(train_dataset), len(val_dataset), len(test_dataset)
#len(train_dataset), len(test_dataset)

(4200, 525, 525)

In [None]:
from torch_geometric.data import DataLoader
NUM_GRAPHS_PER_BATCH = 256
train_loader = DataLoader(train_dataset, batch_size=NUM_GRAPHS_PER_BATCH,drop_last=True, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=NUM_GRAPHS_PER_BATCH,drop_last=True, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=NUM_GRAPHS_PER_BATCH,drop_last=True, shuffle=True)



Building the GNN model

In [None]:
import torch
from torch.nn import Linear, Dropout
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool, TopKPooling
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
embedding_size = 128
class GCN(torch.nn.Module):
    def __init__(self):
        # Init parent
        super(GCN, self).__init__()
        torch.manual_seed(42)

        # GCN layers
        self.initial_conv = GCNConv(dataset.num_features, embedding_size) #to  translate our node features into the size of the embedding
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)
        # pooling layer
        #self.pool = TopKPooling(embedding_size, ratio=0.8)
        #dropout layer
        #self.dropout = Dropout(p=0.2)

        # Output layer
        self.lin1 = Linear(embedding_size*2, 128) # linear output layer ensures that we get a continuous unbounded output value. It input is the flattened vector (embedding size *2) from the pooling layer (mean and max)
        self.lin2 = Linear(128, 128)
        self.lin3 = Linear(128, 1)

        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()

    def forward(self, x, edge_index, batch_index):
        # First Conv layer
        hidden = self.initial_conv(x, edge_index)
        hidden = F.relu(hidden)

        # Other Conv layers
        hidden = self.conv1(hidden, edge_index)
        hidden = F.relu(hidden)

        hidden = self.conv2(hidden, edge_index)
        hidden = F.relu(hidden)
        #hidden = self.dropout(hidden)
        # Global Pooling (stack different aggregations)
        hidden = torch.cat([gmp(hidden, batch_index),
                            gap(hidden, batch_index)], dim=1)
        # Apply a final (linear) classifier.
        out = self.lin1(hidden)
        out = self.act1(out)
        out = self.lin2(out)
        out = self.act2(out)
        #out = F.dropout(out, p=0.5, training=self.training)
        out = self.lin3(out)
        out = torch.sigmoid(out)

        # return out, hidden
        return out

model = GCN()
print(model)
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))

GCN(
  (initial_conv): GCNConv(2, 128)
  (conv1): GCNConv(128, 128)
  (conv2): GCNConv(128, 128)
  (lin1): Linear(in_features=256, out_features=128, bias=True)
  (lin2): Linear(in_features=128, out_features=128, bias=True)
  (lin3): Linear(in_features=128, out_features=1, bias=True)
  (act1): ReLU()
  (act2): ReLU()
)
Number of parameters:  82945


  data = torch.load(os.path.join(self.processed_dir,


In [None]:
def train():
    model.train()

    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x.float(), data.edge_index, data.batch)
        label = data.y.to(device)
        #loss = torch.sqrt(loss_fn(output, label))
        loss = loss_fn(output.squeeze(), label.float())
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(train_dataset)

In [None]:
from sklearn.metrics import roc_auc_score
def evaluate(loader):
    model.eval()

    predictions = []
    labels = []

    with torch.no_grad():
        for data in loader:

            data = data.to(device)
            # pred = model(data.x.float(), data.edge_index, data.batch).detach().cpu().numpy()
            pred = model(data.x.float(), data.edge_index, data.batch)
            label_true = data.y.to(device)
            label = data.y.detach().cpu().numpy()
            # predictions.append(pred)
            # labels.append(label)
            predictions.append(np.rint(pred.cpu().detach().numpy()))
            labels.append(label)
            loss = loss_fn(pred.squeeze(), label_true.float())
    # predictions = np.hstack(predictions)
    # labels = np.hstack(labels)
    predictions = np.concatenate(predictions).ravel()
    labels = np.concatenate(labels).ravel()

    # print(predictions)
    # print(labels)
    return accuracy_score(labels, predictions), loss


Training the model

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = GCN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

loss_fn = torch.nn.BCELoss()


print("Starting training...")
train_losses = []
val_losses = []
val_acc_list= []
train_acc_list= []
best_loss = 1000
early_stopping_counter = 0
for epoch in range(200):
    if early_stopping_counter <=  10: # = x * 5
        loss = train()
        train_losses.append(loss)
        train_acc, train_loss = evaluate(train_loader)
        #val_acc = evaluate(val_loader)
        val_acc, val_loss = evaluate(val_loader)
        val_losses.append(val_loss)
        val_acc_list.append(val_acc)
        train_acc_list.append(train_acc)

        if float(val_loss) < best_loss:
            best_loss = val_loss
            # Save the currently best model
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
        print(f"Epoch {epoch} | Train Loss {loss} | Train Accuracy{train_acc} | Validation Accuracy{val_acc} | Validation loss{val_loss}")

    else:
        print("Early stopping due to no improvement.")
        break
print(f"Finishing training with best val loss: {best_loss}")

  data = torch.load(os.path.join(self.processed_dir,


Starting training...


  data = torch.load(os.path.join(self.processed_dir,


Epoch 0 | Train Loss 0.7138057926722935 | Train Accuracy0.699951171875 | Validation Accuracy0.7265625 | Validation loss0.6457016468048096


  data = torch.load(os.path.join(self.processed_dir,
  data = torch.load(os.path.join(self.processed_dir,


Epoch 1 | Train Loss 0.5820643615722656 | Train Accuracy0.697265625 | Validation Accuracy0.6953125 | Validation loss0.5481016039848328


  data = torch.load(os.path.join(self.processed_dir,
  data = torch.load(os.path.join(self.processed_dir,


Epoch 2 | Train Loss 0.5687667011079334 | Train Accuracy0.691162109375 | Validation Accuracy0.73046875 | Validation loss0.5604127645492554


  data = torch.load(os.path.join(self.processed_dir,
  data = torch.load(os.path.join(self.processed_dir,


Epoch 3 | Train Loss 0.5672528875441778 | Train Accuracy0.7119140625 | Validation Accuracy0.71875 | Validation loss0.587308406829834


  data = torch.load(os.path.join(self.processed_dir,
  data = torch.load(os.path.join(self.processed_dir,


Epoch 4 | Train Loss 0.5643009149460565 | Train Accuracy0.703369140625 | Validation Accuracy0.7109375 | Validation loss0.5838343501091003


  data = torch.load(os.path.join(self.processed_dir,
  data = torch.load(os.path.join(self.processed_dir,


Epoch 5 | Train Loss 0.5614966110956101 | Train Accuracy0.700439453125 | Validation Accuracy0.720703125 | Validation loss0.5607606768608093


  data = torch.load(os.path.join(self.processed_dir,
  data = torch.load(os.path.join(self.processed_dir,


Epoch 6 | Train Loss 0.5607733190627325 | Train Accuracy0.701171875 | Validation Accuracy0.697265625 | Validation loss0.599475622177124


  data = torch.load(os.path.join(self.processed_dir,
  data = torch.load(os.path.join(self.processed_dir,


Epoch 7 | Train Loss 0.5630459376743862 | Train Accuracy0.7001953125 | Validation Accuracy0.69140625 | Validation loss0.5808665752410889
Early stopping due to no improvement.
Finishing training with best val loss: 0.5481016039848328


Plotting the learning curves

Printing out the performance metrics

In [None]:
NUM_GRAPHS_PER_BATCH_1 = 4835
test_loader_all = DataLoader(test_dataset, batch_size=NUM_GRAPHS_PER_BATCH_1,drop_last=True, shuffle=True)

In [None]:
# Analyze the results for all graphs
test_batch = next(iter(test_loader_all))
with torch.no_grad():
    test_batch.to(device)
    pred = model(test_batch.x.float(), test_batch.edge_index, test_batch.batch)
    accuracy= accuracy_score(test_batch.y, np.rint(pred))
    precision= precision_score(test_batch.y, np.rint(pred), zero_division=1)
    recall= recall_score(test_batch.y, np.rint(pred), zero_division=1)
    df = pd.DataFrame()
    df["y_real"] = test_batch.y.tolist()
    df["y_pred"] = pred.tolist()
    print(f"\n Confusion matrix: \n {confusion_matrix(test_batch.y, np.rint(pred))}")
    print(f"\n Accuracy: {accuracy_score(test_batch.y, np.rint(pred))}")
    print(f"\n Precision: {precision_score(test_batch.y, np.rint(pred))}")
    print(f"\n Recall: {recall_score(test_batch.y, np.rint(pred))}")
    print(f"\n F1 Score: {f1_score(test_batch.y, np.rint(pred))}")
df


 Confusion matrix: 
 [[1620  844]
 [ 461 1910]]

 Accuracy: 0.7300930713547052

 Precision: 0.6935366739288308

 Recall: 0.8055672711935892

 F1 Score: 0.7453658536585365


Unnamed: 0,y_real,y_pred
0,1,[0.4110952913761139]
1,1,[0.6756953597068787]
2,0,[0.11998370289802551]
3,0,[0.7369117140769958]
4,0,[0.13293148577213287]
...,...,...
4830,1,[0.12243560701608658]
4831,1,[0.8020007014274597]
4832,0,[0.5188013911247253]
4833,1,[0.6873726844787598]


In [None]:
import torch

# Save model
torch.save(model.state_dict(), "gnn_model.pth")

# Load model in deployment script
model.load_state_dict(torch.load("gnn_model.pth"))
model.eval()


In [4]:
!pip install streamlit



In [None]:
!streamlit run app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.126.137.109:8501[0m
[0m
