In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset and checkpoints stored
# there can be addressed through ordinary file paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install required packages.
import os
import torch
# Export the installed torch version as the TORCH environment variable so the
# shell commands below can splice it into the wheel index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter and torch-sparse are fetched from the PyG wheel index page
# that corresponds to the torch build printed above.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# NOTE(review): this installs PyTorch Geometric straight from the repository,
# with no tag or commit pinned, so the installed version can differ between runs.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.1.0+cu121
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [3]:
import torch

# Load the dataset
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only point this at a file from a trusted source.
# Presumably a list of torch_geometric Data graphs (going by the file name) — confirm.
loaded_dataset = torch.load('/content/drive/MyDrive/Colab_Notebooks/Datasets/reentrancy_data_list_smart_bugs.pt')

In [4]:
from torch.utils.data import random_split

# Fraction of the graphs held out for validation.
validation_split = 0.2

# Derive the split sizes. The training set absorbs the rounding remainder, so
# the two sizes always sum to len(loaded_dataset).
num_data_points = len(loaded_dataset)
num_validation_points = int(validation_split * num_data_points)
num_train_points = num_data_points - num_validation_points

# Split with a dedicated, seeded generator so the train/validation membership
# is identical on every run; without it the split (and every metric computed
# from it) changes each time the notebook is executed.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    loaded_dataset,
    [num_train_points, num_validation_points],
    generator=split_generator,
)

In [6]:
# Sanity-check the split sizes: (training, validation).
len(train_dataset), len(val_dataset)

(2574, 643)

In [7]:
# Class balance of the training split: graphs labelled 1 first, then all others.
pos_count = sum(1 for data in train_dataset if data.y == 1)

print(pos_count)
print(len(train_dataset) - pos_count)

1307
1267


In [8]:
# Peek at the first training graph to see the shapes of its tensors.
train_dataset[0]

Data(x=[177, 48], edge_index=[2, 133], y=[1])

In [9]:
# Peek at a second training graph for comparison with the first.
train_dataset[1]

Data(x=[144, 48], edge_index=[2, 115], y=[1])

In [11]:
from torch_geometric.loader import DataLoader

# Mini-batch loaders over the two splits. Training graphs are reshuffled on
# every pass; the held-out graphs (the validation split) keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

# Walk the training loader once and describe each collated batch.
for step, data in enumerate(train_loader):
    print(
        f'Step {step + 1}:',
        '=======',
        f'Number of graphs in the current batch: {data.num_graphs}',
        sep='\n',
    )
    # Batch summary followed by one blank separator line.
    print(data, end='\n\n')

Step 1:
Number of graphs in the current batch: 64
DataBatch(x=[18357, 48], edge_index=[2, 12488], y=[64], batch=[18357], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(x=[15969, 48], edge_index=[2, 11788], y=[64], batch=[15969], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(x=[20446, 48], edge_index=[2, 14017], y=[64], batch=[20446], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(x=[16650, 48], edge_index=[2, 11801], y=[64], batch=[16650], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(x=[20628, 48], edge_index=[2, 14661], y=[64], batch=[20628], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(x=[21739, 48], edge_index=[2, 14680], y=[64], batch=[21739], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(x=[20133, 48], edge_index=[2, 14116], y=[64], batch=[20133], ptr=[65])

Step 8:
Number of graphs in the current batch: 64
DataBatch(x=[19606, 48], e

In [12]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import GraphConv


class GCN(torch.nn.Module):
    """Graph-level classifier: three ``GraphConv`` layers, mean pooling, linear head.

    Despite the class name, the message-passing layers are ``GraphConv``,
    not ``GCNConv``.

    Args:
        hidden_channels: Width of every graph-convolution layer and of the
            pooled graph embedding.
        in_channels: Number of input features per node. Defaults to 48,
            the value that was previously hard-coded.
        num_classes: Number of output logits per graph. Defaults to 2,
            the value that was previously hard-coded.
    """

    def __init__(self, hidden_channels, in_channels=48, num_classes=2):
        super().__init__()
        # Fixes the initial weights so every instance starts identically.
        # NOTE: this reseeds torch's *global* RNG on each instantiation, which
        # also affects any random operation that runs afterwards.
        torch.manual_seed(12345)
        self.conv1 = GraphConv(in_channels, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch.

        Args:
            x: Node feature matrix for all graphs in the batch.
            edge_index: Edge list for all graphs in the batch.
            batch: Vector assigning each node to its graph.

        Returns:
            Tensor of unnormalised logits, one row per graph.
        """
        # 1. Obtain node embeddings (no activation after the last layer).
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer: average node embeddings per graph.
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier; dropout is only active in training mode.
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GraphConv(48, 64)
  (conv2): GraphConv(64, 64)
  (conv3): GraphConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [13]:
from sklearn.metrics import accuracy_score, recall_score, f1_score

In [14]:
from IPython.display import Javascript
# Colab-specific: presumably caps this cell's output frame at 300px so the
# per-epoch log scrolls instead of growing the page — confirm against Colab docs.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Re-create the model so training starts from freshly initialised weights,
# regardless of what happened to any earlier `model` object.
model = GCN(hidden_channels=64)
# Adam over all model parameters with a learning rate of 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Cross-entropy over the model's class logits.
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``; performs
    one parameter update per mini-batch and returns nothing.
    """
    model.train()

    for batch in train_loader:
        # Forward pass: one row of logits per graph in the batch.
        logits = model(batch.x, batch.edge_index, batch.batch)
        # The labels are cast to integer class indices before computing the loss.
        batch_loss = criterion(logits, batch.y.long())
        # Backpropagate, apply the update, then reset gradients for the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def test(loader):
    """Evaluate the module-level ``model`` on every batch of ``loader``.

    Args:
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``y``.

    Returns:
        Tuple ``(accuracy, recall, f1)`` computed by scikit-learn over all
        graphs in ``loader``, using the metric functions' default settings.
    """
    model.eval()

    all_preds = []
    all_labels = []

    # Evaluation needs no gradients; disabling autograd avoids building a
    # computation graph for every batch, saving memory and time.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the training/test dataset.
            out = model(data.x, data.edge_index, data.batch)
            pred = out.argmax(dim=1)  # Use the class with the highest logit.
            all_preds.extend(pred.cpu().numpy())
            all_labels.extend(data.y.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    return accuracy, recall, f1


# Train, then report metrics on both splits after every epoch.
# NOTE(review): range(1, 30) stops at 29, so this runs 29 epochs — confirm
# that 30 were not intended.
for epoch in range(1, 30):
    train()
    # test() is reused for both loaders; test_loader wraps the validation split.
    train_acc, train_recall, train_f1 = test(train_loader)
    test_acc, test_recall, test_f1 = test(test_loader)
    print(f'Epoch: {epoch} ...')
    print(f'Train Acc: {train_acc:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}')
    print(f'Test Acc: {test_acc:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}')

<IPython.core.display.Javascript object>

Epoch: 1 ...
Train Acc: 0.6321, Recall: 0.8715, F1: 0.7064
Test Acc: 0.6205, Recall: 0.8547, F1: 0.6747
Epoch: 2 ...
Train Acc: 0.7040, Recall: 0.7207, F1: 0.7120
Test Acc: 0.6998, Recall: 0.7095, F1: 0.6852
Epoch: 3 ...
Train Acc: 0.7618, Recall: 0.7865, F1: 0.7703
Test Acc: 0.7481, Recall: 0.7669, F1: 0.7370
Epoch: 4 ...
Train Acc: 0.7416, Recall: 0.8462, F1: 0.7689
Test Acc: 0.7294, Recall: 0.8142, F1: 0.7348
Epoch: 5 ...
Train Acc: 0.7696, Recall: 0.7024, F1: 0.7559
Test Acc: 0.7760, Recall: 0.6959, F1: 0.7410
Epoch: 6 ...
Train Acc: 0.7786, Recall: 0.8409, F1: 0.7941
Test Acc: 0.7652, Recall: 0.8176, F1: 0.7622
Epoch: 7 ...
Train Acc: 0.7805, Recall: 0.6557, F1: 0.7521
Test Acc: 0.7823, Recall: 0.6385, F1: 0.7297
Epoch: 8 ...
Train Acc: 0.7782, Recall: 0.8845, F1: 0.8019
Test Acc: 0.7589, Recall: 0.8514, F1: 0.7648
Epoch: 9 ...
Train Acc: 0.8050, Recall: 0.8447, F1: 0.8148
Test Acc: 0.7823, Recall: 0.8176, F1: 0.7756
Epoch: 10 ...
Train Acc: 0.8124, Recall: 0.7712, F1: 0.8067
Test

In [None]:
# Save the model after training.
# The model was trained on the reentrancy dataset, so it is stored under the
# reentrancy checkpoint name, which is the same file that is loaded back
# further down. The previous 'CWE-118_gcn_model.pth' name would have
# overwritten an unrelated checkpoint and left the later load reading stale weights.
torch.save(model.state_dict(), '/content/drive/MyDrive/Colab_Notebooks/GNN/reentrancy_gcn_model.pth')

In [None]:
# Build a fresh GCN with the same hidden size as the trained model so that a
# saved state dict can be loaded into it.
reentrancy_check = GCN(hidden_channels=64)

In [None]:
# Restore the saved weights into the freshly built model.
# - map_location='cpu': the model lives on the CPU, so tensors saved from a
#   GPU session are remapped instead of failing on a CPU-only runtime.
# - weights_only=True: a state dict contains only tensors, so unpickling is
#   restricted to them rather than allowing arbitrary objects.
reentrancy_check.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/Colab_Notebooks/GNN/reentrancy_gcn_model.pth',
        map_location='cpu',
        weights_only=True,
    )
)

<All keys matched successfully>