## BGGNN4



In [1]:
pip install torch_geometric

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# embedders

In [29]:
import networkx as nx
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import numpy as np
from torch.optim import Adamax
import pandas as pd

def generate_ast(cpp_code):
    """Return a fixed three-node placeholder AST graph.

    ``cpp_code`` is accepted but not inspected: every call yields the same
    directed graph in which ``root`` points at ``node1`` and ``node2``.
    """
    tree = nx.DiGraph()
    tree.add_nodes_from(["root", "node1", "node2"])
    # Attach both children to the root, in the same order as the node list.
    for child in ("node1", "node2"):
        tree.add_edge("root", child)
    return tree

def generate_cfg(cpp_code):
    """Return a fixed four-node placeholder control-flow graph.

    ``cpp_code`` is accepted but not inspected: the result always models a
    single ``if``/``else`` between ``start`` and ``end``.
    """
    transitions = [
        ("start", "if"),
        ("if", "else"),
        ("else", "end"),
        ("if", "end"),
    ]
    flow = nx.DiGraph()
    flow.add_nodes_from(["start", "if", "else", "end"])
    for source, target in transitions:
        flow.add_edge(source, target)
    return flow

def generate_dfg(cpp_code):
    """Return a fixed three-node placeholder data-flow graph.

    ``cpp_code`` is accepted but not inspected: the result is always the chain
    ``var1 -> var2 -> var3``.
    """
    variables = ["var1", "var2", "var3"]
    flow = nx.DiGraph()
    flow.add_nodes_from(variables)
    # Link each variable to its successor to form the chain.
    flow.add_edges_from(zip(variables, variables[1:]))
    return flow

def combine_graphs(ast, cfg, dfg):
    """Merge the AST, CFG and DFG into one graph.

    The graphs are composed left to right (AST with CFG, then the result with
    DFG) using ``nx.compose``, and the combined graph is returned.
    """
    return nx.compose(nx.compose(ast, cfg), dfg)

class CNN_GNN_Model(nn.Module):
    """1-D convolutional encoder applied independently to each node's feature row.

    Despite the name, no graph convolution is performed: ``forward`` reads only
    ``data.x`` and never uses the graph's edges.

    Args:
        in_channels: Input channels of the convolution. ``forward`` inserts a
            single channel axis for 2-D node features, so this must be 1 for
            that input layout.
        out_channels: Size of the embedding produced for every node.
        filter_size: Convolution kernel width (padded with ``filter_size // 2``).
        hidden_size: Number of convolution filters.
        dropout_rate: Dropout probability applied after the convolution stage.
    """

    def __init__(self, in_channels, out_channels, filter_size=3, hidden_size=200, dropout_rate=0.2):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels, hidden_size, kernel_size=filter_size, stride=1, padding=filter_size//2)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(hidden_size, out_channels)
        # Kept for backward compatibility; nothing in this class steps it.
        self.optimizer = Adamax(params=self.parameters(), lr=0.00015)

    def forward(self, data):
        """Embed every node of ``data``.

        Args:
            data: Object exposing ``x``, a ``(num_nodes, num_features)`` tensor.

        Returns:
            Tensor of shape ``(num_nodes, out_channels)``.
        """
        # (nodes, features) -> (nodes, 1, features): one channel, features as length.
        x = data.x.unsqueeze(2).transpose(1, 2)
        x = self.relu(self.conv1(x))
        # Pooling needs at least two positions; a single position is passed through.
        if x.size(2) > 1:
            x = self.pool(x)
        x = self.dropout(x)
        # Collapse whatever length remains. For length 1 this equals squeeze(2);
        # for longer inputs it keeps the result 2-D so fc1 can consume it.
        x = x.amax(dim=2)
        return self.fc1(x)

def convert_graph_to_data(graph):
    """Convert a networkx graph into a ``Data`` object.

    Nodes are numbered in ``graph.nodes()`` iteration order and edges are
    rewritten in terms of those indices.

    NOTE: node features are a single random value per node drawn from
    ``np.random.rand``; they carry no information about the graph and differ
    on every call unless the NumPy RNG is seeded by the caller.

    Args:
        graph: Graph whose nodes and edges should be converted.

    Returns:
        ``Data`` with ``x`` of shape ``(num_nodes, 1)`` and ``edge_index`` of
        shape ``(2, num_edges)``.
    """
    node_mapping = {node: idx for idx, node in enumerate(graph.nodes())}
    edge_pairs = [[node_mapping[src], node_mapping[dst]] for src, dst in graph.edges()]

    # reshape(-1, 2) keeps an edge-less graph at (0, 2), so the transpose is the
    # expected (2, 0) rather than a 1-D empty tensor.
    edge_index = torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()

    node_features = torch.tensor(np.random.rand(len(node_mapping), 1), dtype=torch.float)
    return Data(x=node_features, edge_index=edge_index)

def process_cpp_code(cpp_code, model=None):
    """Embed one code sample as a per-node feature matrix.

    The sample is turned into AST, CFG and DFG graphs, which are merged,
    converted to a ``Data`` object and passed through a ``CNN_GNN_Model``.

    Args:
        cpp_code: Source text of the function to embed.
        model: Optional model to run. When omitted, one shared
            ``CNN_GNN_Model`` is created on first use and reused for every
            later call, so all samples are embedded by the same weights
            instead of a freshly randomised network per sample.

    Returns:
        Tensor with one row per graph node. With the default model each row
        has 128 values and the tensor carries no autograd history.
    """
    ast = generate_ast(cpp_code)
    cfg = generate_cfg(cpp_code)
    dfg = generate_dfg(cpp_code)
    data = convert_graph_to_data(combine_graphs(ast, cfg, dfg))

    if model is not None:
        # Caller-supplied model: respect its train/eval mode and autograd state.
        return model(data)

    shared = getattr(process_cpp_code, "_shared_model", None)
    if shared is None:
        shared = CNN_GNN_Model(in_channels=1, out_channels=128, filter_size=3, hidden_size=200, dropout_rate=0.2)
        # Inference only: disable dropout so repeated calls are not randomly masked.
        shared.eval()
        process_cpp_code._shared_model = shared

    with torch.no_grad():
        return shared(data)





In [57]:

import pandas as pd

import re
def remo(code):
    """Strip C/C++ comments and blank lines from a source string.

    Args:
        code: Source text. Non-string values (e.g. NaN from a missing CSV
            cell) are returned unchanged.

    Returns:
        The text without ``/* ... */`` block comments, ``// ...`` line
        comments and blank lines, with surrounding whitespace trimmed.

    NOTE(review): the patterns are purely textual, so comment markers that
    appear inside string literals are removed as well.
    """
    if not isinstance(code, str):
        return code

    # Block comments: non-greedy so each /* ... */ is removed on its own;
    # DOTALL lets '.' cross newlines for multi-line comments.
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    # Line comments: from '//' to the end of that line.
    code = re.sub(r'//.*?$', '', code, flags=re.MULTILINE)
    # Lines that are empty or whitespace-only.
    code = re.sub(r'^\s*[\n\r]', '', code, flags=re.MULTILINE)
    return code.strip()

# Load the train/test splits and strip comments from the 'functionSource' column

# Read both labelled splits from disk.
train = pd.read_csv('/Users/akter/Downloads/MSR update/MSR update/ICSME version/Train-Test daatset/Splited dataset/train_label_dataset.csv')
test = pd.read_csv('/Users/akter/Downloads/MSR update/MSR update/ICSME version/Train-Test daatset/Splited dataset/test_label_dataset.csv')

# Clean the source text of every function in both splits.
train['functionSource'] = train['functionSource'].apply(remo)
test['functionSource'] = test['functionSource'].apply(remo)

# Keep only the source text and its numeric label, re-indexed from 0.
train = train[['functionSource', 'numeric']].reset_index(drop=True)
test = test[['functionSource', 'numeric']].reset_index(drop=True)

# Embed every training function; each entry is a (nodes x features) matrix.
embedding_vectors = []
for code_sample in train['functionSource']:
    embedding_vector = process_cpp_code(code_sample)
    embedding_vectors.append(embedding_vector.detach().numpy())
embedding_vectors = np.array(embedding_vectors)

# Average over the node axis to get one feature vector per function.
embedding_vectors_avg = embedding_vectors.mean(axis=1)

embedding_df = pd.DataFrame(
    embedding_vectors_avg,
    columns=[f'BGNN{i+1}' for i in range(embedding_vectors_avg.shape[1])],
)













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [58]:
# Attach the training labels (aligned on the row index) to the embedding table.
# `trainf` is the same object as `embedding_df`, not a copy.
embedding_df['label'] = train['numeric']
trainf = embedding_df

In [61]:


# Embed every test function; each entry is a (nodes x features) matrix.
embedding_vectors = []
for code_sample in test['functionSource']:
    embedding_vector = process_cpp_code(code_sample)
    embedding_vectors.append(embedding_vector.detach().numpy())
embedding_vectors = np.array(embedding_vectors)

# Average over the node axis to get one feature vector per function.
embedding_vectors_avg = embedding_vectors.mean(axis=1)

embedding_df = pd.DataFrame(
    embedding_vectors_avg,
    columns=[f'BGNN{i+1}' for i in range(embedding_vectors_avg.shape[1])],
)

# Attach the test labels (aligned on the row index); `testf` aliases `embedding_df`.
embedding_df['label'] = test['numeric']
testf = embedding_df











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [69]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled training embeddings for validation,
# keeping the label distribution the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    trainf.drop('label', axis=1),
    trainf['label'],
    stratify=trainf['label'],
    test_size=0.2,
    random_state=42,
)

# Features and labels of the separate test split.
y_test = testf['label']
X_test = testf.drop('label', axis=1)

In [83]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, 
    matthews_corrcoef, cohen_kappa_score, mean_squared_error, mean_absolute_error, 
    confusion_matrix
)

class VulnerabilityClassifier(nn.Module):
    """Three-layer fully connected classifier over embedding vectors.

    Layout: ``input_size -> 256 -> 128 -> num_classes`` with ReLU after the
    first two layers and dropout (p=0.3) after the first. ``forward`` returns
    raw, un-normalised class scores.
    """

    def __init__(self, input_size, num_classes=5):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=256)
        self.dropout = nn.Dropout(p=0.3)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        """Map a batch of feature vectors to per-class scores."""
        hidden = self.dropout(F.relu(self.fc1(x)))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return logits

def train(model, train_loader, epochs=50, lr=0.00015):
    """Fit ``model`` with Adamax and cross-entropy loss.

    Args:
        model: Network to optimise in place.
        train_loader: Iterable of batches; element 0 of each batch holds the
            features and element 1 the integer class labels.
        epochs: Number of passes over ``train_loader``.
        lr: Adamax learning rate.

    After every epoch the loss summed over that epoch's batches is printed.
    Nothing is returned.

    NOTE(review): defining this function rebinds the notebook-level name
    ``train``, which earlier cells use for the training DataFrame.
    """
    loss_fn = torch.nn.CrossEntropyLoss()
    optimiser = torch.optim.Adamax(model.parameters(), lr=lr)

    def _step(features, targets):
        # One optimisation step; returns the batch loss as a Python float.
        optimiser.zero_grad()
        batch_loss = loss_fn(model(features), targets)
        batch_loss.backward()
        optimiser.step()
        return batch_loss.item()

    for epoch in range(epochs):
        model.train()
        total_loss = sum(_step(batch[0], batch[1]) for batch in train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")



def evaluate(model, test_loader):
    """Collect ground-truth labels and arg-max predictions over a loader.

    The model is switched to evaluation mode and run without gradient
    tracking. Element 0 of each batch is fed to the model; element 1 is
    taken as the labels.

    Returns:
        ``(true_labels, predictions)`` as two flat lists in loader order.
    """
    model.eval()
    true_labels = []
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            inputs, targets = batch[0], batch[1]
            predicted = model(inputs).argmax(dim=1).cpu().numpy()
            true_labels += list(targets.cpu().numpy())
            predictions += list(predicted)
    return true_labels, predictions

def compute_metrics(y_test, final_predictions, model, X_test):
    """Compute, print and return classification metrics.

    Args:
        y_test: Ground-truth class labels.
        final_predictions: Predicted class labels, aligned with ``y_test``.
        model: Trained classifier; it is run on ``X_test`` to obtain class
            probabilities for the one-vs-rest AUC.
        X_test: Feature tensor whose rows correspond to ``y_test``.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``auc``, ``mcc``, ``kappa``, ``mse``, ``mae``, ``specificity``,
        ``sensitivity`` and ``conf_matrix``. Precision, recall and F1 are
        macro-averaged. For a 2x2 confusion matrix specificity and
        sensitivity are the usual binary values; otherwise they are computed
        one-vs-rest per class and macro-averaged.
    """
    accuracy = accuracy_score(y_test, final_predictions)
    precision = precision_score(y_test, final_predictions, average='macro')
    recall = recall_score(y_test, final_predictions, average='macro')
    f1 = f1_score(y_test, final_predictions, average='macro')

    # Score in eval mode without autograd so dropout cannot perturb the
    # probabilities; restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            probabilities = model(X_test).softmax(dim=1).cpu().numpy()
    finally:
        model.train(was_training)
    auc = roc_auc_score(y_test, probabilities, multi_class='ovr')

    mcc = matthews_corrcoef(y_test, final_predictions)
    kappa = cohen_kappa_score(y_test, final_predictions)
    # NOTE(review): MSE/MAE treat the class ids as numbers, which is only
    # meaningful if the labels are ordinal -- confirm.
    mse = mean_squared_error(y_test, final_predictions)
    mae = mean_absolute_error(y_test, final_predictions)
    conf_matrix = confusion_matrix(y_test, final_predictions)

    if conf_matrix.shape == (2, 2):
        tn, fp, fn, tp = conf_matrix.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    else:
        specificity, sensitivity = _macro_specificity_sensitivity(conf_matrix)

    print(f"AUC: {auc:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"MCC: {mcc:.4f}")
    print(f"Kappa: {kappa:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"Specificity (SP): {specificity:.4f}")
    print(f"Sensitivity (SN): {sensitivity:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc,
        'mcc': mcc,
        'kappa': kappa,
        'mse': mse,
        'mae': mae,
        'specificity': specificity,
        'sensitivity': sensitivity,
        'conf_matrix': conf_matrix
    }


def _macro_specificity_sensitivity(conf_matrix):
    """Macro-averaged one-vs-rest (specificity, sensitivity) of a confusion matrix."""
    num_classes = conf_matrix.shape[0]
    if num_classes == 0:
        return 0, 0
    total = conf_matrix.sum()
    specificities, sensitivities = [], []
    for k in range(num_classes):
        tp = conf_matrix[k, k]
        fn = conf_matrix[k, :].sum() - tp   # class k predicted as something else
        fp = conf_matrix[:, k].sum() - tp   # other classes predicted as k
        tn = total - tp - fn - fp
        specificities.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
        sensitivities.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
    return sum(specificities) / num_classes, sum(sensitivities) / num_classes

# Build tensors: every column except the last is a feature, the last is the label.
X_train_tensor = torch.tensor(trainf.iloc[:, :-1].values, dtype=torch.float)
y_train_tensor = torch.tensor(trainf.iloc[:, -1].values, dtype=torch.int64)
X_test_tensor = torch.tensor(testf.iloc[:, :-1].values, dtype=torch.float)
y_test_tensor = torch.tensor(testf.iloc[:, -1].values, dtype=torch.int64)

# Wrap the tensors in datasets and mini-batch loaders
# (training batches are shuffled, test batches keep their order).
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

# Classifier sized to the number of embedding features.
input_size = X_train_tensor.size(1)
model = VulnerabilityClassifier(input_size=input_size, num_classes=5)

# Fit on the training loader with the default epochs / learning rate.
train(model=model, train_loader=train_loader)

# Predict on the test loader.
true_labels, predictions = evaluate(model=model, test_loader=test_loader)

# Report the metrics for the test predictions.
metrics = compute_metrics(
    y_test=true_labels,
    final_predictions=predictions,
    model=model,
    X_test=X_test_tensor,
)


Epoch 1/50, Loss: 906.3503
Epoch 2/50, Loss: 905.8846
Epoch 3/50, Loss: 905.7350
Epoch 4/50, Loss: 905.2688
Epoch 5/50, Loss: 905.1337
Epoch 6/50, Loss: 904.7036
Epoch 7/50, Loss: 904.1929
Epoch 8/50, Loss: 903.7684
Epoch 9/50, Loss: 903.2058
Epoch 10/50, Loss: 902.4174
Epoch 11/50, Loss: 901.7739
Epoch 12/50, Loss: 901.0062
Epoch 13/50, Loss: 900.2687
Epoch 14/50, Loss: 899.5244
Epoch 15/50, Loss: 898.4118
Epoch 16/50, Loss: 897.3866
Epoch 17/50, Loss: 896.5391
Epoch 18/50, Loss: 895.5735
Epoch 19/50, Loss: 894.5537
Epoch 20/50, Loss: 892.8712
Epoch 21/50, Loss: 892.2434
Epoch 22/50, Loss: 890.7310
Epoch 23/50, Loss: 889.8735
Epoch 24/50, Loss: 888.7919
Epoch 25/50, Loss: 887.8616
Epoch 26/50, Loss: 886.5370
Epoch 27/50, Loss: 885.3039
Epoch 28/50, Loss: 883.9505
Epoch 29/50, Loss: 882.2047
Epoch 30/50, Loss: 880.7568
Epoch 31/50, Loss: 880.2422
Epoch 32/50, Loss: 878.9440
Epoch 33/50, Loss: 877.6307
Epoch 34/50, Loss: 876.0091
Epoch 35/50, Loss: 873.5388
Epoch 36/50, Loss: 872.4475
E