# MANDO

In [25]:
import pandas as pd
data = pd.read_csv("/Users/user01/fahim/icsme/train_label_dataset.csv")



import re
def remo(code):
    # Check if input is a string
    if not isinstance(code, str):
        return code
        
    code = re.sub(r'/\.?\*/', '', code, flags=re.DOTALL)
    code = re.sub(r'//.*?$', '', code, flags=re.MULTILINE)
    code = re.sub(r'^\s*[\n\r]', '', code, flags=re.MULTILINE)
    return code.strip()

# Apply the function to the 'func' column (or whatever your code column is named)
data['functionSource'] = data['functionSource'].apply(remo)
data.reset_index(drop=True, inplace=True)





code_samples = data['functionSource']

In [27]:
data

Unnamed: 0,functionSource,label,numeric
0,"ng_mix_init(struct ng_devstate *dev, char *dev...",CWE-other,4
1,"execdotcmd(const char *cmd, char *defcmd, cons...",CWE-476,3
2,"setBlockIndent(QTextBlock block, int indent)\n...",CWE-119,0
3,efi_snp_notify ( struct net_device *netdev ) {...,CWE-119,0
4,"dir_ctrl(X509_LOOKUP *ctx, int cmd, const char...",CWE-other,4
...,...,...,...
17995,"buffer_put(buffer_t *b, const void *p, size_t ...",CWE-120,1
17996,"_giraffe_model_fit(GiModel *self, cpl_matrix *...",CWE-476,3
17997,gdm_server_spawn_5731 (GdmServer_8582 *serv...,CWE-469,2
17998,"main(int argc, char *argv[]) {\r\n\tWEATHERSTA...",CWE-other,4


In [29]:
import torch
import torch.nn as nn
from torch_geometric.data import HeteroData
from torch_geometric.nn import HGTConv
from torch_geometric.transforms import ToUndirected
import numpy as np
import pandas as pd

def generate_heterogeneous_graph(code_sample):
    data = HeteroData()
    num_lines = 10
    num_funcs = 5
    data['line'].x = torch.randn((num_lines, 64))
    data['func'].x = torch.randn((num_funcs, 64))
    data['line', 'NEXT', 'line'].edge_index = torch.tensor(
        [[0, 1, 2, 3, 4], [1, 2, 3, 4, 5]], dtype=torch.long
    )
    data['func', 'CALLS', 'func'].edge_index = torch.tensor(
        [[0, 1, 2], [1, 2, 3]], dtype=torch.long
    )
    data['func', 'CONTAINS', 'line'].edge_index = torch.tensor(
        [[0, 0, 1, 2], [0, 1, 2, 3]], dtype=torch.long
    )
    ToUndirected()(data)
    return data

class HeterogeneousGraphEmbeddingModel(nn.Module):
    def __init__(self, hidden_channels, out_channels, num_heads):
        super(HeterogeneousGraphEmbeddingModel, self).__init__()
        self.conv1 = HGTConv(
            in_channels=64,
            out_channels=hidden_channels,
            metadata=(['line', 'func'], [('line', 'NEXT', 'line'),
                                         ('func', 'CALLS', 'func'),
                                         ('func', 'CONTAINS', 'line')]),
            heads=num_heads
        )
        self.conv2 = HGTConv(
            in_channels=hidden_channels,
            out_channels=out_channels,
            metadata=(['line', 'func'], [('line', 'NEXT', 'line'),
                                         ('func', 'CALLS', 'func'),
                                         ('func', 'CONTAINS', 'line')]),
            heads=num_heads
        )

    def forward(self, data):
        x_dict, edge_index_dict = data.x_dict, data.edge_index_dict
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: torch.relu(x) for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

hidden_channels = 64
out_channels = 128
num_heads = 8

model = HeterogeneousGraphEmbeddingModel(hidden_channels, out_channels, num_heads)
model.eval()


embeddings = []

for code_sample in code_samples:
    graph_data = generate_heterogeneous_graph(code_sample)
    with torch.no_grad():
        embedding_dict = model(graph_data)
        func_embedding = torch.mean(embedding_dict['func'], dim=0).cpu().numpy()
        line_embedding = torch.mean(embedding_dict['line'], dim=0).cpu().numpy()
        combined_embedding = np.concatenate([func_embedding, line_embedding])
        embeddings.append(combined_embedding)

embeddings_df = pd.DataFrame(embeddings)
embeddings_df.columns = [f"mango{i+1}" for i in range(embeddings_df.shape[1])]




In [31]:

embeddings_df['label'] = data['numeric']


train= embeddings_df

In [21]:

embeddings_df['label'] = data['numeric']


test= embeddings_df

### MANDO classification

In [33]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, cohen_kappa_score, mean_squared_error, mean_absolute_error, roc_auc_score
import numpy as np
import pandas as pd

class MANDOClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, dropout_rate=0.5):
        super(MANDOClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


trai = train
ts = test
X_train = trai.iloc[:, :-1].values
y_train = trai.iloc[:, -1].values
X_test = ts.iloc[:, :-1].values
y_test = ts.iloc[:, -1].values

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

input_dim = X_train.shape[1]
hidden_dim = 128
num_classes = len(np.unique(y_train))
dropout_rate = 0.5
learning_rate = 0.001
epochs = 50
batch_size = 32


model = MANDOClassifier(input_dim, hidden_dim, num_classes, dropout_rate)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, y_pred = torch.max(outputs, 1)

    accuracy = accuracy_score(y_test, y_pred.numpy())
    class_report = classification_report(y_test, y_pred.numpy())
    conf_matrix = confusion_matrix(y_test, y_pred.numpy())
    precision = precision_score(y_test, y_pred.numpy(), average='weighted')
    recall = recall_score(y_test, y_pred.numpy(), average='weighted')
    f1 = f1_score(y_test, y_pred.numpy(), average='weighted')
    mcc = matthews_corrcoef(y_test, y_pred.numpy())
    kappa = cohen_kappa_score(y_test, y_pred.numpy())
    mse = mean_squared_error(y_test, y_pred.numpy())
    mae = mean_absolute_error(y_test, y_pred.numpy())

    y_pred_prob = model(X_test_tensor).softmax(dim=1).numpy()
    roc_auc = roc_auc_score(pd.get_dummies(y_test), y_pred_prob, average='macro', multi_class='ovr')

    torch.save(model.state_dict(), 'mando_CGN.pth')


    print("Accuracy:", accuracy)
    print("\nClassification Report:\n", class_report)
    print("\nConfusion Matrix:\n", conf_matrix)
    print("\nPrecision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Matthews Correlation Coefficient:", mcc)
    print("Cohen's Kappa Score:", kappa)
    print("ROC AUC Score:", roc_auc)
    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)


Accuracy: 0.20755555555555555

Classification Report:
               precision    recall  f1-score   support

           0       0.21      1.00      0.34       934
           1       0.00      0.00      0.00       860
           2       0.00      0.00      0.00       918
           3       0.00      0.00      0.00       909
           4       0.00      0.00      0.00       879

    accuracy                           0.21      4500
   macro avg       0.04      0.20      0.07      4500
weighted avg       0.04      0.21      0.07      4500


Confusion Matrix:
 [[934   0   0   0   0]
 [860   0   0   0   0]
 [918   0   0   0   0]
 [909   0   0   0   0]
 [879   0   0   0   0]]

Precision: 0.0430793086419753
Recall: 0.20755555555555555
F1 Score: 0.0713496094548726
Matthews Correlation Coefficient: 0.0
Cohen's Kappa Score: 0.0
ROC AUC Score: 0.4956019363093439
Mean Squared Error (MSE): 5.950444444444445
Mean Absolute Error (MAE): 1.9864444444444445


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred.numpy())

# Compute total TP, FP, FN, TN
TP = np.sum(np.diag(conf_matrix))  # Sum of diagonal (True Positives)
FP = np.sum(conf_matrix) - np.sum(np.diag(conf_matrix)) - np.sum(conf_matrix.sum(axis=1) - np.diag(conf_matrix))  # Total FP
FN = np.sum(conf_matrix.sum(axis=1) - np.diag(conf_matrix))  # Total FN
TN = np.sum(conf_matrix) - (TP + FP + FN)  # Total TN

# Compute Sensitivity (SN) and Specificity (SP)
SN = TP / (TP + FN)  # Sensitivity / Recall
SP = TN / (TN + FP)  # Specificity

# Print results
print(f"\nTrue Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"True Negatives (TN): {TN}")

print(f"\nSensitivity (SN): {SN:.4f}")
print(f"Specificity (SP): {SP:.4f}")



True Positives (TP): 934
False Positives (FP): 0
False Negatives (FN): 3566
True Negatives (TN): 0

Sensitivity (SN): 0.2076
Specificity (SP): nan


  SP = TN / (TN + FP)  # Specificity
