In [None]:
import functools
import os
import re
import zlib
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, precision_score, 
    recall_score, f1_score, matthews_corrcoef, cohen_kappa_score
)
from torch.utils.data import DataLoader, Dataset

# --- Configuration -----------------------------------------------------------
# Seed NumPy and PyTorch up front so that weight initialisation, DataLoader
# shuffling and any randomly generated tensors repeat on "Restart & Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

EMBED_DIM = 128        # width of each feature vector; also the encoder/detector model size
MAX_PATHS = 100        # NOTE(review): not referenced by the visible cells - confirm before removing
DROPOUT_RATE = 0.1     # dropout probability
LEARNING_RATE = 0.002  # step size handed to both Adam optimisers
BATCH_SIZE = 64        # samples per DataLoader batch
EPOCHS = 20            # passes over the training set in each training stage
# NOTE(review): the saved run output only reports labels 0 and 1; confirm that
# the 'numeric' column really spans 5 classes.
NUM_CLASSES = 5  # Updated for multi-class classification

@functools.lru_cache(maxsize=50_000)
def _token_vector(token: str, embed_dim: int) -> np.ndarray:
    """Return a fixed pseudo-random vector in [0, 1) for `token` (same token, same vector)."""
    # crc32 is stable across processes, unlike the built-in hash() of str.
    seed = zlib.crc32(token.encode("utf-8"))
    return np.random.default_rng(seed).random(embed_dim, dtype=np.float32)


def ast_tokenize(code_snippet: str, max_seq_len: int = 50, embed_dim: "int | None" = None) -> torch.Tensor:
    """Turn a source snippet into a fixed-size ``(max_seq_len, embed_dim)`` float tensor.

    The previous implementation ignored ``code_snippet`` and returned fresh
    random noise on every call, so the features carried no information about
    the code and differed between epochs and between train and test.

    This version is deterministic: the snippet is split into lexical tokens
    (identifiers, integer runs, single punctuation characters) and each token
    is mapped to a fixed hashed vector. No real AST is built despite the name.

    Args:
        code_snippet: Source text. Non-string values (e.g. NaN coming from an
            empty CSV cell) yield an all-zero tensor.
        max_seq_len: Number of rows in the result; longer snippets are
            truncated and shorter ones are zero-padded.
        embed_dim: Width of each token vector. Defaults to the module-level
            ``EMBED_DIM``.

    Returns:
        A ``torch.float32`` tensor of shape ``(max_seq_len, embed_dim)``.
    """
    if embed_dim is None:
        embed_dim = EMBED_DIM

    features = np.zeros((max_seq_len, embed_dim), dtype=np.float32)
    if isinstance(code_snippet, str):
        tokens = re.findall(r"[A-Za-z_]\w*|\d+|\S", code_snippet)[:max_seq_len]
        for row, token in enumerate(tokens):
            features[row] = _token_vector(token, embed_dim)
    return torch.from_numpy(features)

class CodeSnippetDataset(Dataset):
    """Map-style dataset that pairs each function's source text with its label.

    The ``functionSource`` and ``numeric`` columns of ``dataframe`` are read
    once at construction time and stored as plain Python lists.
    """

    def __init__(self, dataframe):
        source_column = dataframe['functionSource']
        label_column = dataframe['numeric']
        self.code_snippets = source_column.tolist()
        self.labels = label_column.tolist()

    def __len__(self):
        # One sample per stored snippet.
        return len(self.code_snippets)

    def __getitem__(self, idx):
        # Returns a (source text, label) pair for position `idx`.
        snippet = self.code_snippets[idx]
        label = self.labels[idx]
        return snippet, label

class ValueFlowPathEncoder(nn.Module):
    """Two-stage LSTM encoder with attention pooling over the sequence axis.

    A bidirectional LSTM reads the input sequence, a second (unidirectional)
    LSTM reads the concatenated forward/backward states, and a learned
    softmax-weighted sum over dim 1 collapses the sequence into one vector of
    size ``embed_dim`` per batch element.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.local_encoder = nn.LSTM(
            input_size=embed_dim,
            hidden_size=embed_dim,
            bidirectional=True,
            batch_first=True,
        )
        # The bidirectional output is twice as wide as its hidden size.
        self.global_encoder = nn.LSTM(
            input_size=2 * embed_dim,
            hidden_size=embed_dim,
            batch_first=True,
        )
        self.attention = nn.Linear(in_features=embed_dim, out_features=1)

    def forward(self, x):
        local_features = self.local_encoder(x)[0]
        global_features = self.global_encoder(local_features)[0]
        # One scalar score per time step, normalised across the sequence.
        scores = self.attention(global_features)
        attn_weights = scores.softmax(dim=1)
        return torch.sum(attn_weights * global_features, dim=1)

class TransformerBasedDetector(nn.Module):
    """Transformer followed by attention pooling and a linear classifier.

    Args:
        embed_dim: Model width (``d_model``) of the transformer and the input
            feature size.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        num_layers: Depth of BOTH the encoder and the decoder stack. The
            earlier positional call only set the encoder depth, leaving the
            decoder at ``nn.Transformer``'s default of 6 layers.
        num_classes: Number of output logits.
    """

    def __init__(self, embed_dim, num_heads, num_layers, num_classes):
        super(TransformerBasedDetector, self).__init__()
        # Keyword arguments make it explicit which stack each value controls.
        self.transformer = nn.Transformer(
            d_model=embed_dim,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            batch_first=True,
        )
        self.attention = nn.Linear(embed_dim, 1)
        self.classifier = nn.Linear(embed_dim, num_classes)  # Updated for multi-class

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for a batch-first input."""
        # The same tensor is used as source and target sequence.
        features = self.transformer(x, x)
        # Softmax over dim 1 (the sequence axis) gives one pooling weight per position.
        attn_weights = torch.softmax(self.attention(features), dim=1)
        context = (attn_weights * features).sum(dim=1)
        logits = self.classifier(context)
        return logits





import pandas as pd

import re
def remo(code):
    """Strip C-style comments and blank lines from a source snippet.

    Block comments (``/* ... */``, possibly spanning lines) are replaced by a
    single space so adjacent tokens are not glued together; line comments
    (``// ...``) are removed up to the end of the line. String and character
    literals are matched first and kept verbatim, so text such as
    ``"http://host"`` is not mistaken for a comment.

    Args:
        code: Source text. Any non-string value (e.g. NaN from pandas) is
            returned unchanged.

    Returns:
        The cleaned source with blank lines removed and outer whitespace
        stripped, or ``code`` itself when it is not a string.
    """
    # Check if input is a string
    if not isinstance(code, str):
        return code

    # The old pattern r'/\.?\*/' only matched the literal text "/*/" or "/.*/",
    # so real block comments were never removed.
    token_re = re.compile(
        r'''
          "(?:\\.|[^"\\\n])*"     # string literal  -> kept
        | '(?:\\.|[^'\\\n])*'     # char literal    -> kept
        | /\*.*?\*/               # block comment   -> replaced by one space
        | //[^\n]*                # line comment    -> dropped
        ''',
        re.DOTALL | re.VERBOSE,
    )

    def _replace(match):
        text = match.group(0)
        if text.startswith('/*'):
            return ' '
        if text.startswith('//'):
            return ''
        return text

    code = token_re.sub(_replace, code)
    # Drop lines that are empty or whitespace-only (including those left
    # behind by removed comments).
    code = re.sub(r'^\s*[\n\r]', '', code, flags=re.MULTILINE)
    return code.strip()

# --- Load and clean the data -------------------------------------------------
# Strip comments and blank lines from the 'functionSource' column, then keep
# only the source text and its 'numeric' label.
# The data directory can be overridden with the ICSME_DATA_DIR environment
# variable; the default is the original hard-coded location.
DATA_DIR = Path(os.environ.get("ICSME_DATA_DIR", "/Users/user01/fahim/icsme"))

train = pd.read_csv(DATA_DIR / "train_label_dataset.csv")
test = pd.read_csv(DATA_DIR / "test_label_dataset.csv")
train['functionSource'] = train['functionSource'].apply(remo)
test['functionSource'] = test['functionSource'].apply(remo)

# Selecting the columns already yields a new frame, so no in-place reset is needed.
train = train[['functionSource', 'numeric']].reset_index(drop=True)
test = test[['functionSource', 'numeric']].reset_index(drop=True)

train_dataset = CodeSnippetDataset(train)
test_dataset = CodeSnippetDataset(test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _featurize(snippets):
    """Tokenize a batch of source strings and stack them into one tensor on `device`."""
    return torch.stack([ast_tokenize(snippet) for snippet in snippets]).to(device)


# --- Stage 1: contrastive pre-training of the encoder ------------------------
encoder = ValueFlowPathEncoder(EMBED_DIM).to(device)
contrastive_loss = nn.CosineEmbeddingLoss()
optimizer = optim.Adam(encoder.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    encoder.train()
    for code_snippets, _ in train_loader:
        code_snippets = _featurize(code_snippets)
        # The encoder has no dropout, so feeding the same tensor twice gives
        # identical embeddings, a loss of 0 and no gradient. Two independent
        # dropout masks on the input create two distinct views of each sample.
        view_i = nn.functional.dropout(code_snippets, p=DROPOUT_RATE, training=True)
        view_j = nn.functional.dropout(code_snippets, p=DROPOUT_RATE, training=True)
        z_i, z_j = encoder(view_i), encoder(view_j)
        # NOTE(review): only positive pairs (target = 1) are used; consider
        # adding in-batch negatives to guard against representation collapse.
        target = torch.ones(z_i.size(0), device=device)
        loss = contrastive_loss(z_i, z_j, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# --- Stage 2: train the detector on frozen encoder embeddings ----------------
detector = TransformerBasedDetector(EMBED_DIM, num_heads=8, num_layers=2, num_classes=NUM_CLASSES).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(detector.parameters(), lr=LEARNING_RATE)

# Only detector parameters are optimised here, so the encoder is frozen:
# running it under no_grad avoids building and back-propagating a graph whose
# gradients would never be applied.
encoder.eval()
for epoch in range(EPOCHS):
    detector.train()
    for code_snippets, labels in train_loader:
        code_snippets = _featurize(code_snippets)
        # The default collate function already returns a tensor for numeric
        # labels; as_tensor converts without the copy-construct UserWarning.
        labels = torch.as_tensor(labels, dtype=torch.long).to(device)
        with torch.no_grad():
            embeddings = encoder(code_snippets)
        # unsqueeze(1) presents each embedding as a length-1 sequence.
        logits = detector(embeddings.unsqueeze(1))
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# --- Evaluation on the test split ---------------------------------------------
detector.eval()
encoder.eval()
Y_pred, Y_true = [], []

with torch.no_grad():
    for code_snippets, labels in test_loader:
        code_snippets = _featurize(code_snippets)
        labels = torch.as_tensor(labels, dtype=torch.long).to(device)
        embeddings = encoder(code_snippets)
        logits = detector(embeddings.unsqueeze(1))
        predictions = logits.argmax(dim=1)
        Y_pred.extend(predictions.cpu().tolist())
        Y_true.extend(labels.cpu().tolist())

print("Classification Report:")
print(classification_report(Y_true, Y_pred, zero_division=0))

conf_matrix = confusion_matrix(Y_true, Y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

accuracy = accuracy_score(Y_true, Y_pred)
precision_macro = precision_score(Y_true, Y_pred, average="macro", zero_division=0)
recall_macro = recall_score(Y_true, Y_pred, average="macro", zero_division=0)
f1_macro = f1_score(Y_true, Y_pred, average="macro", zero_division=0)
precision_weighted = precision_score(Y_true, Y_pred, average="weighted", zero_division=0)
recall_weighted = recall_score(Y_true, Y_pred, average="weighted", zero_division=0)
f1_weighted = f1_score(Y_true, Y_pred, average="weighted", zero_division=0)
mcc = matthews_corrcoef(Y_true, Y_pred)
kappa = cohen_kappa_score(Y_true, Y_pred)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Precision (Weighted): {precision_weighted:.4f}")
print(f"Recall (Weighted): {recall_weighted:.4f}")
print(f"F1 Score (Weighted): {f1_weighted:.4f}")
print(f"Matthews Correlation Coefficient: {mcc:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")


  labels = torch.tensor(labels, dtype=torch.long).to(device)
  labels = torch.tensor(labels, dtype=torch.long).to(device)


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4053
           1       0.49      1.00      0.66      3947

    accuracy                           0.49      8000
   macro avg       0.25      0.50      0.33      8000
weighted avg       0.24      0.49      0.33      8000


Confusion Matrix:
[[   0 4053]
 [   0 3947]]

Accuracy: 0.4934
Precision (Macro): 0.2467
Recall (Macro): 0.5000
F1 Score (Macro): 0.3304
Precision (Weighted): 0.2434
Recall (Weighted): 0.4934
F1 Score (Weighted): 0.3260
Matthews Correlation Coefficient: 0.0000
Cohen's Kappa: 0.0000


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Compute confusion matrix
conf_matrix = confusion_matrix(Y_true, Y_pred)

# One-vs-rest counts. TP is the overall (summed) count that gets printed;
# FP, FN and TN stay per-class arrays and are summed where an overall figure
# is needed.
tp_per_class = np.diag(conf_matrix)           # correct predictions for each class
TP = tp_per_class.sum()                       # Sum of diagonal elements (True Positives)
FP = conf_matrix.sum(axis=0) - tp_per_class   # Column-wise sum minus TP
FN = conf_matrix.sum(axis=1) - tp_per_class   # Row-wise sum minus TP
# Per-class TN must subtract that class's own TP. Subtracting the total
# diagonal sum (as before) drives TN to 0 in the binary case.
TN = conf_matrix.sum() - (FP + FN + tp_per_class)

# Micro-averaged Sensitivity (Recall) & Specificity.
# The old denominator `(TP + FN).sum()` added the scalar TP once per class.
sn_denominator = TP + FN.sum()
sp_denominator = TN.sum() + FP.sum()
SN = TP / sn_denominator if sn_denominator else 0.0  # Sensitivity (Recall)
SP = TN.sum() / sp_denominator if sp_denominator else 0.0  # Specificity

# Compute MCC & Kappa
mcc = matthews_corrcoef(Y_true, Y_pred)
kappa = cohen_kappa_score(Y_true, Y_pred)

# Compute MAE & MSE
# NOTE(review): these treat class ids as numbers, which is only meaningful if
# the labels are ordinal - confirm this is intended.
mae = mean_absolute_error(Y_true, Y_pred)
mse = mean_squared_error(Y_true, Y_pred)

# Display results
print(f"\nOverall True Positives (TP): {TP}")
print(f"Overall False Positives (FP): {FP.sum()}")
print(f"Overall False Negatives (FN): {FN.sum()}")
print(f"Overall True Negatives (TN): {TN.sum()}")
print(f"\nOverall Sensitivity (Recall): {SN:.4f}")
print(f"Overall Specificity: {SP:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")

