# 🔍 Vulnerability Detection với GNN

Nhận diện lỗi bảo mật Java code (SQL Injection, Command Injection, Path Traversal, Buffer Overflow)

## 1. Setup Environment

In [None]:
# Clone repository
!git clone https://github.com/CatEatSad/DoAn_2.git
%cd DoAn_2

In [None]:
# Sanity-check the cloned data before loading it. All paths are relative, so
# this cell relies on the working directory being the repository root.

# Count JSON files under output/, skipping any whose name contains "prediction".
!echo "=== Checking vulnerable files ==="
!find output -name "*.json" -not -name "*prediction*" | wc -l

# Count every JSON file under output_safe/.
!echo "\n=== Checking safe files ==="
!find output_safe -name "*.json" | wc -l

# Show the first five entries of one category directory as a spot check.
!echo "\n=== Sample files ==="
!ls output/Command_Injection/ | head -5

In [None]:
# Install dependencies (quietly, via pip).
# PyTorch and companions come from the CUDA 11.8 wheel index; no version is pinned.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q torch-geometric
# NOTE(review): the wheel index below is pinned to torch-2.0.0+cu118 while the
# torch install above is unpinned -- confirm the installed torch version
# matches, otherwise these extension wheels may not be compatible.
!pip install -q pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
# Remaining Python-level dependencies.
!pip install -q transformers scikit-learn pandas tqdm

In [None]:
# Report the installed PyTorch build and, when a CUDA device is visible,
# the name and total memory of device 0.
import torch

print(f"PyTorch: {torch.__version__}")

cuda_ready = torch.cuda.is_available()
print(f"CUDA: {cuda_ready}")

if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {gpu_name}")
    # total_memory is divided by 1e9 to print decimal gigabytes.
    print(f"Memory: {gpu_props.total_memory / 1e9:.2f} GB")

## 2. Test Data Loading

In [None]:
import sys

# Make the project's modules importable. The membership check stops the same
# entry from piling up in sys.path each time this cell is re-run.
src_dir = '/content/DoAn_2/src'
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

from data_loader import VulnerabilityDataset

# Load the whole dataset (split='all') as a smoke test of the data pipeline.
print("Loading dataset...")
dataset = VulnerabilityDataset('/content/DoAn_2', split='all')

print(f"\nTotal samples: {len(dataset)}")

if len(dataset) > 0:
    # Print the basic properties of the first sample: node count, edge count
    # (second dimension of edge_index) and its label.
    sample = dataset[0]
    print(f"\nFirst sample:")
    print(f"  Nodes: {sample.num_nodes}")
    print(f"  Edges: {sample.edge_index.shape[1]}")
    print(f"  Label: {sample.y.item()}")
else:
    # An empty dataset means nothing was loaded; point the user at the inputs.
    print("⚠️ ERROR: No data found!")
    print("Please check if JSON files exist in output/ and output_safe/")

## 3. Quick Training (10 epochs)

In [None]:
from data_loader import create_dataloaders
from model import create_model
from train import Trainer

# Build the train / validation / test loaders from the repository root.
print("Creating dataloaders...")
loaders = create_dataloaders(
    '/content/DoAn_2',
    batch_size=8,
    train_ratio=0.7,
    val_ratio=0.15
)
train_loader, val_loader, test_loader = loaders

# Use the GPU when one is available, otherwise fall back to the CPU.
# `torch` is expected to be imported already by an earlier cell.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Instantiate the 'simplified' model variant with five output classes.
print(f"\nCreating model on {device}...")
model = create_model('simplified', num_classes=5, device=device)

# Collect the trainer settings in one place; ten epochs is only a quick test.
trainer_settings = {
    'model': model,
    'train_loader': train_loader,
    'val_loader': val_loader,
    'test_loader': test_loader,
    'device': device,
    'lr': 1e-3,
    'num_epochs': 10,
}
trainer = Trainer(**trainer_settings)

# Run training, then evaluate on the test split; later cells read
# `trainer` and `test_metrics`.
trainer.train()
test_metrics = trainer.test()

## 4. Full Training (50 epochs) - Optional

In [None]:
# Uncomment to run full training
# %cd /content/DoAn_2/src
# !python train.py

## 5. Visualize Results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Two side-by-side panels: loss curves on the left, validation accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_ax, acc_ax = axes

# Left panel: training and validation loss recorded by the trainer.
loss_curves = (
    (trainer.train_losses, 'Train Loss', 'o'),
    (trainer.val_losses, 'Val Loss', 's'),
)
for values, curve_label, curve_marker in loss_curves:
    loss_ax.plot(values, label=curve_label, marker=curve_marker)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training & Validation Loss')

# Right panel: validation accuracy recorded by the trainer.
acc_ax.plot(trainer.val_accuracies, label='Val Accuracy', marker='o', color='green')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Validation Accuracy')

# Shared cosmetics: a legend and a faint grid on both panels.
for panel in (loss_ax, acc_ax):
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Best Validation Accuracy: {trainer.best_val_acc:.4f}")

In [None]:
# Draw the test-set confusion matrix as an annotated heatmap.
# NOTE(review): the tick labels assume the class indices follow this exact
# order -- confirm against the label encoding in the data loader.
labels = ['Safe', 'Buffer', 'Cmd', 'Path', 'SQL']

plt.figure(figsize=(10, 8))

# Same labels on both axes; cells are annotated with integer counts.
heatmap_style = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    cbar_kws={'label': 'Count'},
)
sns.heatmap(test_metrics['confusion_matrix'], **heatmap_style)

plt.title('Confusion Matrix - Test Set', fontsize=14, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.tight_layout()
plt.show()

## 6. Analyze Predictions

In [None]:
from predict import VulnerabilityPredictor
import os

import torch

# Save the trained model first, creating the target directory if needed.
save_path = '/content/DoAn_2/saved_models/best_model.pth'
os.makedirs(os.path.dirname(save_path), exist_ok=True)
trainer.save_model(save_path)

# Load the predictor on the GPU when one is available and on the CPU
# otherwise, mirroring the fallback used when the model was created; a
# hard-coded 'cuda' would not work on a CPU-only runtime.
predictor_device = 'cuda' if torch.cuda.is_available() else 'cpu'
predictor = VulnerabilityPredictor(save_path, device=predictor_device)

In [None]:
# Analyze example files
examples = [
    '/content/DoAn_2/output/Command_Injection/Command_Injection_0001_vul.json',
    '/content/DoAn_2/output/SQL_Injection/SQL_Injection_0001_vul.json',
    '/content/DoAn_2/output_safe/Command_Injection/Command_Injection_0001.json',
]


def format_pattern_line(pattern):
    """Render one detected-pattern entry as a single aligned report line.

    Args:
        pattern: Mapping with 'type' and 'code' keys and an optional 'line' key.

    Returns:
        The formatted line: the type padded to 20 characters, the line
        reference padded to 5 characters ('N/A' when the key is missing),
        and the first 60 characters of the code.
    """
    # Convert to str before applying the `5s` spec: that spec raises
    # ValueError for an int, which is a likely type for a line number.
    line_label = str(pattern.get('line', 'N/A'))
    return f"  [{pattern['type']:20s}] Line {line_label:5s}: {pattern['code'][:60]}"


for example_file in examples:
    # Silently skip examples that are not present in this checkout.
    if not os.path.exists(example_file):
        continue

    print("\n" + "="*70)
    print(f"File: {os.path.basename(example_file)}")
    print("="*70)

    analysis = predictor.analyze_code_patterns(example_file)

    print("\nPatterns Detected:")
    for pattern in analysis['patterns_detected']:
        print(format_pattern_line(pattern))

    if analysis['risk_factors']:
        print("\nRisk Factors:")
        for risk in analysis['risk_factors']:
            print(f"  ⚠️  {risk['severity']} - {risk['type']}")
            print(f"      {risk['reason']}")
    else:
        print("\n✓ No risk factors detected")

## 7. Save to Google Drive (Optional)

In [None]:
# Mount Google Drive
import shutil

from google.colab import drive
drive.mount('/content/drive')

# Copy the saved model into Drive. shutil.copy raises if the source is missing
# or the destination is not writable, so the success message below is only
# printed when the copy actually happened (a failed `!cp` would not stop the cell).
shutil.copy(
    '/content/DoAn_2/saved_models/best_model.pth',
    '/content/drive/MyDrive/vulnerability_model.pth',
)
print("✓ Model saved to Google Drive!")

## 8. Download Model

In [None]:
from google.colab import files

# Hand the saved model file to Colab's download helper.
files.download('/content/DoAn_2/saved_models/best_model.pth')
# The check mark was mis-encoded (mojibake) in the original message; restored here.
print("✓ Model downloaded!")