<a href="https://colab.research.google.com/github/Bladetuab/DragonMS/blob/main/gnn_mal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/safreita1/malnet-graph.git

fatal: destination path 'malnet-graph' already exists and is not an empty directory.


In [2]:
import os
import torch
# Export the installed torch version as an environment variable so the shell
# commands below can expand it as ${TORCH} inside the wheel-index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# Install torch-scatter and torch-sparse using the find-links page for this
# torch version, then PyTorch Geometric from its GitHub repository (unpinned).
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.12.0+cu113


In [1]:
import os
import sys
import copy
import time
import torch
import numpy as np
np.set_printoptions(threshold=sys.maxsize)
import torch.nn.functional as F
import torch_geometric.transforms as T

In [2]:
from tqdm import tqdm
# from joblib import Parallel, delayed
from torch_geometric.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, f1_score

In [3]:
import sys

# Make the cloned repository and its gnn/ package importable. Each path is
# pushed to the front in turn, so the gnn/ directory ends up with the highest
# import priority.
for repo_path in ('/content/malnet-graph', '/content/malnet-graph/gnn'):
    sys.path.insert(0, repo_path)

In [35]:
from config import args
from utils import get_split_info
from dataloader import MalnetDataset
from models import GIN, GraphSAGE, MLP, GCN, SGC
from process import NodeDegree, save_model, log_info, convert_files_pytorch

In [44]:
# Model name (looked up with args['model']) -> model class.
gnn_models = dict(
    gin=GIN,
    graphsage=GraphSAGE,
    mlp=MLP,
    gcn=GCN,
    sgc=SGC,
)

# Node-feature name (looked up with args['node_feature']) -> transform instance.
node_features = dict(
    ldp=T.LocalDegreeProfile(),
    constant=T.Constant(),
    degree=NodeDegree(),
)

In [47]:
# Override selected entries of the imported `args` configuration for this run.
experiment_overrides = dict(group='type', train_ratio=1.0, malnet_tiny=True)
args.update(experiment_overrides)

In [5]:
# Mount Google Drive at /content/gdrive; the dataset archive extracted in the
# following cell is read from /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
!tar -xvf  '/content/gdrive/MyDrive/malnet-graphs-tiny.tar' -C 'malnet-graphs-tiny'

tar: /content/gdrive/MyDrive/malnet-graphs-tiny.tar: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


In [48]:
# Resolve the train/val/test file lists, their labels, and the label mapping
# for the current configuration.
(files_train, files_val, files_test,
 train_labels, val_labels, test_labels,
 label_dict) = get_split_info(args)

Number of train samples: 3500, val samples: 500, test samples: 1000


In [29]:
# Spot-check one resolved training file path.
# NOTE(review): the recorded output reads '.../malnet-graphs-tinyadware/...',
# with no '/' between the dataset directory and the class folder — presumably
# the dataset root in the config lacks a trailing separator; verify.
print(files_train[1])

/content/malnet-graph/malnet-graphs-tinyadware/airpush/CF949566DB63C8833C1D9E7B0A87E6B727D98A08D553AFE7B5DF7582F5B7C89D.edgelist


In [39]:
# Build the processed-data directory under the current working directory as
# /data/<key>=<value>/.../ with one path segment per configuration key below.
data_dir_keys = ('malnet_tiny', 'group', 'train_ratio', 'node_feature',
                 'directed_graph', 'remove_isolates', 'lcc_only', 'add_self_loops')
args['data_dir'] = os.getcwd() + '/data/' + ''.join(
    '{}={}/'.format(key, args[key]) for key in data_dir_keys
)

In [40]:
# Derive one directory per split from args['data_dir']. The val and test
# directories always use the 'train_ratio=1.0' segment, whatever ratio is
# configured for training.
ratio_segment = '/train_ratio={}'.format(args['train_ratio'])
train_dir = args['data_dir'].replace('/data/', '/data/train/')
val_dir, test_dir = (
    args['data_dir'].replace('/data/', '/data/{}/'.format(split)).replace(ratio_segment, '/train_ratio=1.0')
    for split in ('val', 'test')
)

In [42]:
# Show the resolved training-split directory.
print(train_dir)

/content/data/train/malnet_tiny=True/group=type/train_ratio=1.0/node_feature=ldp/directed_graph=True/remove_isolates=False/lcc_only=False/add_self_loops=True/


In [41]:
# Convert every split's files into its split directory, using the node-feature
# transform selected by args['node_feature'].
feature_transform = node_features[args['node_feature']]
for split_files, split_dir in ((files_train, train_dir), (files_val, val_dir), (files_test, test_dir)):
    convert_files_pytorch(args, split_files, split_dir, feature_transform)



  0%|          | 0/3500 [00:00<?, ?it/s][A[A

  0%|          | 4/3500 [00:02<31:30,  1.85it/s][A[A

FileNotFoundError: ignored

In [21]:
def get_parameter_count(model):
    """Return the total number of scalar entries over ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total


In [22]:
def train(model, device, optimizer, train_loader, train_dataset, epoch):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(x, edge_index, batch)``.
        device: Device each batch is moved to before the forward pass.
        optimizer: Optimizer stepped once per batch.
        train_loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``batch``, ``y`` and ``num_graphs``.
        train_dataset: Only its length is used, to normalise the summed loss.
        epoch: Accepted for interface compatibility; not used here.

    Returns:
        The cross-entropy loss summed over graphs divided by ``len(train_dataset)``.
    """
    model.train()

    total_loss = 0
    for batch in tqdm(train_loader):
        batch = batch.to(device)
        optimizer.zero_grad()
        logits = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = F.cross_entropy(logits, batch.y)
        batch_loss.backward()
        # cross_entropy returns the batch mean; weight it by the number of
        # graphs so the final division yields a per-graph average.
        total_loss += batch_loss.item() * batch.num_graphs
        optimizer.step()

    return total_loss / len(train_dataset)

In [23]:
def test(model, device, loader):
    model.eval()

    y_true, y_pred, y_scores = [], [], []
    for data in loader:
        data = data.to(device)
        output = model(data.x, data.edge_index, data.batch)
        pred = output.max(dim=1)[1]

        y_true.extend(data.y.detach().cpu().numpy().tolist())
        y_pred.extend(pred.detach().cpu().numpy().tolist())
        y_scores.extend(output[:, 1].tolist())  # only used in binary setting

    return y_pred, y_scores, y_true


In [24]:
def train_model(args, device, train_dataset, train_loader, val_loader, test_loader):
    """Train the model named by ``args['model']`` and return it with the best weights loaded.

    Each epoch runs one training pass, evaluates on the validation and test
    loaders, logs both scores to TensorBoard and appends a line to
    ``train_results.txt`` in ``args['log_dir']``. Whenever the validation
    score improves, the model is saved via ``save_model`` and the validation
    predictions are logged via ``log_info``.

    Args:
        args: Experiment configuration. Keys read here: ``model``, ``lr``,
            ``log_dir``, ``epochs``, ``metric``, ``quiet``, ``malnet_tiny``,
            ``train_ratio``, ``group``, ``num_layers``, ``hidden_dim``,
            ``dropout``.
        device: Device the model and batches are placed on.
        train_dataset: Passed to ``train`` to normalise the epoch loss.
        train_loader: Batches used for optimisation.
        val_loader: Batches used for model selection.
        test_loader: Batches evaluated each epoch for logging only.

    Returns:
        The model in eval mode after loading ``best_model.pt`` from
        ``args['log_dir']``.
    """
    model = gnn_models[args['model']](args).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
    writer = SummaryWriter(log_dir=args['log_dir'])

    def compute_score(y_true, y_pred):
        # Accuracy when args['metric'] == 'acc', otherwise macro-averaged F1.
        if args['metric'] == 'acc':
            return accuracy_score(y_true, y_pred)
        return f1_score(y_true, y_pred, average='macro')

    main_tag = 'Tiny={}, train_ratio={}, group={} model={}, layers={}, hidden_dims={}, learning_rate={}, dropout={}'.format(
        args['malnet_tiny'], args['train_ratio'], args['group'], args['model'], args['num_layers'], args['hidden_dim'], args['lr'], args['dropout'])

    # Start below any attainable score so the first epoch always writes a
    # checkpoint; with an initial value of 0, a run whose score never exceeded 0
    # would save nothing and then fail on, or load a stale, best_model.pt.
    best_val_score = float('-inf')
    try:
        for epoch in range(1, args['epochs'] + 1):

            start = time.time()
            train_loss = train(model, device, optimizer, train_loader, train_dataset, epoch)
            end = time.time()

            y_pred_val, y_scores_val, y_true_val = test(model, device, val_loader)
            y_pred_test, _, y_true_test = test(model, device, test_loader)

            val_score = compute_score(y_true_val, y_pred_val)
            test_score = compute_score(y_true_test, y_pred_test)

            writer.add_scalars(
                main_tag=main_tag,
                global_step=epoch,
                tag_scalar_dict={'Validation {}'.format(args['metric']): val_score, 'Test {}'.format(args['metric']): test_score}
            )

            with open(args['log_dir'] + 'train_results.txt', 'a') as f:
                f.write('Tiny={}, group={}, train_ratio={} Epoch={}, time={} seconds, model={}, # parameters={}, layers={}, hidden_dims={}, learning_rate={}, dropout={}, train_loss={}, val_{}={}, test_{}={}\n'.format(
                    args['malnet_tiny'], args['group'], args['train_ratio'], epoch, round(end-start, 2), args['model'], get_parameter_count(model), args['num_layers'], args['hidden_dim'], args['lr'], args['dropout'], train_loss, args['metric'], val_score, args['metric'], test_score))

            if not args['quiet']:
                print('Epoch: {:03d}, Train Loss: {:.7f}, Val {}: {:.7f}'.format(epoch, train_loss, args['metric'], val_score))

            if val_score > best_val_score:
                if not args['quiet']:
                    print('Improved val {} from {} to {} at epoch {}. Saving and logging model.'.format(args['metric'], best_val_score, val_score, epoch))
                best_val_score = val_score

                save_model(args, model)
                log_info(args, epoch, y_true_val, y_pred_val, y_scores_val, param_count=0, run_time=0, data_type='val')
    finally:
        # Flush and release the TensorBoard event file even if training fails.
        writer.close()

    print('Best val {}: {}'.format(args['metric'], best_val_score))

    # load best model; map_location keeps the load valid when the checkpoint
    # was written from a different device than the current one
    model.load_state_dict(torch.load(args['log_dir'] + 'best_model.pt', map_location=device))
    model.eval()

    return model

In [25]:
def run_experiment(args_og):
    """Run one full experiment: convert data, train, and evaluate the best model.

    The configuration is deep-copied, so ``args_og`` is not modified. Log and
    data directories are derived from the configuration under the current
    working directory, and the log directory is created if missing.

    Args:
        args_og: Experiment configuration mapping. Keys read here include
            ``model``, ``K``, ``malnet_tiny``, ``group``, ``train_ratio``,
            ``node_feature``, ``directed_graph``, ``remove_isolates``,
            ``lcc_only``, ``add_self_loops``, ``hidden_dim``, ``num_layers``,
            ``lr``, ``dropout``, ``epochs``, ``batch_size`` and ``metric``.

    Returns:
        A tuple ``(val_score, test_score, param_count, run_time)`` where the
        scores use ``args['metric']`` and ``run_time`` is the training time in
        seconds rounded to two decimals.
    """
    args = copy.deepcopy(args_og)

    # K is forced to 0 for every model except 'sgc'.
    if args['model'] != 'sgc':
        args['K'] = 0

    args['log_dir'] = os.getcwd() + '/results/malnet_tiny={}/group={}/train_ratio={}/node_feature={}/directed_graph={}' \
                                    '/remove_isolates={}/lcc_only={}/add_self_loops={}/model={}/K={}/hidden_dim={}' \
                                    '/num_layers={}/lr={}/dropout={}/epochs={}/'.format(args['malnet_tiny'], args['group'],
                                                        args['train_ratio'], args['node_feature'], args['directed_graph'],
                                                        args['remove_isolates'], args['lcc_only'], args['add_self_loops'],
                                                        args['model'], args['K'], args['hidden_dim'], args['num_layers'],
                                                        args['lr'], args['dropout'], args['epochs'])

    args['data_dir'] = os.getcwd() + '/data/malnet_tiny={}/group={}/train_ratio={}/node_feature={}/directed_graph={}' \
                                     '/remove_isolates={}/lcc_only={}/add_self_loops={}/'.format(args['malnet_tiny'], args['group'],
                                                        args['train_ratio'], args['node_feature'], args['directed_graph'],
                                                        args['remove_isolates'], args['lcc_only'], args['add_self_loops'])
    os.makedirs((args['log_dir']), exist_ok=True)

    # Val/test directories always use the 'train_ratio=1.0' segment, whatever
    # ratio is configured for training.
    train_dir = args['data_dir'].replace('/data/', '/data/train/')
    val_dir = args['data_dir'].replace('/data/', '/data/val/').replace('/train_ratio={}'.format(args['train_ratio']), '/train_ratio=1.0')
    test_dir = args['data_dir'].replace('/data/', '/data/test/').replace('/train_ratio={}'.format(args['train_ratio']), '/train_ratio=1.0')

    files_train, files_val, files_test, train_labels, val_labels, test_labels, label_dict = get_split_info(args)
    convert_files_pytorch(args, files_train, train_dir, node_features[args['node_feature']])
    convert_files_pytorch(args, files_val, val_dir, node_features[args['node_feature']])
    convert_files_pytorch(args, files_test, test_dir, node_features[args['node_feature']])

    train_dataset = MalnetDataset(args, root=train_dir, files=files_train, labels=train_labels)
    val_dataset = MalnetDataset(args, root=val_dir, files=files_val, labels=val_labels)
    test_dataset = MalnetDataset(args, root=test_dir, files=files_test, labels=test_labels)

    # Shuffle only the training batches: the file list appears to be grouped by
    # class folder, and unshuffled mini-batches would each be dominated by a
    # single class. Evaluation order is left deterministic.
    train_loader = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=args['batch_size'])
    test_loader = DataLoader(test_dataset, batch_size=args['batch_size'])

    args['num_classes'] = train_dataset.num_classes
    args['num_features'] = train_dataset.num_features
    args['class_indexes'] = list(label_dict.values())
    args['class_labels'] = list(label_dict.keys())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    start = time.time()
    model = train_model(args, device, train_dataset, train_loader, val_loader, test_loader)
    run_time = round(time.time() - start, 2)

    param_count = get_parameter_count(model)
    y_pred_val, y_scores_val, y_true_val = test(model, device, val_loader)
    y_pred_test, y_scores_test, y_true_test = test(model, device, test_loader)
    log_info(args, args['epochs'], y_true_test, y_pred_test, y_scores_test, param_count, run_time=run_time, data_type='test')

    def compute_score(y_true, y_pred):
        # Accuracy when args['metric'] == 'acc', otherwise macro-averaged F1.
        if args['metric'] == 'acc':
            return accuracy_score(y_true, y_pred)
        return f1_score(y_true, y_pred, average='macro')

    val_score = compute_score(y_true_val, y_pred_val)
    test_score = compute_score(y_true_test, y_pred_test)

    return val_score, test_score, param_count, run_time

In [27]:
# Run the full pipeline with the notebook configuration; the returned
# (val_score, test_score, param_count, run_time) tuple is shown as the cell output.
# NOTE(review): the recorded run stops with FileNotFoundError during file
# conversion — the dataset files must be present at the resolved paths first.
run_experiment(args)

Number of train samples: 3500, val samples: 500, test samples: 1000


  0%|          | 4/3500 [00:02<33:18,  1.75it/s]

FileNotFoundError: ignored