In [1]:
#!g1.1
import os
import glob
import random
import pickle
import shutil

import plotly.express as px
import pandas as pd
import torch
import pytorch_lightning as pl
import numpy as np
# import torchtext
import torch_geometric
from tqdm import tqdm
import identify_x86_data

## Dataset loading

In [2]:
from identify_x86_graph import load_graph


In [3]:
#!g1.1
# TODO: use PyG's Dataset class

# TODO: now that we have a list of superset files we should be able to create a dataset that caches the graph conversion

from torch_geometric.data import Dataset
from torch.utils.data import random_split

class IdentifyDataset(Dataset):
    def __init__(self, root, transform=None):
        super().__init__(root, transform)
        self.vocab = open(os.path.join(root, 'raw/code.vocab')).read().splitlines()

    @property
    def raw_file_names(self):
        # find all .graph files in the root
        return glob.glob('**/*.graph', root_dir=os.path.join(self.root, 'raw'), recursive=True)

    @property
    def processed_file_names(self):
        return [f'{f}.pt' for f in self.raw_file_names]

    def download(self):
        pass

    def process(self):
        pass
    
    def get(self, idx):
        print("Loading", self.raw_paths[idx])
        data = load_graph(self.raw_paths[idx])
        if self.transform is not None:
            data = self.transform(data)
        return data

    def len(self):
        return len(self.processed_file_names)

dataset = IdentifyDataset('data')
test_dataset, train_dataset = random_split(
    dataset, 
    [0.1, 0.9], generator=torch.Generator().manual_seed(42)
)
VOCAB_SIZE = len(dataset.vocab)

Processing...
Done!


In [4]:
assert VOCAB_SIZE == 502 # this is hard-coded in identify_x86_model.py

In [5]:
test_dataset[0]

Loading data/raw/debian/buster/pulseaudio/usr_lib_pulse-12.2_modules_module-ladspa-sink.graph


Data(num_nodes=47124, x_code=[47124], x_size=[47124], y=[47124], num_edges=1446766, edge_index=[2, 1446766], edge_type=[1446766])

In [6]:
from identify_x86_model import LightningModel

In [7]:
model = LightningModel()

In [8]:
model

LightningModel(
  (model): IdentifyModel(
    (model): Sequential(
      (0): Embedding(15, 4)
      (1): Embedding(502, 32)
      (2): <torch.jit.ScriptFunction object at 0x7fc7c6830450>
      (3): RGCNConvJittable_2cbda8(36, 24, num_relations=7)
      (4): ReLU(inplace=True)
      (5): RGCNConvJittable_2cc54c(24, 16, num_relations=7)
      (6): ReLU(inplace=True)
      (7): RGCNConvJittable_2ccc99(16, 8, num_relations=7)
      (8): ReLU(inplace=True)
      (9): RGCNConvJittable_2cd3e5(8, 4, num_relations=7)
      (10): ReLU(inplace=True)
      (11): Linear(4, 2, bias=True)
    )
  )
  (train_accuracy): BinaryAccuracy()
  (train_precision): BinaryPrecision()
  (train_recall): BinaryRecall()
  (train_f1): BinaryF1Score()
  (valid_accuracy): BinaryAccuracy()
  (valid_precision): BinaryPrecision()
  (valid_recall): BinaryRecall()
  (valid_f1): BinaryF1Score()
)

In [9]:
from torch.utils.data import DataLoader
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

print("Cuda is available:", torch.cuda.is_available())

# enable medium precision to utilize tensor cores
torch.set_float32_matmul_precision('medium')

train_loader = DataLoader(train_dataset, batch_size=None, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=None, shuffle=False, num_workers=0)
val_loader = DataLoader(test_dataset, batch_size=None, shuffle=False, num_workers=0)

model = LightningModel()
num_epochs = 2000
# num_epochs = 20
# val_check_interval = len(train_loader)

checkpoint_callback_best_f1 = ModelCheckpoint(
    monitor = 'f1/val',
    mode = 'max',
    filename = 'best-f1={f1/val:.8f}-epoch={epoch}',
    save_top_k = 4,
    auto_insert_metric_name = False,
    save_last = True,
)
checkpoint_callback_all = ModelCheckpoint(
    filename = 'all-{epoch}',
    save_top_k = -1,
)

trainer = pl.Trainer(
    # enable mixed precision training
    precision=16,
    max_epochs = num_epochs,
    # val_check_interval = val_check_interval,
    log_every_n_steps = 1,
    accelerator = 'gpu',
    # accelerator = 'cpu',
    enable_progress_bar = False,
    enable_checkpointing = True,

    callbacks = [
        checkpoint_callback_best_f1,
        checkpoint_callback_all,
    ]
)
trainer.fit(model, train_loader, val_loader)

Cuda is available: True


  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type            | Params
----------------------------------------------------
0 | model           | IdentifyModel   | 27.4 K
1 | train_accuracy  | BinaryAccuracy  | 0     
2 | train_precision | BinaryPrecision | 0     
3 | train_recall    | BinaryRecall    | 0     
4 | train_f1        | BinaryF1Score   | 0     
5 | valid_accuracy  | BinaryAccuracy  | 0     
6 | valid_precision | BinaryPrecision | 0     
7 | valid_recall    | BinaryRecall    | 0     
8 | valid_f1        | BinaryF1Score   | 0     
----------------------------------------------------
27.4 K    Trainable params
0         Non-trainable params
27.4 K    Total params
0.110     Total estimated model params size (MB)
  rank_zero_warn(


Loading data/raw/debian/buster/pulseaudio/usr_lib_pulse-12.2_modules_module-ladspa-sink.graph
Loading data/raw/debian/buster/plasma-desktop/usr_lib_i386-linux-gnu_qt5_plugins_plasma_dataengine_plasma_engine_touchpad.graph


  rank_zero_warn(


Loading data/raw/debian/buster/coreutils/usr_bin_nice.graph
Loading data/raw/debian/buster/plasma-desktop/usr_lib_i386-linux-gnu_qt5_qml_org_kde_plasma_activityswitcher_libactivityswitcherextensionplugin.graph
Loading data/raw/byteweight/elf-x86/gcc_findutils_32_O0_locate.graph
Loading data/raw/debian/buster/coreutils/usr_bin_tail.graph
Loading data/raw/byteweight/elf-x86/gcc_coreutils_32_O1_ginstall.graph
Loading data/raw/debian/buster/okular/usr_lib_i386-linux-gnu_qt5_plugins_okular_generators_okularGenerator_poppler.graph
Loading data/raw/byteweight/elf-x86/gcc_coreutils_32_O1_getlimits.graph
Loading data/raw/byteweight/elf-x86/gcc_coreutils_32_O2_uniq.graph
Loading data/raw/byteweight/elf-x86/gcc_coreutils_32_O2_runcon.graph
Loading data/raw/byteweight/elf-x86/gcc_coreutils_32_O0_unlink.graph
Loading data/raw/byteweight/elf-x86/gcc_findutils_32_O1_frcode.graph
Loading data/raw/byteweight/elf-x86/gcc_coreutils_32_O3_numfmt.graph
Loading data/raw/debian/buster/pulseaudio/usr_lib_puls

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
#!g1.1
# TODO: this is not that easy...
# model.eval()
# for param in model.parameters():
    # param.requires_grad = False
model_jit = torch.jit.script(model.model)
print("Model JIT:", model_jit)

model_jit.save(f"model_jit.pt")

# torch.onnx.export(model_jit, (
#     torch.tensor([0], dtype=torch.long),
#     torch.tensor([0], dtype=torch.long),
#     torch.tensor([[0, 0]], dtype=torch.long),
#     torch.tensor([0], dtype=torch.long),
# ), f"{EXECUTABLE}.onnx", verbose=True)
# with torch.no_grad():

    # torch.onnx.export(model, (G.x_code, G.x_size, G.edge_index, G.edge_type), f"{EXECUTABLE}.onnx", verbose=True)
# torch.jit.save(model, f"{EXECUTABLE}.pt")

Model JIT: RecursiveScriptModule(
  original_name=IdentifyModel
  (model): RecursiveScriptModule(
    original_name=Sequential_d2a63c
    (module_0): RecursiveScriptModule(original_name=Embedding)
    (module_1): RecursiveScriptModule(original_name=Embedding)
    (module_3): RecursiveScriptModule(
      original_name=RGCNConvJittable_d28bee
      (aggr_module): RecursiveScriptModule(original_name=MeanAggregation)
    )
    (module_4): RecursiveScriptModule(original_name=ReLU)
    (module_5): RecursiveScriptModule(
      original_name=RGCNConvJittable_d2941e
      (aggr_module): RecursiveScriptModule(original_name=MeanAggregation)
    )
    (module_6): RecursiveScriptModule(original_name=ReLU)
    (module_7): RecursiveScriptModule(
      original_name=RGCNConvJittable_d29c7f
      (aggr_module): RecursiveScriptModule(original_name=MeanAggregation)
    )
    (module_8): RecursiveScriptModule(original_name=ReLU)
    (module_9): RecursiveScriptModule(
      original_name=RGCNConvJittable_d

In [None]:
model.model.model[-1].weight

Parameter containing:
tensor([[ 0.4243,  0.7238,  0.3613,  0.2197],
        [ 0.2963, -0.8422, -0.1653,  0.4024]], requires_grad=True)