In [1]:
#!g1.1
import os
import glob
import random
import pickle
import shutil

import plotly.express as px
import pandas as pd
import torch
import pytorch_lightning as pl
import numpy as np
# import torchtext
import torch_geometric
from tqdm import tqdm
import identify_x86_data

## Dataset loading

In [2]:
from identify_x86_graph import load_graph, load_split

# Debugging aid for out-of-memory conditions. When True, graphs are loaded serially
# (the training DataLoader gets no worker processes), the name of every graph is
# printed as it is loaded, and the trainer progress bar is disabled.
# This can give you an idea of which graphs are too large and need to be excluded.
FIX_MEM = False


In [3]:
#!g1.1
# TODO: use PyG's Dataset class

# TODO: now that we have a list of superset files we should be able to create a dataset that caches the graph conversion

from torch_geometric.data import Dataset
from torch.utils.data import random_split, Subset

class IdentifyDataset(Dataset):
    """PyG dataset over the ``.graph`` files found below ``<root>/raw``.

    Graphs are converted on demand by ``load_graph`` in :meth:`get`; nothing is
    written to the processed directory (``process`` is a no-op).

    Attributes:
        vocab: The lines of ``<root>/raw/code.vocab``.
        split: Result of ``load_split`` on ``<root>/raw/split.txt``; it is indexed
            by program name to obtain the split label of that program.
    """

    def __init__(self, root, transform=None):
        """Set up the dataset rooted at ``root`` with an optional per-graph ``transform``."""
        root = root.rstrip('/')
        super().__init__(root, transform)
        # Context manager so the vocab file handle is closed deterministically.
        with open(os.path.join(root, 'raw/code.vocab')) as vocab_file:
            self.vocab = vocab_file.read().splitlines()
        self.split = load_split(os.path.join(root, 'raw/split.txt'))

    @property
    def raw_file_names(self):
        """Sorted paths (relative to ``<root>/raw``) of all ``.graph`` files.

        The directory is scanned once and the listing is cached on the instance:
        this property is consulted on every ``len()``/``get()`` call, so
        re-globbing each time would rescan the tree for every sample. Sorting
        makes the index -> file mapping independent of filesystem order.
        Files added after the first access are therefore not picked up.
        """
        # Looked up through __dict__ (not an attribute set in __init__) because the
        # base-class constructor may read this property before our __init__ resumes.
        cached = self.__dict__.get('_raw_file_names')
        if cached is None:
            cached = sorted(
                glob.glob('**/*.graph', root_dir=os.path.join(self.root, 'raw'), recursive=True)
            )
            self.__dict__['_raw_file_names'] = cached
        # Hand out a copy so callers cannot corrupt the cached listing.
        return list(cached)

    @property
    def program_names(self):
        """Program name for every raw graph, in index order."""
        return [self.path_to_name(path) for path in self.raw_paths]

    @property
    def splits(self):
        """Split label for every raw graph, in index order (KeyError if a name is unknown)."""
        return [self.split[name] for name in self.program_names]

    def split_indices(self, split):
        """Return the dataset indices whose split label equals ``split``."""
        return [i for i, s in enumerate(self.splits) if s == split]

    def split_names(self, split):
        """Return the program names whose split label equals ``split``, in index order."""
        # Single pass over the names instead of rebuilding the full name list
        # once per selected index.
        return [name for name in self.program_names if self.split[name] == split]

    @property
    def processed_file_names(self):
        """One ``.pt`` name per raw file; these files are never actually written."""
        return [f'{f}.pt' for f in self.raw_file_names]

    def download(self):
        """No-op: the raw files are expected to be present already."""
        pass

    def process(self):
        """No-op: graphs are converted lazily in :meth:`get`."""
        pass

    def path_to_name(self, path):
        """Strip the ``.graph`` suffix and the ``<root>/raw/`` prefix from ``path``."""
        return path.removesuffix('.graph').removeprefix(self.root).removeprefix('/raw/')

    def get(self, idx):
        """Load the graph at position ``idx`` and apply ``self.transform`` if one is set."""
        path = self.raw_paths[idx]
        name = self.path_to_name(path)
        if FIX_MEM:
            # Lets an out-of-memory crash be attributed to a specific graph.
            print("Loading", name)
        data = load_graph(path, name=name)
        if self.transform is not None:
            data = self.transform(data)
        return data

    def len(self):
        """Number of graphs in the dataset (one per raw ``.graph`` file)."""
        return len(self.raw_file_names)

# Build the dataset from the relative 'data' directory; the class reads
# raw/code.vocab, raw/split.txt and the .graph files below data/raw.
dataset = IdentifyDataset('data')

# Debugging helpers for inspecting names and split assignment:
# print(dataset.program_names)
# print(dataset.splits)
# print(*dataset.split_names('test'), sep='\n')

# Partition by the split labels loaded from raw/split.txt.
test_dataset = Subset(dataset, dataset.split_indices('test'))
train_dataset = Subset(dataset, dataset.split_indices('train'))

# Alternative (disabled): seeded random 10% / 90% split.
# test_dataset, train_dataset = random_split(
#     dataset,
#     [0.1, 0.9], generator=torch.Generator().manual_seed(42)
# )

# Number of lines in raw/code.vocab.
VOCAB_SIZE = len(dataset.vocab)

Processing...
Done!


In [4]:
# 502 is hard-coded in identify_x86_model.py, so the vocabulary on disk must match it.
assert VOCAB_SIZE == 502, f"expected a vocabulary of 502 entries, got {VOCAB_SIZE}"

In [5]:
# Display the first graph of the test split to check which fields were loaded.
test_dataset[0]

Data(name='byteweight/elf-x86/gcc_coreutils_32_O3_make-prime-list', num_nodes=5640, x_code=[5640], x_size=[5640], y=[5640], num_edges=52420, edge_index=[2, 52420], edge_type=[52420])

In [6]:
def dataset_size(dataset):
    """Return the sum of ``num_nodes`` over every sample yielded by ``dataset``."""
    total_nodes = 0
    for sample in dataset:
        total_nodes = total_nodes + sample.num_nodes
    return total_nodes


# test_size = dataset_size(test_dataset)
# train_size = dataset_size(train_dataset)
# print("Test dataset size:", test_size)
# print("Train dataset size:", train_size)
# print("Test proportion:", test_size / (train_size + test_size))
# print("Train proportion:", train_size / (train_size + test_size))

In [7]:
from identify_x86_model import LightningModel

In [8]:
# Instantiate with no arguments to inspect the architecture; the training cell
# below creates a fresh instance before fitting.
model = LightningModel()

In [9]:
# Show the module tree (layers, loss and metric objects).
model

LightningModel(
  (model): IdentifyModel(
    (model): Sequential(
      (0): Embedding(15, 4)
      (1): Embedding(502, 32)
      (2): <torch.jit.ScriptFunction object at 0x7f07bbb78130>
      (3): RGCNConvJittable_906d57(36, 24, num_relations=7)
      (4): ReLU(inplace=True)
      (5): RGCNConvJittable_907592(24, 16, num_relations=7)
      (6): ReLU(inplace=True)
      (7): RGCNConvJittable_907c9f(16, 8, num_relations=7)
      (8): ReLU(inplace=True)
      (9): RGCNConvJittable_9083ad(8, 4, num_relations=7)
      (10): ReLU(inplace=True)
      (11): Linear(4, 2, bias=True)
    )
  )
  (loss): CrossEntropyLoss()
  (train_accuracy): BinaryAccuracy()
  (train_precision): BinaryPrecision()
  (train_recall): BinaryRecall()
  (train_f1): BinaryF1Score()
  (valid_accuracy): BinaryAccuracy()
  (valid_precision): BinaryPrecision()
  (valid_recall): BinaryRecall()
  (valid_f1): BinaryF1Score()
)

In [10]:
from torch.utils.data import DataLoader
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

print("Cuda is available:", torch.cuda.is_available())

# Seed the Python, NumPy and PyTorch RNGs (and DataLoader workers) so that weight
# initialisation and the shuffled training order are repeatable across runs.
SEED = 42
pl.seed_everything(SEED, workers=True)

# enable medium precision to utilize tensor cores
torch.set_float32_matmul_precision('medium')

# With FIX_MEM the graphs are loaded in the main process, one at a time, so the
# "Loading <name>" lines printed by the dataset appear in order.
num_workers = 0 if FIX_MEM else 16

# batch_size=None disables automatic batching: every step receives one whole graph.
train_loader = DataLoader(train_dataset, batch_size=None, shuffle=True, num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=None, shuffle=False, num_workers=0)
# NOTE(review): validation runs on the test split, so the 'f1/val' metric that
# drives checkpoint selection below is measured on test data — confirm this is
# intended or carve out a separate validation split.
val_loader = DataLoader(test_dataset, batch_size=None, shuffle=False, num_workers=num_workers)

model = LightningModel()
num_epochs = 2000
# num_epochs = 20
# val_check_interval = len(train_loader)

# Keep the four checkpoints with the highest validation F1, plus the most recent one.
checkpoint_callback_best_f1 = ModelCheckpoint(
    monitor = 'f1/val',
    mode = 'max',
    filename = 'best-f1={f1/val:.8f}-epoch={epoch}',
    save_top_k = 4,
    auto_insert_metric_name = False,
    save_last = True,
)
# save_top_k=-1 keeps every checkpoint this callback writes.
checkpoint_callback_all = ModelCheckpoint(
    filename = 'all-{epoch}',
    save_top_k = -1,
)

trainer = pl.Trainer(
    # enable mixed precision training
    precision='16-mixed',
    max_epochs = num_epochs,
    # val_check_interval = val_check_interval,
    log_every_n_steps = 1,
    accelerator = 'gpu',
    # accelerator = 'cpu',
    # The progress bar is disabled in FIX_MEM mode so it does not interleave
    # with the per-graph "Loading" output.
    enable_progress_bar = not FIX_MEM,
    enable_checkpointing = True,

    callbacks = [
        checkpoint_callback_best_f1,
        checkpoint_callback_all,
    ]
)
trainer.fit(model, train_loader, val_loader)

Cuda is available: True


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type             | Params
-----------------------------------------------------
0 | model           | IdentifyModel    | 27.4 K
1 | loss            | CrossEntropyLoss | 0     
2 | train_accuracy  | BinaryAccuracy   | 0     
3 | train_precision | BinaryPrecision  | 0     
4 | train_recall    | BinaryRecall     | 0     
5 | train_f1        | BinaryF1Score    | 0     
6 | valid_accuracy  | BinaryAccuracy   | 0     
7 | valid_precision | BinaryPrecision  | 0     
8 | valid_recall    | BinaryRecall     | 0     
9 | valid_f1        | BinaryF1Score    | 0     
-----------------------------------------------------
27.4 K    Trainable params
0         Non-trainable params
27.4 K    Total params
0.110     Total estimated model params size 

Epoch 134:  84%|████████▍ | 773/918 [04:41<00:52,  2.74it/s, v_num=63]     

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [11]:
# Inspect the weight matrix of the last layer in the model's Sequential stack.
model.model.model[-1].weight

Parameter containing:
tensor([[-0.2764,  0.1149,  0.7211,  0.0184],
        [-0.1471,  0.2820, -0.1687,  0.0621]], device='cuda:0',
       requires_grad=True)