# Assemblage XDA test code

This notebook contains code to evaluate [XDA](https://github.com/CUMLSec/XDA) on [Assemblage](https://assemblage-dataset.net/) data.

The code block below generates XDA prediction files, based on the XDA authors' code. The input folder is `data-raw/assm/topredict` and the output files are written to `data-raw/assm/xda_preds`.
Please follow XDA setup [README](https://github.com/CUMLSec/XDA) first.

In [None]:
from fairseq.models.roberta import RobertaModel
import os
from collections import defaultdict
from colorama import Fore, Back, Style
import torch
import sys

# Directory where predict() writes one prediction file per input file.
XDA_PRED_PATH = "./data-raw/assm/xda_preds"

# Number of tokens (input lines) passed to the model per call in predict().
chunk_size = 500

def int2hex(s):
    """Return ``s`` unchanged.

    Despite the name, no integer-to-hex conversion is performed: the argument
    is handed back exactly as it was received.
    """
    unchanged = s
    return unchanged

def predict(filename, model):
    """Label every token of one input file with the model and save the result.

    The input file is read as one token per line; each line is stripped and
    lower-cased. Tokens are sent to the model in windows of ``chunk_size``
    and the per-token class with the highest score is mapped to a tag:
    label 0 -> ``-``, label 1 -> ``R``, label 2 -> ``F``. One
    ``"<token> <tag>"`` line per token is written to
    ``XDA_PRED_PATH/<basename of filename>``.

    If that output file already exists the input is skipped, so an
    interrupted run can be restarted without redoing finished files.

    Args:
        filename: Path of the token file to label.
        model: Object providing ``encode(str)`` and
            ``predict('funcbound', encoded)``; the driver in this file passes
            the loaded fairseq RoBERTa model.

    Returns:
        None. The predictions are written to disk.
    """
    print(f"Predicting {filename}")
    out_path = os.path.join(XDA_PRED_PATH, os.path.basename(filename))
    if os.path.exists(out_path):
        print(f"{filename} already predicted")
        return

    with open(filename, 'r') as f:
        total = [line.strip().lower() for line in f]

    # Make sure the destination exists before spending time on inference;
    # otherwise the final write fails on a fresh checkout.
    os.makedirs(XDA_PRED_PATH, exist_ok=True)

    # Tag written for each predicted class index.
    # NOTE(review): R / F presumably mean function end / function start
    # (the tag names follow the upstream XDA script) - confirm.
    tags = {0: "-", 1: "R", 2: "F"}
    xda_pred = []

    # Only the argmax of the scores is used, so gradients are never needed.
    with torch.no_grad():
        for start in tqdm(range(0, len(total), chunk_size)):
            tokens = total[start:start + chunk_size]

            encoded_tokens = model.encode(' '.join(tokens))
            logprobs = model.predict('funcbound', encoded_tokens)
            labels = logprobs.argmax(dim=2).view(-1).tolist()

            for token, label in zip(tokens, labels):
                tag = tags.get(label)
                # Labels outside 0-2 produce no output line.
                if tag is not None:
                    xda_pred.append(f"{token} {tag}")

    with open(out_path, 'w') as f:
        f.write('\n'.join(xda_pred))

# Load the fine-tuned function-boundary checkpoint together with its
# binarized data directory and the custom task definitions in finetune_tasks.
roberta = RobertaModel.from_pretrained('checkpoints/finetune_msvs_funcbound_64', 'checkpoint_best.pt',
                                       'data-bin/funcbound_msvs_64', bpe=None, user_dir='finetune_tasks')
# Switch to evaluation mode so dropout is disabled and predictions are
# deterministic; the model is only used for inference below.
roberta.eval()


from tqdm import tqdm
import multiprocessing
import random

# Folder holding the token files to label, one prediction task per file.
input_dir = 'data-raw/assm/topredict/'

pool = multiprocessing.Pool(32)
files = os.listdir(input_dir)
# Process the files in random order.
# NOTE(review): presumably so that several concurrent runs rarely pick the
# same file at the same time - confirm.
random.shuffle(files)

# Keep every AsyncResult: a discarded result silently hides any exception
# raised inside the worker.
# NOTE(review): the model is passed as a task argument, so it is pickled and
# sent to a worker for every file; a pool initializer may be cheaper.
pending = [
    (x, pool.apply_async(predict, args=(f'{input_dir}{x}', roberta)))
    for x in files
]
pool.close()

# Wait on each task so the progress bar tracks completed files, and report
# failures without aborting the remaining files.
for x, result in tqdm(pending):
    try:
        result.get()
    except Exception as exc:
        print(f"Prediction failed for {x}: {exc!r}")
pool.join()


Then evaluate the F1 score with the script provided by [XDA](https://github.com/CUMLSec/XDA/blob/main/scripts/play/play_func_bound.py).