# jTrans preprocessing code for Assemblage


### 1. Flatten the Assemblage binaries folder into one folder per binary

* dataset_path : Original downloaded Assemblage folder
* flatten_dir : Destination folder
* dbfile : SQLite path

In [None]:

import sys
import os
import sqlite3
import glob
from tqdm import tqdm
import shutil
import hashlib

def getmd5(s):
    """Return the hexadecimal MD5 digest of the string *s*.

    The string is encoded with ``str.encode()`` defaults before hashing.
    """
    digest = hashlib.md5()
    digest.update(s.encode())
    return digest.hexdigest()

dbfile = 'sept25.sqlite' # SQLite database file
dataset_path = 'dataset_sept25' # Path to the dataset
flatten_dir = "dataset" # Path to the flattened dataset, choose anywhere you like, but it will be deleted first!

# Start from an empty destination folder. The removal is done with
# shutil/os instead of shelling out to `rm -rf`, so a path containing spaces
# or shell metacharacters cannot break (or hijack) the command.
if os.path.islink(flatten_dir) or os.path.isfile(flatten_dir):
    os.remove(flatten_dir)
elif os.path.isdir(flatten_dir):
    shutil.rmtree(flatten_dir)
os.makedirs(flatten_dir)

connection = sqlite3.connect(dbfile)
cursor = connection.cursor()
# A second cursor runs the per-binary PDB query, leaving the iteration over
# the outer SELECT untouched.
newcursor = connection.cursor()

# Copy every binary listed in the database (and its PDB files) into
# <flatten_dir>/<binary id>/.
infos = cursor.execute('SELECT id, path, file_name, optimization, github_url, toolset_version FROM binaries;')
for binid, path, file_name, opt, github_url, toolset_version in tqdm(infos):
    # Database paths use Windows separators; normalise them before joining.
    full_path = os.path.join(dataset_path, path.replace("\\", "/"))
    if not os.path.isfile(full_path):
        print("Missing!", full_path)
        continue
    bin_dir = os.path.join(flatten_dir, str(binid))
    os.makedirs(bin_dir, exist_ok=True)
    # Original format datautils/dataset/libcap-git-setcap-O2-8dc43f20ea80b7703f6973a1ea86e8b8
    shutil.copy(full_path, os.path.join(bin_dir, f"{binid}_{file_name}-{toolset_version}-{opt}-{getmd5(github_url)}"))
    pdbs = newcursor.execute('SELECT pdb_path FROM pdbs where binary_id = ?', (binid,))
    for (pdb_path,) in pdbs:
        pdb_rel_path = pdb_path.replace("\\", "/")
        pdb_full_path = os.path.join(dataset_path, pdb_rel_path)
        if not os.path.isfile(pdb_full_path):
            print("Missing!", pdb_full_path)
            continue
        # PDB files keep their original base name inside the binary's folder.
        shutil.copy(pdb_full_path, os.path.join(bin_dir, os.path.basename(pdb_rel_path)))

# Remove excessive pdb prefixes: each .pdb file is renamed to the part of its
# file name that follows the last underscore.
import glob
import os
# The pattern has to be an f-string. As a plain literal, "{flatten_dir}/**/*"
# looked inside a directory literally named "{flatten_dir}", so nothing was
# ever renamed.
for pdb_file in glob.glob(f"{flatten_dir}/**/*", recursive=True):
    if pdb_file.endswith(".pdb"):
        dirname = os.path.dirname(pdb_file)
        basename = os.path.basename(pdb_file)
        os.rename(pdb_file, os.path.join(dirname, basename.split("_")[-1]))

### 2. Run jTrans IDA dumping code (Optional)

You can also use the author's IDA script; this is a multiprocessing version that is otherwise the same.

In [None]:
import os
import subprocess
import multiprocessing
import time
from util.pairdata import pairdata
from subprocess import STDOUT, check_output
import glob
import shutil

ida_path = "idat64"
script_path = "./process_pe.py"

# Recreate the three working directories from scratch. Removal goes through
# shutil/os rather than a shelled-out `rm -rf`, so it does not depend on a
# POSIX shell being available.
for workdir in ("extract", "log", "idb"):
    if os.path.islink(workdir) or os.path.isfile(workdir):
        # A stray file or symlink with this name would make makedirs fail.
        os.remove(workdir)
    elif os.path.isdir(workdir):
        shutil.rmtree(workdir)
    os.makedirs(workdir)

def getTarget(path, prefixfilter=None):
    """Recursively list the regular files found under *path*.

    When *prefixfilter* is given, only files whose globbed path starts with
    at least one of its entries are kept; ``None`` disables the filter.
    """
    targets = []
    for candidate in glob.glob(f'{path}/**/*', recursive=True):
        if not os.path.isfile(candidate):
            continue
        if prefixfilter is not None and not any(candidate.startswith(prefix) for prefix in prefixfilter):
            continue
        targets.append(candidate)
    return targets

def cmd_warp(cmd, timeout):
    """Run *cmd* with stderr merged into stdout and print the command and its output.

    *timeout* is forwarded to ``check_output``; any exception it raises
    propagates to the caller. Returns ``None``.
    """
    captured = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=timeout)
    print(cmd, captured)

start = time.time()
target_list = getTarget(flatten_dir)

# Launch one IDA job per binary. Every AsyncResult is kept so that failures
# inside the workers can be reported afterwards; without calling .get() on
# them, exceptions raised by cmd_warp are silently discarded.
pool = multiprocessing.Pool(processes=128)
pending = []
for target in target_list:
    if target.lower().endswith(("lib", "pdb")):
        # Skip lib and pdb files
        continue
    filename = os.path.basename(target)
    cmd = [ida_path, f'-Llog/{filename}.log', '-c', '-A', f'-S{script_path}', f'-oidb/{filename}.idb', target]
    # 600 is the timeout that cmd_warp forwards to check_output.
    pending.append((target, pool.apply_async(cmd_warp, args=(cmd, 600, ))))


pool.close()
pool.join()

# Report failed or timed-out jobs, but keep going so the binaries that were
# processed can still be paired below.
for target, job in pending:
    try:
        job.get()
    except Exception as err:
        print("IDA failed!", target, repr(err))

from util.pairdata_assemblage_pe import pairdata
pairdata("extract")



### 3. Check function hash by IDA and PE File Module (Optional)

It is not necessary to check the hash of every function, because Windows PE functions are sometimes split into several pieces. In our experiments, IDA recovers more than 95% of functions correctly when given the PDB, and around 90% when given the function entry addresses.

In [None]:
import glob
import hashlib
import sys
import os
import sqlite3
import glob
from tqdm import tqdm
import shutil
import hashlib
import pickle
from hashlib import sha256

def getmd5(s):
    """Hash the text *s* with MD5 and return the digest as a hex string."""
    encoded = s.encode()
    return hashlib.md5(encoded).hexdigest()

# Open the SQLite database named by `dbfile` and create the cursor that the
# function-hash lookups in this section run their queries on.
connection = sqlite3.connect(dbfile)
cursor = connection.cursor()


def sha256sum(b):
    """Return the SHA-256 digest of the bytes-like object *b* as a lowercase hex string."""
    return sha256(b).hexdigest()

# Calculate the hash of each function and compare the name stored in the
# database for that hash with the key it is filed under in saved_index.pkl.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# run this on index files produced by your own IDA dumping step.
for index_path in glob.glob("extract/**/*", recursive=True):
    if not index_path.endswith("saved_index.pkl"):
        continue
    with open(index_path, "rb") as index_file:
        saved_index = pickle.load(index_file)
    for func_name in saved_index:
        print(func_name)
        for entry in saved_index[func_name]:
            # Only entries whose third-from-last item is a bytes object are
            # hashed (presumably the raw function bytes -- TODO confirm).
            func_bytes = entry[-3]
            if not isinstance(func_bytes, bytes):
                continue
            func_hash = sha256sum(func_bytes)
            # Bind the hash as a parameter. The previous query embedded it as
            # a double-quoted literal, which only SQLite builds that keep the
            # legacy double-quoted-string quirk treat as a string.
            rows = cursor.execute('SELECT name FROM functions WHERE hash = ?;', (func_hash,))
            for (db_name,) in rows:
                if db_name != func_name:
                    print("Name diff", db_name)
                    # It is probably fine, as some functions may have the same bytes
