In [None]:
from datasets import load_from_disk
from pprint import pprint
from loguru import logger
from pathlib import Path
import pickle
import random
import zstd

The output of the cell above has been removed because it contained a warning message that exposes identifying information.

In [2]:
def decompress_data(b_str):
    """Rebuild the Python object stored in a zstd-compressed pickle payload.

    Args:
        b_str: Bytes obtained by zstd-compressing a pickled object.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code, so only feed this function
        data that comes from a trusted source.
    """
    pickled_bytes = zstd.decompress(b_str)
    return pickle.loads(pickled_bytes)
def compress_data(obj):
    """Pickle ``obj`` and zstd-compress the resulting bytes.

    Args:
        obj: Any picklable Python object.

    Returns:
        The zstd-compressed pickle of ``obj``.
    """
    pickled_bytes = pickle.dumps(obj)
    return zstd.compress(pickled_bytes)

In [3]:
# Directory holding the saved dataset; a relative path, so it is resolved
# against the notebook's working directory.
RAW_DATASET_DIR = "data/raw"
ds = load_from_disk(RAW_DATASET_DIR)

In [4]:
# Pick the example from a dedicated, seeded generator so that a fresh
# "Restart & Run All" always inspects the same row. Change EXAMPLE_SEED to
# look at a different row. A private Random instance is used so the global
# `random` state is left untouched.
EXAMPLE_SEED = 0
example_row = random.Random(EXAMPLE_SEED).choice(ds)

In [5]:
# "asm" and "addr_to_source_code_file_line" are run through decompress_data
# before use; "source_code" is taken from the row as-is.
asm, addr_to_source_code_file_line = map(
    decompress_data,
    (example_row["asm"], example_row["addr_to_source_code_file_line"]),
)
source_code = example_row["source_code"]

In [6]:
# List every entry of asm["code"], labelled with its 0-based position.
print("asm_code:")
asm_rows = asm["code"]
for asm_lineno, code_row in enumerate(asm_rows):
    print(f"asm_line {asm_lineno}: {code_row}")

asm_code:
asm_line 0: ['endbr64']
asm_line 1: ['sub', 'rsp', '8']
asm_line 2: ['mov', 'r10', 'rsi']
asm_line 3: ['mov', 'r11d', '[rsp+8+arg_0]']
asm_line 4: ['mov', 'eax', '[rsp+8+arg_8]']
asm_line 5: ['mov', 'esi', '[rsp+8+arg_10]']
asm_line 6: ['test', 'rcx', 'rcx']
asm_line 7: ['jz', 'loc_A4730']
asm_line 8: ['test', 'r9d', 'r9d']
asm_line 9: ['jle', 'loc_A46F8']
asm_line 10: ['test', 'r11d', 'r11d']
asm_line 11: ['jle', 'loc_A4710']
asm_line 12: ['cmp', 'r9d', 'r11d']
asm_line 13: ['jg', 'loc_A46D0']
asm_line 14: ['mov', '[rsp+8+arg_10]', 'esi']
asm_line 15: ['mov', 'rsi', 'r10']
asm_line 16: ['mov', '[rsp+8+arg_8]', 'eax']
asm_line 17: ['mov', '[rsp+8+arg_0]', 'r11d']
asm_line 18: ['add', 'rsp', '8']
asm_line 19: ['jmp', 'sub_9C100']
asm_line 20: ['mov', 'ecx', 'r11d']
asm_line 21: ['mov', 'edx', 'r9d']
asm_line 22: ['lea', 'rsi', 'aCp9Hmm2ijbands_2']
asm_line 23: ['mov', 'rdi', 'r10']
asm_line 24: ['xor', 'eax', 'eax']
asm_line 25: ['call', 'sub_1A7E30']
asm_line 26: ['mov', 'eax

In [7]:
# Show the anchor map: each key is printed as a hex address, each value as
# the asm line it is associated with.
print("Addr to asm code line map:")
anchor_map = asm['anchor_map']
for addr, asm_line in anchor_map.items():
    print(f"{hex(addr)}: asm_line {asm_line}")

Addr to asm code line map:
0xa4688: asm_line 2
0xa468b: asm_line 3
0xa4690: asm_line 4
0xa4694: asm_line 5
0xa4698: asm_line 6
0xa469b: asm_line 7
0xa46a1: asm_line 8
0xa46a4: asm_line 9
0xa46a6: asm_line 10
0xa46a9: asm_line 11
0xa46ab: asm_line 12
0xa46ae: asm_line 13
0xa46b4: asm_line 15
0xa46b7: asm_line 16
0xa46bb: asm_line 17
0xa46d0: asm_line 20
0xa46d3: asm_line 21
0xa46d6: asm_line 22
0xa46dd: asm_line 23
0xa46e0: asm_line 24
0xa46e2: asm_line 25
0xa46f8: asm_line 29
0xa46fb: asm_line 30
0xa4702: asm_line 31
0xa4705: asm_line 32
0xa4707: asm_line 33
0xa470c: asm_line 34
0xa4710: asm_line 35
0xa4713: asm_line 36
0xa471a: asm_line 37
0xa471d: asm_line 38
0xa471f: asm_line 39
0xa4724: asm_line 40
0xa4730: asm_line 41
0xa4737: asm_line 42
0xa473a: asm_line 43
0xa473c: asm_line 44
0xa4741: asm_line 45


The `anchor_map` above records only the assembly addresses that correspond to a source code line number. These anchors will later serve as the basis for extracting binary code snippets.

In [8]:
# Print the source text line by line, numbered the way the original file
# numbers them.
print("source_code:")
source_lines = source_code.split("\n")
# ['line'][0] is the number assigned to the first line of source_code. Look it
# up once and let enumerate count from there, instead of re-indexing the
# dict and adding the offset on every iteration.
first_source_line = addr_to_source_code_file_line["line"][0]
for lino, code_line in enumerate(source_lines, start=first_source_line):
    print(f"source_line {lino}: {code_line}")

source_code:
source_line 1519: int
source_line 1520: cp9_HMM2ijBands(CM_t *cm, char *errbuf, CP9_t *cp9, CP9Bands_t *cp9b, CP9Map_t *cp9map, int i0, int j0, int doing_search, int do_trunc, int debug_level)
source_line 1521: {
source_line 1522: 
source_line 1523:   int status;
source_line 1524:   int v;
source_line 1525: 
source_line 1526:   /* ptrs to cp9b data, for convenience */
source_line 1527:   int *imin;          /* imin[v] = first position in band on i for state v to be filled in this function. [1..M] */
source_line 1528:   int *imax;          /* imax[v] = last position in band on i for state v to be filled in this function. [1..M] */
source_line 1529:   int *jmin;          /* jmin[v] = first position in band on j for state v to be filled in this function. [1..M] */
source_line 1530:   int *jmax;          /* jmax[v] = last position in band on j for state v to be filled in this function. [1..M] */
source_line 1531:   
source_line 1532:   int nd;                  /* counter over 

In [9]:
# Show which source line each binary address is attributed to.
print("Binary code address to source code line mapping:")
address_to_location = addr_to_source_code_file_line['mapping']
for addr, location in address_to_location.items():
    # Element 1 of each mapped value is what gets reported as the source line.
    print(f"binary address {hex(addr)}: source line {location[1]}")

Binary code address to source code line mapping:
binary address 0xa4688: source line 1521
binary address 0xa468b: source line 1521
binary address 0xa4690: source line 1521
binary address 0xa4694: source line 1521
binary address 0xa4698: source line 1570
binary address 0xa469b: source line 1570
binary address 0xa46a1: source line 1571
binary address 0xa46a4: source line 1571
binary address 0xa46f8: source line 1571
binary address 0xa46fb: source line 1571
binary address 0xa4702: source line 1571
binary address 0xa4705: source line 1571
binary address 0xa4707: source line 1571
binary address 0xa470c: source line 1571
binary address 0xa46a6: source line 1572
binary address 0xa46a9: source line 1572
binary address 0xa4710: source line 1572
binary address 0xa4713: source line 1572
binary address 0xa471a: source line 1572
binary address 0xa471d: source line 1572
binary address 0xa471f: source line 1572
binary address 0xa4724: source line 1572
binary address 0xa46ab: source line 1573
binary a