### Import necessary modules

In [45]:
import os
import json
import zipfile
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display
from IPython.display import Javascript

In [46]:
import plotly.express as px
from plotly.offline import init_notebook_mode
# Set up plotly's offline notebook mode once, before any figure is displayed.
init_notebook_mode(connected=True) 

### Important constants (expected from user)

In [47]:
# Directory whose entries are opened as ZIP archives by the loading cell below.
DATA_DIR = "11.03"
# JSON file read below; its "instructions" list supplies category/group/description per instruction.
INSTRUCTIONS_INFO_FILE = "x86-64_instructions.json"

### Unpack archives and get tables

In [48]:
# Extraction target for the archives: the data directory name with a "_files" suffix.
FILES_DIR = "{}_files".format(DATA_DIR)

In [49]:
# Extract every ZIP archive found in DATA_DIR into FILES_DIR and load the CSV
# that shares the archive's base name into DFS. The registry key is the part
# of the archive name before the first underscore; two archives sharing that
# part overwrite each other (the later one wins).
DFS = dict()
# os.listdir() returns entries in arbitrary order. Sorting makes the insertion
# order of DFS reproducible, which matters because the helper functions resolve
# a key prefix to the FIRST matching entry.
for archive in sorted(os.listdir(DATA_DIR)):
    path = os.path.join(DATA_DIR, archive)
    # Skip sub-directories and stray non-archive files instead of crashing on them.
    if not zipfile.is_zipfile(path):
        continue
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall(FILES_DIR)
    # assumes each archive contains "<archive base name>.csv" at its top level — TODO confirm
    csv_name = f"{os.path.splitext(archive)[0]}.csv"
    DFS[archive.split('_')[0]] = pd.read_csv(os.path.join(FILES_DIR, csv_name))

### Prepare instructions data

In [50]:
# Build a lookup from link text in the x86doc index page (the instruction name)
# to the path of the linked documentation page.
INSTRUCTION_PAGES = dict()
with open("x86doc/index.html") as file:
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (results may differ between machines)
    # and emits GuessedAtParserWarning.
    index_text = BeautifulSoup(file, "html.parser")
# The first link of the page is skipped.
# NOTE(review): presumably it is not an instruction entry — confirm against index.html.
entries = index_text.find_all("a")[1:]
for entry in entries:
    # The first character of the href is dropped; what_is_instruction() later
    # prepends "x86doc" to the remainder.
    # NOTE(review): looks like hrefs have the form "./..." — confirm.
    INSTRUCTION_PAGES[entry.get_text()] = entry["href"][1:]

In [51]:
# Index the "instructions" list of the JSON file by instruction name, keeping
# only the category, group and description of every entry. A name that occurs
# more than once keeps the values of its last occurrence.
INSTRUCTIONS_INFO = dict()
with open(INSTRUCTIONS_INFO_FILE, "r") as read_file:
    instructions_info = json.load(read_file)["instructions"]
INSTRUCTIONS_INFO.update(
    (
        item["instruction"],
        {field: item[field] for field in ("category", "group", "description")},
    )
    for item in instructions_info
)

### Define utility functions

Comments:
* Every function accepts either a full key or just the beginning of a key (a prefix); an exact match takes precedence over a prefix match
* The key is usually the OS name

In [52]:
def add_dataframe(name: str, dataframe: pd.DataFrame):
    """Store ``dataframe`` in the global ``DFS`` registry under ``name``.

    An entry that already exists under the same name is replaced.
    """
    DFS.update({name: dataframe})

In [53]:
def get_dataframe(key: str):
    """Return the frame registered in ``DFS`` for ``key``.

    ``key`` may be a complete registry key or only its beginning. An exact
    match wins; otherwise the first registered key (in insertion order) that
    starts with ``key`` is used. Returns ``None`` when nothing matches.
    """
    if key in DFS:
        return DFS[key]
    for candidate, frame in DFS.items():
        if candidate.startswith(key):
            return frame
    return None

In [54]:
def remove_dataframe(key: str):
    """Delete one entry from the global ``DFS`` registry.

    The entry is ``key`` itself when it is registered, otherwise the first
    registered key that starts with ``key``. Nothing happens (and no error is
    raised) when no key matches.
    """
    if key in DFS:
        target = key
    else:
        target = next((name for name in DFS if name.startswith(key)), None)
    if target is not None:
        DFS.pop(target)

In [55]:
def remove_filename_column(key: str, show: bool=False, number_of_rows_to_show: int=5):
    """Return a copy of the selected frame without its "filename" column.

    ``key`` is a registry key or a key prefix (exact match first, otherwise
    the first key starting with ``key``). The registered frame itself is not
    modified. With ``show=True`` the first ``number_of_rows_to_show`` rows of
    the result are displayed. Returns ``None`` when no key matches.
    """
    if key in DFS:
        resolved = key
    else:
        resolved = next((name for name in DFS if name.startswith(key)), None)
    if resolved is None:
        return None
    without_filename = DFS[resolved].drop(columns="filename")
    if show:
        display(without_filename.head(number_of_rows_to_show))
    return without_filename

In [56]:
def head(key: str, number_of_rows: int=5, show: bool=False):
    """Return the first ``number_of_rows`` rows of the selected frame.

    ``key`` is a registry key or a key prefix (exact match first, otherwise
    the first key starting with ``key``). With ``show=True`` the resolved key
    is printed and the rows are displayed. Returns ``None`` when no key
    matches.
    """
    if key in DFS:
        resolved = key
    else:
        resolved = next((name for name in DFS if name.startswith(key)), None)
    if resolved is None:
        return None
    df_head = DFS[resolved].head(number_of_rows)
    if show:
        print(f"{resolved}:")
        display(df_head)
    return df_head

In [57]:
# If keys == None, all keys will be used
def find_instruction(
    instruction: str,
    keys: None|list[str]=None,
    only_filenames: bool=True,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    if keys is None:
        keys = list(DFS)
    tables = dict()
    for key in keys:
        for df_key in DFS:
            if df_key.startswith(key):
                if key in DFS and df_key != key:
                    continue
                try:
                    table = DFS[df_key][DFS[df_key][instruction] != 0]
                    if only_filenames:
                        table = list(table.filename)
                    tables[df_key] = table
                except KeyError:
                    tables[df_key] = None
    if show:
        for key in tables:
            print(f"{key}:")
            if tables[key] is None:
                continue
            if only_filenames:
                for filename in tables[key]:
                    print(f"\t{filename}")
            else:
                display(tables[key].head(number_of_rows_to_show))
            print()
            
    return tables

In [58]:
def what_is_instruction(
    instruction: str
):
    """Open the documentation page of ``instruction`` in a new browser window.

    The name is upper-cased and looked up in ``INSTRUCTION_PAGES``. When it is
    unknown, "Instruction is not found." is printed and nothing is opened.

    Only the failed lookup is handled here. Any other failure (for example an
    error raised while displaying the Javascript object) propagates instead of
    being reported, misleadingly, as an unknown instruction.
    """
    try:
        page = INSTRUCTION_PAGES[instruction.upper()]
    except KeyError:
        print("Instruction is not found.")
        return
    path = f"x86doc{page}"
    display(Javascript('window.open("{url}");'.format(url=path)))

In [59]:
# Use this function, for example, to leave only files existing in a particular directory
def files_starting_with(
    key: str,
    beginning: str,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Return the rows whose "filename" starts with ``beginning``.

    ``key`` is a registry key or a key prefix (exact match first, otherwise
    the first key starting with ``key``). The result is a new frame with
    "filename" as its first column, the remaining columns in their original
    order and a fresh 0..n-1 index; the registered frame is not modified.
    With ``show=True`` the first ``number_of_rows_to_show`` rows are
    displayed. Returns ``None`` when no key matches.
    """
    for df_key in DFS:
        if not df_key.startswith(key):
            continue
        if key in DFS and df_key != key:
            continue
        source = DFS[df_key]
        # Select with a boolean mask rather than by filename label: looking up
        # a label list on a filename index returns every row of a duplicated
        # filename once per occurrence of the label, multiplying those rows.
        mask = pd.Series(
            [name.startswith(beginning) for name in source["filename"]],
            index=source.index,
            dtype=bool,
        )
        selected = source.loc[mask].reset_index(drop=True)
        ordered = ["filename"] + [column for column in selected.columns if column != "filename"]
        df = selected[ordered]
        if show:
            display(df.head(number_of_rows_to_show))
        return df

In [60]:
# Use this function, for example, to leave only the rows related to a particular executable file
def files_containing(
    key: str,
    substring: str,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Return the rows whose "filename" contains ``substring``.

    ``key`` is a registry key or a key prefix (exact match first, otherwise
    the first key starting with ``key``). The result is a new frame with
    "filename" as its first column, the remaining columns in their original
    order and a fresh 0..n-1 index; the registered frame is not modified.
    With ``show=True`` the first ``number_of_rows_to_show`` rows are
    displayed. Returns ``None`` when no key matches.
    """
    for df_key in DFS:
        if not df_key.startswith(key):
            continue
        if key in DFS and df_key != key:
            continue
        source = DFS[df_key]
        # Select with a boolean mask rather than by filename label: looking up
        # a label list on a filename index returns every row of a duplicated
        # filename once per occurrence of the label, multiplying those rows.
        mask = pd.Series(
            [substring in name for name in source["filename"]],
            index=source.index,
            dtype=bool,
        )
        selected = source.loc[mask].reset_index(drop=True)
        ordered = ["filename"] + [column for column in selected.columns if column != "filename"]
        df = selected[ordered]
        if show:
            display(df.head(number_of_rows_to_show))
        return df

In [61]:
def total_instruction_usage(key: str, to_dict: bool=False, show: bool=False):
    """Sum every non-filename column of the selected frame.

    ``key`` is a registry key or a key prefix (exact match first, otherwise
    the first key starting with ``key``). Returns the column sums as a pandas
    Series, or as a plain dict when ``to_dict=True``. With ``show=True`` one
    "<column>: <sum>" line is printed per column. Returns ``None`` when no key
    matches.
    """
    if key in DFS:
        resolved = key
    else:
        resolved = next((name for name in DFS if name.startswith(key)), None)
    if resolved is None:
        return None
    total = remove_filename_column(resolved).sum()
    total_dict = dict(total)
    if show:
        for instruction, count in total_dict.items():
            print(f"{instruction}: {count}")
    return total_dict if to_dict else total

In [62]:
def divide_into_categories(
    key: str,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Collapse the per-instruction columns of a frame into per-category sums.

    ``key`` is a registry key or a key prefix (exact match first, otherwise
    the first key starting with ``key``). Every column except "filename" is
    mapped to a category through ``INSTRUCTIONS_INFO``: first by its
    upper-cased name, then by the upper-cased name without its last character,
    and to "Other" when neither is known. Columns of the same category are
    added together.

    Returns a new frame holding "filename" (when present) followed by one
    column per category, in order of first appearance; the registered frame is
    not modified. With ``show=True`` the first ``number_of_rows_to_show`` rows
    are displayed. Returns ``None`` when no key matches.
    """
    for df_key in DFS:
        if not df_key.startswith(key):
            continue
        if key in DFS and df_key != key:
            continue
        source = DFS[df_key]
        # Accumulate the sums outside of the frame. Adding category columns to
        # a working copy and dropping instruction columns one by one re-copies
        # the frame for every column, and a source column that is itself named
        # like a category (e.g. "Other") would get doubled and then dropped.
        totals = dict()
        for column in source.columns:
            if column == "filename":
                continue
            column_upper = column.upper()
            if column_upper in INSTRUCTIONS_INFO:
                category = INSTRUCTIONS_INFO[column_upper]["category"]
            elif column_upper[:-1] in INSTRUCTIONS_INFO:
                # NOTE(review): presumably handles mnemonics with a one-letter suffix — confirm.
                category = INSTRUCTIONS_INFO[column_upper[:-1]]["category"]
            else:
                category = "Other"
            if category in totals:
                totals[category] = totals[category] + source[column]
            else:
                totals[category] = source[column]
        df = source[[column for column in source.columns if column == "filename"]].copy()
        for category, counts in totals.items():
            df[category] = counts
        if show:
            display(df.head(number_of_rows_to_show))
        return df

In [63]:
def divide_into_groups(
    key: str,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Collapse the per-instruction columns of a frame into per-group sums.

    ``key`` is a registry key or a key prefix (exact match first, otherwise
    the first key starting with ``key``). Every column except "filename" is
    mapped to a group through ``INSTRUCTIONS_INFO``: first by its upper-cased
    name, then by the upper-cased name without its last character, and to
    "Other" when neither is known. Columns of the same group are added
    together.

    Returns a new frame holding "filename" (when present) followed by one
    column per group, in order of first appearance; the registered frame is
    not modified. With ``show=True`` the first ``number_of_rows_to_show`` rows
    are displayed. Returns ``None`` when no key matches.
    """
    for df_key in DFS:
        if not df_key.startswith(key):
            continue
        if key in DFS and df_key != key:
            continue
        source = DFS[df_key]
        # Accumulate the sums outside of the frame. Adding group columns to a
        # working copy and dropping instruction columns one by one re-copies
        # the frame for every column, and a source column that is itself named
        # like a group (e.g. "Other") would get doubled and then dropped.
        totals = dict()
        for column in source.columns:
            if column == "filename":
                continue
            column_upper = column.upper()
            if column_upper in INSTRUCTIONS_INFO:
                group = INSTRUCTIONS_INFO[column_upper]["group"]
            elif column_upper[:-1] in INSTRUCTIONS_INFO:
                # NOTE(review): presumably handles mnemonics with a one-letter suffix — confirm.
                group = INSTRUCTIONS_INFO[column_upper[:-1]]["group"]
            else:
                group = "Other"
            if group in totals:
                totals[group] = totals[group] + source[column]
            else:
                totals[group] = source[column]
        df = source[[column for column in source.columns if column == "filename"]].copy()
        for group, counts in totals.items():
            df[group] = counts
        if show:
            display(df.head(number_of_rows_to_show))
        return df

In [64]:
# If keys == None, all keys will be used
def sum_histogram(
    keys: list[str]|None=None,
    percent: bool=False,
    ascending: bool=False,
    width: int=30000
):
    """Plot a grouped bar chart of the column sums of several frames.

    Every entry of ``keys`` is a registry key or a key prefix. An exact key
    selects only that frame; a prefix selects EVERY registered frame whose key
    starts with it. ``keys=None`` means all registered frames.

    For each selected frame the "filename" column is removed and the remaining
    columns are summed. The sums are aligned on column name (missing values
    become 0) and ordered by their total over all frames (descending unless
    ``ascending=True``). ``percent=True`` passes ``histnorm='percent'`` to
    plotly; ``width`` is the figure width handed to plotly. The figure is
    displayed; nothing is returned.
    """
    requested = list(DFS) if keys is None else keys
    dfs_for_histogram = dict()
    for key in requested:
        if key in DFS:
            matching = [key]
        else:
            matching = [name for name in DFS if name.startswith(key)]
        for name in matching:
            column_sums = remove_filename_column(name).sum(axis=0)
            dfs_for_histogram[name] = pd.DataFrame(column_sums, columns=[name])

    sums = pd.concat(dfs_for_histogram.values(), join='outer', axis=1).fillna(0).astype(int)
    # A temporary "sum" column provides the sort key and is removed again.
    sums = (
        sums.assign(sum=sums.sum(axis=1))
        .sort_values(by=['sum'], ascending=ascending)
        .drop("sum", axis=1)
    )

    histogram_options = dict(x=sums.index, y=sums.columns, barmode='group', width=width)
    if percent:
        histogram_options['histnorm'] = 'percent'
    fig = px.histogram(sums, **histogram_options)

    # Horizontal legend placed just above the top-left corner of the plot area.
    legend_layout = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0)
    fig.update_layout(legend=legend_layout)
    display(fig)

In [65]:
# If keys == None, all keys will be used
def sum_categories_histogram(
    keys: list[str]|None=None,
    save_dfs: bool=False,
    percent: bool=False,
    ascending: bool=False,
    width: int=30000
):
    if keys is None:
        keys = list(DFS)
    cat_keys = []
    for key in keys:
        cat_key = f"{key}_categories"
        cat_keys.append(cat_key)
        add_dataframe(cat_key, divide_into_categories(key))
    sum_histogram(keys=cat_keys, percent=percent, ascending=ascending, width=width)
    if not save_dfs:
        for cat_key in cat_keys:
            remove_dataframe(cat_key)

In [66]:
# If keys == None, all keys will be used
def sum_groups_histogram(
    keys: list[str]|None=None,
    save_dfs: bool=False,
    percent: bool=False,
    ascending: bool=False,
    width: int=30000
):
    if keys is None:
        keys = list(DFS)
    group_keys = []
    for key in keys:
        group_key = f"{key}_groups"
        group_keys.append(group_key)
        add_dataframe(group_key, divide_into_groups(key))
    sum_histogram(keys=group_keys, percent=percent, ascending=ascending, width=width)
    if not save_dfs:
        for group_key in group_keys:
            remove_dataframe(group_key)

### Experiments

In [29]:
# Rows of the manjaro frame in which the 'rex.X' column is non-zero.
DFS['manjaro'].loc[lambda frame: frame['rex.X'] != 0]

Unnamed: 0,filename,endbr64,sub,test,call,add,ret,cs,push,mov,...,vcmpltpd,vcmplepd,vpcmpleuw,vpcmpnleb,vpcmpltb,vpcmpleb,vpmovsdb,vpmovsqb,vcvtsi2sdq,vfnmaddss
833,/usr/lib/libgcrypt.so,980,2875,5248,8392,12465,1439,307,3634,52170,...,0,0,0,0,0,0,0,0,0,0
849,/usr/lib/libgcrypt.so.20.4.1,980,2875,5248,8392,12465,1439,307,3634,52170,...,0,0,0,0,0,0,0,0,0,0
850,/usr/lib/libgnutls.so.30.34.3,2200,4152,11691,19461,7623,2732,848,6985,88001,...,0,0,0,0,0,0,0,0,0,0
942,/usr/lib/libcrypto.so,9127,11322,28552,9243,22427,10704,1933,22548,171721,...,0,0,0,0,0,0,0,0,0,0
2449,/usr/lib/chromium/chrome_crashpad_handler,6,3467,7451,8242,7807,2302,63,10791,66726,...,0,0,0,0,0,0,0,0,0,0
2450,/usr/lib/chromium/chromium,7,658584,1696574,1906313,1411518,366475,992,1847830,14425089,...,0,0,0,0,0,0,0,0,0,0
3069,/usr/bin/chromedriver,6,32367,84985,92257,72691,24310,228,114895,800113,...,0,0,0,0,0,0,0,0,0,0
3217,/usr/lib/libcrypto.so.3,9127,11322,28552,9243,22427,10704,1933,22548,171721,...,0,0,0,0,0,0,0,0,0,0
3254,/usr/lib/libgnutls.so,2200,4152,11691,19461,7623,2732,848,6985,88001,...,0,0,0,0,0,0,0,0,0,0
3309,/usr/lib/libgnutls.so.30,2200,4152,11691,19461,7623,2732,848,6985,88001,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# "m" is a key prefix: it resolves to the first registered key starting with "m".
add_dataframe(name="man_cat", dataframe=divide_into_categories(key="m"))

In [62]:
# head() is called with its defaults, i.e. the first 5 rows and no display.
tmp = head("man_cat")

In [63]:
# Re-select the rows of tmp by their distinct filename labels and rebuild a
# plain 0..n-1 index. Index.unique() keeps first-occurrence order, so the row
# order is the same on every run; going through set() made it depend on
# string hashing, which differs between kernel sessions.
tmp = tmp.set_index("filename")
tmp = tmp.loc[tmp.index.unique()]
tmp.insert(0, "filename", tmp.index)
tmp = tmp.reset_index(drop=True)

In [65]:
# Selecting with an empty label list yields only the column header, no rows (see the output below).
tmp.loc[[]]

Unnamed: 0,filename,Other,Central Processing Unit (CPU) instructions set,Single Instruction Multiple Data (SIMD) instructions set,Transactional Synchronization Extensions (TSX) instructions set,Floating-Point Unit (FPU) instructions set,Memory Protection Extensions (MPX) instructions set,Advanced Encryption Standard (AES) instructions set,Virtual Machine Extensions (VMX) instructions set


In [43]:
# Distinct filenames of the categorised frame. "headlist" is not defined
# anywhere in the notebook and raises NameError on a fresh kernel; list()
# produces the list shown in the recorded output.
list(set(DFS['man_cat'].filename))

['/usr/lib/xtables/libipt_realm.so',
 '/usr/bin/showwal',
 '/usr/lib/libevent_extra-2.1.so.7.0.1',
 '/usr/lib/libgdbm.so',
 '/usr/lib/libavahi-common.so.3',
 '/usr/lib/libQt5QuickWidgets.so.5',
 '/usr/lib/libbd_swap.so.2',
 '/usr/lib/libpamc.so.0.82.1',
 '/usr/lib/libgstallocators-1.0.so',
 '/usr/lib/bellagio/libomxaudio_effects.so.0',
 '/usr/bin/xmlcatalog',
 '/usr/bin/auditd',
 '/usr/lib/libts.so.0.10.4',
 '/usr/lib/libtheoraenc.so.1.1.2',
 '/usr/lib/qt/plugins/egldeviceintegrations/libqeglfs-kms-egldevice-integration.so',
 '/usr/bin/kglobalaccel5',
 '/usr/lib/libip6tc.so',
 '/usr/lib/libctf-nobfd.so.0.0.0',
 '/usr/lib/qt/qml/org/kde/sonnet/libsonnetquickplugin.so',
 '/usr/lib/gconv/ISO_2033.so',
 '/usr/lib/vlc/plugins/access/librtp_plugin.so',
 '/usr/bin/amrnb-enc',
 '/usr/lib/libbytesize.so.1.0.0',
 '/usr/lib/libuuid.so',
 '/DataCollection/venv/lib/python3.10/site-packages/pandas/_libs/tslibs/nattype.cpython-310-x86_64-linux-gnu.so',
 '/usr/lib/libxcb-xtest.so.0',
 '/usr/lib/libini

In [38]:
# Works on category columns too: lists the files whose AES category sum is non-zero.
find_instruction(
    instruction="Advanced Encryption Standard (AES) instructions set",
    keys=['man_cat'],
    show=True,
);

man_cat:
	/usr/lib/libgcrypt.so
	/usr/lib/libgcrypt.so.20.4.1
	/usr/lib/libgnutls.so.30.34.3
	/usr/lib/libcrypto.so
	/usr/lib/chromium/chromium
	/usr/bin/chromedriver
	/usr/lib/libcrypto.so.3
	/usr/lib/libgnutls.so
	/usr/lib/libgnutls.so.30
	/usr/lib/libgcrypt.so.20
	/usr/lib/libfreeblpriv3.so



In [40]:
# Registers, under the key "tmp", the rows of DFS['ubu_cat'] whose filename equals the libcrypto.so.3 path.
# NOTE(review): no visible cell creates DFS['ubu_cat'] — presumably it was added in an earlier session; confirm.
add_dataframe("tmp", DFS['ubu_cat'][DFS['ubu_cat']['filename'] == '/usr/lib/x86_64-linux-gnu/libcrypto.so.3'])

In [44]:
# Raises KeyError (see the output below). divide_into_categories()/divide_into_groups()
# query INSTRUCTIONS_INFO with column.upper(), never with a lower-case name.
# NOTE(review): presumably the keys are upper-case mnemonics — confirm against the JSON file.
INSTRUCTIONS_INFO["je"]

KeyError: 'je'

In [68]:
# keys is left at its default (None), so every frame currently registered in DFS is categorised and plotted.
# NOTE(review): that includes helper frames added by the cells above (e.g. "man_cat", "tmp") if they are still registered.
sum_categories_histogram(percent=True, width=1000)

In [73]:
# Raises KeyError (see the output below): "repz" is not a key of INSTRUCTIONS_INFO.
INSTRUCTIONS_INFO["repz"]

KeyError: 'repz'

In [25]:
# keys is left at its default (None), so every frame currently registered in DFS is grouped and plotted.
sum_groups_histogram(percent=True, width=2000)

In [26]:
# Looks up "AAD" in INSTRUCTION_PAGES and displays a Javascript object that opens its page via window.open.
what_is_instruction("aad")

<IPython.core.display.Javascript object>