### Import necessary modules

In [1]:
import os
import json
import zipfile
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display
from IPython.display import Javascript

In [2]:
# Plotly Express is used by the histogram helpers defined further down.
# NOTE(review): init_notebook_mode(connected=True) presumably sets up inline
# plotly rendering with plotly.js loaded from a CDN - confirm against plotly docs.
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True) 

### Important constants (expected from user)

In [3]:
# Directory whose entries are opened as zip archives when the tables are loaded.
DATA_DIR = "11.03"
# JSON file whose "instructions" list provides the category, group and
# description of each instruction.
INSTRUCTIONS_INFO_FILE = "x86-64_instructions.json"

### Unpack archives and get tables

In [4]:
# Directory the archives are extracted into: the data directory name plus "_files".
FILES_DIR = DATA_DIR + "_files"

In [5]:
# Registry of loaded tables, keyed by the part of the archive name before the
# first underscore.
DFS = dict()
# Sorted so that the registry order (and therefore which table a key prefix
# resolves to) does not depend on the arbitrary order of os.listdir.
for archive in sorted(os.listdir(DATA_DIR)):
    path = os.path.join(DATA_DIR, archive)
    # Skip sub-directories and stray non-archive files instead of crashing on them.
    if not zipfile.is_zipfile(path):
        continue
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall(FILES_DIR)
    # The CSV is expected to carry the archive's name with a .csv extension;
    # splitext avoids assuming the extension is exactly three characters long.
    stem = os.path.splitext(archive)[0]
    DFS[archive.split('_')[0]] = pd.read_csv(os.path.join(FILES_DIR, f"{stem}.csv"))

### Prepare instructions data

In [6]:
# Maps the text of each link in the x86doc index to the link target with its
# first character removed.
INSTRUCTION_PAGES = dict()
with open("x86doc/index.html") as file:
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed, so results can vary by machine.
    index_text = BeautifulSoup(file, "html.parser")
# NOTE(review): the first anchor is skipped - presumably it is not an
# instruction link; confirm against the index page.
entries = index_text.find_all("a")[1:]
for entry in entries:
    # NOTE(review): the dropped leading character is presumably the "." of a
    # relative "./..." href; the remainder is later appended to "x86doc".
    INSTRUCTION_PAGES[entry.get_text()] = entry["href"][1:]

In [7]:
# Lookup table: instruction name -> its category, group and description,
# taken from the "instructions" list of the JSON info file.
INSTRUCTIONS_INFO = {}
with open(INSTRUCTIONS_INFO_FILE, "r") as read_file:
    instructions_info = json.load(read_file)["instructions"]
INSTRUCTIONS_INFO.update(
    (
        item["instruction"],
        {
            "category": item["category"],
            "group": item["group"],
            "description": item["description"],
        },
    )
    for item in instructions_info
)

### Define utility functions

Comments:
* Every function accepts either a full key or just the beginning of one (an exact match takes precedence)
* A key is usually the OS name

In [8]:
def add_dataframe(
    name: str,
    dataframe: pd.DataFrame
):
    """Register ``dataframe`` in the global ``DFS`` registry under ``name``.

    An entry already stored under the same name is replaced.
    """
    DFS.update({name: dataframe})

In [9]:
def get_dataframe(
    key: str
):
    """Return the table registered in ``DFS`` for ``key``.

    ``key`` may be a full key or only its beginning. An exact match wins;
    otherwise the first registered key starting with ``key`` is used.
    Returns ``None`` when nothing matches.
    """
    if key in DFS:
        return DFS[key]
    matched = next((name for name in DFS if name.startswith(key)), None)
    return None if matched is None else DFS[matched]

In [10]:
def remove_dataframe(
    key: str
):
    """Delete one table from ``DFS``.

    ``key`` may be a full key or only its beginning. An exact match wins;
    otherwise the first registered key starting with ``key`` is removed.
    Nothing happens when no key matches.
    """
    if key in DFS:
        target = key
    else:
        target = next((name for name in DFS if name.startswith(key)), None)
    if target is not None:
        del DFS[target]

In [11]:
def remove_filename_column(
    key: str,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Return a copy of the table for ``key`` without its "filename" column.

    ``key`` may be a full key or only its beginning (an exact match wins,
    otherwise the first key starting with ``key``). The stored table is left
    untouched. With ``show`` set, the first ``number_of_rows_to_show`` rows of
    the result are displayed. Returns ``None`` when no key matches.
    """
    if key in DFS:
        source = DFS[key]
    else:
        candidates = [name for name in DFS if name.startswith(key)]
        if not candidates:
            return None
        source = DFS[candidates[0]]
    stripped = source.drop(columns="filename")
    if show:
        display(stripped.head(number_of_rows_to_show))
    return stripped

In [12]:
def head(
    key: str,
    number_of_rows: int=5,
    show: bool=False
):
    """Return the first ``number_of_rows`` rows of the table for ``key``.

    ``key`` may be a full key or only its beginning (an exact match wins,
    otherwise the first key starting with ``key``). With ``show`` set, the
    resolved key and the rows are also printed/displayed. Returns ``None``
    when no key matches.
    """
    if key in DFS:
        matched = key
    else:
        matched = next((name for name in DFS if name.startswith(key)), None)
        if matched is None:
            return None
    df_head = DFS[matched].head(number_of_rows)
    if show:
        print(f"{matched}:")
        display(df_head)
    return df_head

In [13]:
# If keys == None, all keys will be used
def find_instruction(
    instruction: str,
    keys: None|list[str]=None,
    only_filenames: bool=True,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    if keys is None:
        keys = list(DFS)
    tables = dict()
    for key in keys:
        for df_key in DFS:
            if df_key.startswith(key):
                if key in DFS and df_key != key:
                    continue
                try:
                    table = DFS[df_key][DFS[df_key][instruction] != 0]
                    if only_filenames:
                        table = list(table.filename)
                    tables[df_key] = table
                except KeyError:
                    tables[df_key] = None
    if show:
        for key in tables:
            print(f"{key}:")
            if tables[key] is None:
                continue
            if only_filenames:
                for filename in tables[key]:
                    print(f"\t{filename}")
            else:
                display(tables[key].head(number_of_rows_to_show))
            print()
            
    return tables

In [14]:
def what_is_instruction(
    instruction: str
):
    """Open the documentation page of ``instruction`` in a new browser window.

    The name is upper-cased and looked up in ``INSTRUCTION_PAGES``. If it is
    unknown, "Instruction is not found." is printed and nothing is opened.
    Only a failed lookup is reported that way; any other error (for example a
    display failure) propagates instead of being mislabelled as "not found".
    """
    page = INSTRUCTION_PAGES.get(instruction.upper())
    if page is None:
        print("Instruction is not found.")
        return
    path = f"x86doc{page}"
    # json.dumps yields a correctly quoted and escaped JavaScript string
    # literal, so a path containing quotes or backslashes cannot break the call.
    display(Javascript(f"window.open({json.dumps(path)});"))

In [15]:
# Use this function, for example, to keep only the files located in a particular directory
def files_starting_with(
    key: str,
    beginning: str,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Return the rows of a table whose file name starts with ``beginning``.

    ``key`` may be a full key or only its beginning (an exact match wins,
    otherwise the first key starting with ``key``). The result is a new table
    with "filename" as its first column and a fresh 0..n-1 index; the stored
    table is not modified. With ``show`` set, the first
    ``number_of_rows_to_show`` rows are displayed. Returns ``None`` when no
    key matches.
    """
    if key in DFS:
        source = DFS[key]
    else:
        source = next((DFS[name] for name in DFS if name.startswith(key)), None)
        if source is None:
            return None
    # Select rows with a positional boolean mask. Looking rows up by label
    # would return every duplicate of a repeated file name once per
    # occurrence, multiplying rows. Non-string names (e.g. NaN) never match.
    keep = pd.Series(
        [isinstance(name, str) and name.startswith(beginning) for name in source["filename"]],
        dtype=bool,
    ).to_numpy()
    df = source.loc[keep].reset_index(drop=True)
    # Keep the documented layout: "filename" first, remaining columns in order.
    df.insert(0, "filename", df.pop("filename"))
    if show:
        display(df.head(number_of_rows_to_show))
    return df

In [16]:
# Use this function, for example, to keep only the rows related to a particular executable file
def files_containing(
    key: str,
    substring: str,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Return the rows of a table whose file name contains ``substring``.

    ``key`` may be a full key or only its beginning (an exact match wins,
    otherwise the first key starting with ``key``). The result is a new table
    with "filename" as its first column and a fresh 0..n-1 index; the stored
    table is not modified. With ``show`` set, the first
    ``number_of_rows_to_show`` rows are displayed. Returns ``None`` when no
    key matches.
    """
    if key in DFS:
        source = DFS[key]
    else:
        source = next((DFS[name] for name in DFS if name.startswith(key)), None)
        if source is None:
            return None
    # Select rows with a positional boolean mask. Looking rows up by label
    # would return every duplicate of a repeated file name once per
    # occurrence, multiplying rows. Non-string names (e.g. NaN) never match.
    keep = pd.Series(
        [isinstance(name, str) and substring in name for name in source["filename"]],
        dtype=bool,
    ).to_numpy()
    df = source.loc[keep].reset_index(drop=True)
    # Keep the documented layout: "filename" first, remaining columns in order.
    df.insert(0, "filename", df.pop("filename"))
    if show:
        display(df.head(number_of_rows_to_show))
    return df

In [17]:
def total_instruction_usage(
    key: str,
    to_dict: bool=False,
    show: bool=False,
):
    """Sum every non-filename column of the table for ``key``.

    ``key`` may be a full key or only its beginning (an exact match wins,
    otherwise the first key starting with ``key``). Returns the column sums
    as a pandas Series, or as a plain dict when ``to_dict`` is set. With
    ``show`` set, one "name: total" line is printed per column. Returns
    ``None`` when no key matches.
    """
    if key in DFS:
        resolved = key
    else:
        resolved = next((name for name in DFS if name.startswith(key)), None)
        if resolved is None:
            return None
    total = remove_filename_column(resolved).sum()
    total_dict = dict(total)
    if show:
        for instruction, count in total_dict.items():
            print(f"{instruction}: {count}")
    return total_dict if to_dict else total

In [18]:
def divide_into_categories(
    key: str,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Collapse the instruction columns of a table into per-category sums.

    ``key`` may be a full key or only its beginning (an exact match wins,
    otherwise the first key starting with ``key``). The category of a column
    is looked up in ``INSTRUCTIONS_INFO`` by its upper-cased name, then by
    that name without its last character; columns found neither way are
    counted under "Other".

    Returns a new table holding "filename" (when the source has it) followed
    by one column per category, in order of first appearance. The stored
    table is not modified. With ``show`` set, the first
    ``number_of_rows_to_show`` rows are displayed. Returns ``None`` when no
    key matches.
    """
    if key in DFS:
        source = DFS[key]
    else:
        source = next((DFS[name] for name in DFS if name.startswith(key)), None)
        if source is None:
            return None
    # Accumulate into a separate mapping rather than adding/dropping columns on
    # the frame itself: a category whose name equals an instruction column can
    # then no longer be mixed up with it, and the frame is built only once.
    totals = {}
    for column in source.columns:
        if column == "filename":
            continue
        column_upper = column.upper()
        if column_upper in INSTRUCTIONS_INFO:
            category = INSTRUCTIONS_INFO[column_upper]["category"]
        elif column_upper[:-1] in INSTRUCTIONS_INFO:
            # Retry without the last character of the column name.
            category = INSTRUCTIONS_INFO[column_upper[:-1]]["category"]
        else:
            category = "Other"
        if category in totals:
            totals[category] = totals[category] + source[column]
        else:
            totals[category] = source[column]
    df = pd.DataFrame(totals, index=source.index)
    if "filename" in source.columns:
        df.insert(0, "filename", source["filename"])
    if show:
        display(df.head(number_of_rows_to_show))
    return df

In [19]:
def divide_into_groups(
    key: str,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Collapse the instruction columns of a table into per-group sums.

    ``key`` may be a full key or only its beginning (an exact match wins,
    otherwise the first key starting with ``key``). The group of a column is
    looked up in ``INSTRUCTIONS_INFO`` by its upper-cased name, then by that
    name without its last character; columns found neither way are counted
    under "Other".

    Returns a new table holding "filename" (when the source has it) followed
    by one column per group, in order of first appearance. The stored table
    is not modified. With ``show`` set, the first ``number_of_rows_to_show``
    rows are displayed. Returns ``None`` when no key matches.
    """
    if key in DFS:
        source = DFS[key]
    else:
        source = next((DFS[name] for name in DFS if name.startswith(key)), None)
        if source is None:
            return None
    # Accumulate into a separate mapping rather than adding/dropping columns on
    # the frame itself: a group whose name equals an instruction column can
    # then no longer be mixed up with it, and the frame is built only once.
    totals = {}
    for column in source.columns:
        if column == "filename":
            continue
        column_upper = column.upper()
        if column_upper in INSTRUCTIONS_INFO:
            group = INSTRUCTIONS_INFO[column_upper]["group"]
        elif column_upper[:-1] in INSTRUCTIONS_INFO:
            # Retry without the last character of the column name.
            group = INSTRUCTIONS_INFO[column_upper[:-1]]["group"]
        else:
            group = "Other"
        if group in totals:
            totals[group] = totals[group] + source[column]
        else:
            totals[group] = source[column]
    df = pd.DataFrame(totals, index=source.index)
    if "filename" in source.columns:
        df.insert(0, "filename", source["filename"])
    if show:
        display(df.head(number_of_rows_to_show))
    return df

In [20]:
# When keys is None, every registered key is used
def sum_histogram(
    keys: list[str]|None=None,
    percent: bool=False,
    ascending: bool=False,
    width: int=30000
):
    """Display a grouped bar chart of per-column totals for the chosen tables.

    Each entry of ``keys`` may be a full key or only its beginning: an exact
    match selects just that table, otherwise every table whose key starts
    with the entry is included. Columns are ordered by their total across all
    selected tables (``ascending`` controls the direction). ``percent``
    switches plotly's ``histnorm`` to ``'percent'``; ``width`` is passed on as
    the figure width.
    """
    selected = list(DFS) if keys is None else keys
    dfs_for_histogram = {}
    for prefix in selected:
        for df_key in DFS:
            if not df_key.startswith(prefix):
                continue
            # An exact key, when registered, shadows longer keys sharing the prefix.
            if prefix in DFS and df_key != prefix:
                continue
            column_totals = remove_filename_column(df_key).sum(axis=0)
            dfs_for_histogram[df_key] = pd.DataFrame(column_totals, columns=[df_key])
    # Align the per-table totals on the union of column names; a column missing
    # from a table counts as 0.
    sums = pd.concat(dfs_for_histogram.values(), join='outer', axis=1).fillna(0).astype(int)
    # A temporary "sum" column drives the ordering and is dropped again.
    sums = (
        sums.assign(sum=sums.sum(axis=1))
        .sort_values(by='sum', ascending=ascending)
        .drop(columns="sum")
    )
    fig = px.histogram(
        sums,
        x=sums.index,
        y=sums.columns,
        barmode='group',
        histnorm='percent' if percent else None,
        width=width,
    )
    legend_above_plot = dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="left",
        x=0
    )
    fig.update_layout(legend=legend_above_plot)
    display(fig)

In [21]:
# When keys is None, every registered key is used
def sum_categories_histogram(
    keys: list[str]|None=None,
    save_dfs: bool=False,
    percent: bool=False,
    ascending: bool=False,
    width: int=30000
):
    """Display a histogram of per-category totals for the chosen tables.

    For every entry of ``keys`` a per-category table is built with
    ``divide_into_categories`` and registered in ``DFS`` as
    ``"<key>_categories"``; those tables are then plotted with
    ``sum_histogram``. Unless ``save_dfs`` is set, the temporary tables are
    removed again, even when plotting fails.

    Raises:
        KeyError: if an entry of ``keys`` matches no registered table.
    """
    if keys is None:
        keys = list(DFS)
    cat_keys = []
    try:
        for key in keys:
            categorized = divide_into_categories(key)
            if categorized is None:
                # Fail here with a clear message instead of registering None
                # and crashing later inside the plotting code.
                raise KeyError(f"No dataframe matches key {key!r}")
            cat_key = f"{key}_categories"
            add_dataframe(cat_key, categorized)
            cat_keys.append(cat_key)
        sum_histogram(keys=cat_keys, percent=percent, ascending=ascending, width=width)
    finally:
        # Clean up in ``finally`` so a failure above does not leave temporary
        # tables behind in the registry.
        if not save_dfs:
            for cat_key in cat_keys:
                remove_dataframe(cat_key)

In [22]:
# When keys is None, every registered key is used
def sum_groups_histogram(
    keys: list[str]|None=None,
    save_dfs: bool=False,
    percent: bool=False,
    ascending: bool=False,
    width: int=30000
):
    """Display a histogram of per-group totals for the chosen tables.

    For every entry of ``keys`` a per-group table is built with
    ``divide_into_groups`` and registered in ``DFS`` as ``"<key>_groups"``;
    those tables are then plotted with ``sum_histogram``. Unless ``save_dfs``
    is set, the temporary tables are removed again, even when plotting fails.

    Raises:
        KeyError: if an entry of ``keys`` matches no registered table.
    """
    if keys is None:
        keys = list(DFS)
    group_keys = []
    try:
        for key in keys:
            grouped = divide_into_groups(key)
            if grouped is None:
                # Fail here with a clear message instead of registering None
                # and crashing later inside the plotting code.
                raise KeyError(f"No dataframe matches key {key!r}")
            group_key = f"{key}_groups"
            add_dataframe(group_key, grouped)
            group_keys.append(group_key)
        sum_histogram(keys=group_keys, percent=percent, ascending=ascending, width=width)
    finally:
        # Clean up in ``finally`` so a failure above does not leave temporary
        # tables behind in the registry.
        if not save_dfs:
            for group_key in group_keys:
                remove_dataframe(group_key)

### Experiments

In [23]:
# Plot per-category totals for every registered table (keys defaults to all),
# using plotly's percent normalisation and a 1000-wide figure.
sum_categories_histogram(percent=True, width=1000)

In [24]:
# Plot per-group totals for every registered table (keys defaults to all),
# using plotly's percent normalisation and a 2000-wide figure.
sum_groups_histogram(percent=True, width=2000)