### Import necessary modules

In [42]:
import os
import zipfile
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display
from IPython.display import Javascript

In [43]:
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True) 

### Important constants (expected from user)

In [44]:
# Directory that is scanned for the zip archives to load.
DATA_DIR = "firefox_data"

### Unpack archives and get tables

In [45]:
# Directory the archives are extracted into: DATA_DIR with a "_files" suffix.
FILES_DIR = f"{DATA_DIR}_files"

In [46]:
# Maps a table key (the archive file name up to its first underscore) to the
# DataFrame read from the CSV extracted from that archive.
DFS = dict()
for archive in os.listdir(DATA_DIR):
    path = os.path.join(DATA_DIR, archive)
    # Skip anything that is not a zip archive (sub-directories, hidden files,
    # editor/notebook leftovers) instead of crashing on it.
    if not zipfile.is_zipfile(path):
        continue
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall(FILES_DIR)
    # The CSV is looked up under the archive's own name with the extension
    # replaced by ".csv".
    csv_name = f"{os.path.splitext(archive)[0]}.csv"
    DFS[archive.split('_')[0]] = pd.read_csv(os.path.join(FILES_DIR, csv_name))

### Prepare paths to instruction pages

In [47]:
# Maps the text of each link in the x86doc index page to that link's href
# with its first character removed.
INSTRUCTION_PAGES = dict()
with open("x86doc/index.html") as file:
    # Name the parser explicitly: without it bs4 emits GuessedAtParserWarning
    # and the parse result depends on which parser happens to be installed.
    index_text = BeautifulSoup(file, "html.parser")
# The first anchor on the page is skipped
# (presumably it is not an instruction link; confirm against the index page).
entries = index_text.find_all("a")[1:]
for entry in entries:
    INSTRUCTION_PAGES[entry.get_text()] = entry["href"][1:]

### Define utility functions

Comments:
* All functions match tables by key prefix, so only the beginning of a key needs to be given
* The key is usually the OS name

In [48]:
def head(
    key: str,
    number_of_rows: int=5,
    show: bool=False
):
    """Return the first rows of the first table whose key starts with `key`.

    Args:
        key: Prefix of a key in DFS.
        number_of_rows: How many leading rows to take.
        show: When true, print the matched key and display the rows.

    Returns:
        The leading rows as a DataFrame, or None when no key matches.
    """
    matched = next((name for name in DFS if name.startswith(key)), None)
    if matched is None:
        return None
    preview = DFS[matched].head(number_of_rows)
    if show:
        print(f"{matched}:")
        display(preview)
    return preview

In [49]:
# If keys == None, all keys will be used
def find_instruction(
    instruction: str,
    keys: None|list[str]=None,
    only_filenames: bool=True,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    """Find the rows in which `instruction` has a non-zero value.

    Args:
        instruction: Column name to look for.
        keys: Key prefixes selecting the tables to search; None means every
            table in DFS.
        only_filenames: When true, keep only the list of `filename` values of
            the matching rows instead of the rows themselves.
        show: When true, print (or display) the result for every table.
        number_of_rows_to_show: Row limit used when displaying whole rows.

    Returns:
        A dict keyed by table key. Each value is a list of filenames, a
        filtered DataFrame, or None when the table has no such column.
    """
    prefixes = list(DFS) if keys is None else keys
    tables = {}
    for prefix in prefixes:
        for name, frame in DFS.items():
            if not name.startswith(prefix):
                continue
            try:
                hits = frame[frame[instruction] != 0]
                if only_filenames:
                    hits = list(hits.filename)
            except KeyError:
                # The table has no column for this instruction.
                hits = None
            tables[name] = hits

    if show:
        for name, hits in tables.items():
            print(f"{name}:")
            if hits is None:
                continue
            if only_filenames:
                for filename in hits:
                    print(f"\t{filename}")
            else:
                display(hits.head(number_of_rows_to_show))
            print()

    return tables

In [64]:
def what_is_instruction(
    instruction: str
):
    """Open the x86doc page of `instruction` via a JavaScript `window.open`.

    The instruction name is upper-cased before it is looked up in
    INSTRUCTION_PAGES; an unknown name raises KeyError.
    """
    page = INSTRUCTION_PAGES[instruction.upper()]
    url = "x86doc" + page
    script = f'window.open("{url}");'
    display(Javascript(script))

In [51]:
# Use this function, for example, to leave only files existing in a particular directory
def files_starting_with(
    key: str,
    beginning: str,
    show: bool=False
):
    """Return the rows whose filename starts with `beginning`.

    Only the first table whose key starts with `key` is used.

    Args:
        key: Prefix of a key in DFS.
        beginning: Required start of the `filename` value.
        show: When true, display the resulting table.

    Returns:
        A new DataFrame with `filename` as the first column and a fresh
        0..n-1 index, or None when no key matches.
    """
    for df_key in DFS:
        if df_key.startswith(key):
            table = DFS[df_key]
            # Select by a positional boolean mask rather than by filename
            # labels: a label lookup returns every row carrying a label once
            # per occurrence, so repeated filenames would multiply rows.
            mask = [name.startswith(beginning) for name in table["filename"]]
            other_columns = [c for c in table.columns if c != "filename"]
            df = table.loc[mask, ["filename", *other_columns]]
            df = df.reset_index(drop=True)
            if show:
                display(df)
            return df

In [52]:
# Use this function, for example, to keep only the rows related to a particular executable file
def files_containing(
    key: str,
    substring: str,
    show: bool=False
):
    """Return the rows whose filename contains `substring`.

    Only the first table whose key starts with `key` is used.

    Args:
        key: Prefix of a key in DFS.
        substring: Text that must occur in the `filename` value.
        show: When true, display the resulting table.

    Returns:
        A new DataFrame with `filename` as the first column and a fresh
        0..n-1 index, or None when no key matches.
    """
    for df_key in DFS:
        if df_key.startswith(key):
            table = DFS[df_key]
            # Select by a positional boolean mask rather than by filename
            # labels: a label lookup returns every row carrying a label once
            # per occurrence, so repeated filenames would multiply rows.
            mask = [substring in name for name in table["filename"]]
            other_columns = [c for c in table.columns if c != "filename"]
            df = table.loc[mask, ["filename", *other_columns]]
            df = df.reset_index(drop=True)
            if show:
                display(df)
            return df

In [53]:
def total_instruction_usage(
    key: str,
    to_dict: bool=False,
    show: bool=False,
):
    """Sum every non-filename column of the first table matching `key`.

    Args:
        key: Prefix of a key in DFS.
        to_dict: When true, return a plain dict instead of a Series.
        show: When true, print one "name: total" line per column.

    Returns:
        The column totals as a Series (or dict), or None when no key matches.
    """
    matched = next((name for name in DFS if name.startswith(key)), None)
    if matched is None:
        return None
    total = remove_filename_column(matched).sum()
    total_dict = dict(total)
    if show:
        for instruction, count in total_dict.items():
            print(f"{instruction}: {count}")
    return total_dict if to_dict else total

In [54]:
def add_dataframe(
    name: str,
    dataframe: pd.DataFrame
):
    """Register `dataframe` in DFS under `name`, replacing any existing entry."""
    DFS.update({name: dataframe})

In [55]:
def remove_dataframe(
    key: str
):
    """Remove from DFS the first table whose key starts with `key`.

    Does nothing when no key matches.
    """
    matched = next((name for name in DFS if name.startswith(key)), None)
    if matched is not None:
        del DFS[matched]

In [56]:
# NOTE(review): this repeats the add_dataframe definition from an earlier cell;
# running it simply rebinds the name to an equivalent function.
def add_dataframe(
    name: str,
    dataframe: pd.DataFrame
):
    """Store `dataframe` in DFS under the key `name`, overwriting any previous table."""
    DFS.update([(name, dataframe)])

In [57]:
def remove_filename_column(
    key: str,
    show: bool=False
):
    """Return a copy of the first table matching `key` without its `filename` column.

    Args:
        key: Prefix of a key in DFS.
        show: When true, display the resulting table.

    Returns:
        The reduced DataFrame, or None when no key matches. The table stored
        in DFS is left unchanged.
    """
    matched = next((name for name in DFS if name.startswith(key)), None)
    if matched is None:
        return None
    stripped = DFS[matched].drop(columns="filename")
    if show:
        display(stripped)
    return stripped

In [58]:
# If keys == None, all keys will be used
def sum_percent_histogram(
    keys=None,
    ascending=False,
    width=30000
):
    """Display a grouped histogram of per-column totals using histnorm='percent'.

    One trace is drawn per selected table. Columns are ordered along the x
    axis by their total summed over all selected tables.

    Args:
        keys: Key prefixes selecting the tables; None means every table in DFS.
        ascending: Sort order of the columns by their combined total.
        width: Forwarded to `px.histogram` as the figure width.

    Raises:
        ValueError: When no table key starts with any of `keys`.
    """
    if keys is None:
        keys = list(DFS)
    dfs_for_histogram = dict()
    for key in keys:
        for df_key in DFS:
            if df_key.startswith(key):
                # Read the table under its exact key. Resolving df_key by
                # prefix again could pick an earlier table whose key merely
                # starts with df_key and sum the wrong data under this label.
                df = DFS[df_key].drop("filename", axis=1)
                dfs_for_histogram[df_key] = pd.DataFrame(df.sum(axis=0), columns=[df_key])
    if not dfs_for_histogram:
        raise ValueError(f"no table key starts with any of {list(keys)!r}")
    # The outer join keeps columns present in only some tables; their missing
    # totals count as 0.
    sums = pd.concat(dfs_for_histogram.values(), join='outer', axis=1).fillna(0).astype(int)
    # Temporary helper column used only to order the rows.
    sums['sum'] = sums.sum(axis=1)
    sums.sort_values(by=['sum'], ascending=ascending, inplace=True)
    sums.drop("sum", axis=1, inplace=True)
    fig = px.histogram(sums, x=sums.index, y=sums.columns, barmode='group', histnorm='percent', width=width)
    # Horizontal legend placed just above the plot area, aligned to the left.
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="left",
        x=0
    ))
    display(fig)

In [59]:
# If keys == None, all keys will be used
def sum_histogram(
    keys=None,
    ascending=False,
    width=30000
):
    """Display a grouped histogram of per-column totals for the selected tables.

    One trace is drawn per selected table. Columns are ordered along the x
    axis by their total summed over all selected tables.

    Args:
        keys: Key prefixes selecting the tables; None means every table in DFS.
        ascending: Sort order of the columns by their combined total.
        width: Forwarded to `px.histogram` as the figure width.

    Raises:
        ValueError: When no table key starts with any of `keys`.
    """
    if keys is None:
        keys = list(DFS)
    dfs_for_histogram = dict()
    for key in keys:
        for df_key in DFS:
            if df_key.startswith(key):
                # Read the table under its exact key. Resolving df_key by
                # prefix again could pick an earlier table whose key merely
                # starts with df_key and sum the wrong data under this label.
                df = DFS[df_key].drop("filename", axis=1)
                dfs_for_histogram[df_key] = pd.DataFrame(df.sum(axis=0), columns=[df_key])
    if not dfs_for_histogram:
        raise ValueError(f"no table key starts with any of {list(keys)!r}")
    # The outer join keeps columns present in only some tables; their missing
    # totals count as 0.
    sums = pd.concat(dfs_for_histogram.values(), join='outer', axis=1).fillna(0).astype(int)
    # Temporary helper column used only to order the rows.
    sums['sum'] = sums.sum(axis=1)
    sums.sort_values(by=['sum'], ascending=ascending, inplace=True)
    sums.drop("sum", axis=1, inplace=True)
    fig = px.histogram(sums, x=sums.index, y=sums.columns, barmode='group', width=width)
    # Horizontal legend placed just above the plot area, aligned to the left.
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="left",
        x=0
    ))
    display(fig)

### Experiments