### Import necessary modules

In [194]:
import os
import re
import zipfile
from urllib.parse import urljoin

import pandas as pd
import requests
from IPython.display import display
from IPython.display import Javascript

### Important constants (expected from user)

In [195]:
# Directory whose entries are opened as zip archives by the loading cell.
DATA_DIR = "firefox_data"

### Unpack archives and get tables

In [196]:
# Extraction target for the archives: the data directory name plus a "_files" suffix.
FILES_DIR = f"{DATA_DIR}_files"

In [197]:
# Maps a table key (the archive name up to its first "_") to the DataFrame
# read from the CSV file extracted from that archive.
DFS = {}
# Sorted so that the insertion order of DFS -- which decides which table a
# key prefix resolves to first -- does not depend on the filesystem.
for archive in sorted(os.listdir(DATA_DIR)):
    path = os.path.join(DATA_DIR, archive)
    # Skip sub-directories and stray files (e.g. notebook checkpoints)
    # instead of crashing on them.
    if not zipfile.is_zipfile(path):
        continue
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall(FILES_DIR)
    # The CSV is expected to carry the archive's name with the extension
    # swapped for ".csv"; splitext avoids assuming a three-letter extension.
    csv_name = f"{os.path.splitext(archive)[0]}.csv"
    DFS[archive.split('_')[0]] = pd.read_csv(os.path.join(FILES_DIR, csv_name))

### Define utility functions

Comments:
* Every function accepts a key prefix: a table is selected when its key starts with the given string
* A key is usually an OS name (e.g. `ubuntu`)

In [198]:
def head(
    key: str,
    number_of_rows: int=5,
    show: bool=False
):
    """Return the first rows of the first table whose key starts with ``key``.

    Args:
        key: Prefix of a key in ``DFS``; tables are tried in ``DFS`` order.
        number_of_rows: How many leading rows to take.
        show: When true, also render the rows with ``display``.

    Returns:
        The leading rows as a DataFrame, or ``None`` when no key matches.
    """
    matching_key = next((name for name in DFS if name.startswith(key)), None)
    if matching_key is None:
        return None

    preview = DFS[matching_key].head(number_of_rows)
    if show:
        display(preview)
    return preview

In [199]:
# If keys == None, all keys will be used
def find_instruction(
    instruction: str,
    keys: None|list[str]=None,
    only_filenames: bool=True,
    show: bool=False,
    number_of_rows_to_show: int=5
):
    if keys is None:
        keys = list(DFS)
    tables = dict()
    for key in keys:
        for df_key in DFS:
            if df_key.startswith(key):
                try:
                    table = DFS[df_key][DFS[df_key][instruction] != 0]
                    if only_filenames:
                        table = list(table.filename)
                    tables[df_key] = table
                except KeyError:
                    tables[df_key] = None
    if show:
        for key in tables:
            print(f"{key}:")
            if tables[key] is None:
                continue
            if only_filenames:
                for filename in tables[key]:
                    print(f"\t{filename}")
            else:
                display(tables[key].head(number_of_rows_to_show))
            print()
            
    return tables

In [200]:
def what_is_instruction(
    instruction: str
):
    """Open the felixcloutier.com x86 reference page of an instruction.

    The index page is downloaded and searched for a link whose text is the
    upper-cased ``instruction``; the link target is then opened through a
    JavaScript ``window.open`` call rendered in the notebook.

    Args:
        instruction: Mnemonic to look up (case-insensitive).

    Raises:
        ValueError: If the index page contains no link with that text.
        requests.HTTPError: If the index page request returns an error status.
    """
    index_url = 'https://www.felixcloutier.com/x86/'
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(index_url, timeout=10)
    response.raise_for_status()

    # Capture the href of the tag that directly encloses the mnemonic.
    # [^"']* and [^>]* keep the match inside that single tag.
    link = re.search(
        r'''href=["']([^"']*)["'][^>]*>''' + re.escape(instruction.upper()) + '<',
        response.text
    )
    if link is None:
        raise ValueError(f"Instruction {instruction!r} was not found on {index_url}")

    # urljoin resolves relative ("./ret") and site-absolute ("/x86/ret")
    # link targets alike.
    url = urljoin(index_url, link.group(1))
    display(Javascript('window.open("{url}");'.format(url=url)))

In [250]:
# Use this function, for example, to leave only files existing in a particular directory
def files_starting_with(
    key: str,
    beginning: str,
    show: bool=False
):
    """Return the rows of a table whose ``filename`` starts with ``beginning``.

    The first table in ``DFS`` whose key starts with ``key`` is used. The
    stored table is left untouched; the result is a new DataFrame with
    ``filename`` as its first column and a fresh 0..n-1 index.

    Args:
        key: Prefix of a key in ``DFS``.
        beginning: Prefix the ``filename`` values must start with.
        show: When true, also render the result with ``display``.

    Returns:
        The filtered DataFrame, or ``None`` when no key matches.
    """
    for df_key in DFS:
        if not df_key.startswith(key):
            continue
        source = DFS[df_key]
        # A boolean mask keeps each matching row exactly once, even when a
        # file name is repeated; na=False drops rows without a file name.
        mask = source["filename"].str.startswith(beginning, na=False)
        columns = ["filename"] + [c for c in source.columns if c != "filename"]
        df = source.loc[mask, columns].reset_index(drop=True)
        if show:
            display(df)
        return df

### Experiments

In [251]:
# Rows of the first table whose key starts with 'o', restricted to file names under /usr/bin.
files_starting_with('o', '/usr/bin')

Unnamed: 0,filename,endbr64,sub,mov,test,je,call,add,ret,push,...,vfnmsub132pd,vmovntdqa,vfnmsub132ps,vmovntps,vmovntpd,vcvtpd2dqy,pmaxsb,pminsb,cvtpd2dq,movntpd
0,/usr/bin/users,6,32,801,104,128,160,79,21,127,...,0,0,0,0,0,0,0,0,0,0
1,/usr/bin/csplit,6,93,1525,204,207,404,125,44,197,...,0,0,0,0,0,0,0,0,0,0
2,/usr/bin/cksum,6,148,7066,245,285,428,2645,57,284,...,0,0,0,0,0,0,0,0,0,0
3,/usr/bin/nl,6,45,1068,132,149,226,87,27,133,...,0,0,0,0,0,0,0,0,0,0
4,/usr/bin/ptx,6,184,2385,347,430,515,257,49,209,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,/usr/bin/newuidmap,6,31,804,157,152,307,40,28,134,...,0,0,0,0,0,0,0,0,0,0
280,/usr/bin/gtk-update-icon-cache,5,46,968,183,156,354,70,31,157,...,0,0,0,0,0,0,0,0,0,0
281,/usr/bin/ar,5,77,1851,249,247,583,110,39,248,...,0,0,0,0,0,0,0,0,0,0
282,/usr/bin/size,5,26,529,60,78,196,46,19,98,...,0,0,0,0,0,0,0,0,0,0


In [233]:
# Print, for every table, the files with a non-zero 'aad' count;
# the trailing ';' suppresses the repr of the returned dict.
find_instruction('aad', show=True);

manjaro:
	/DataCollection/venv/lib/python3.10/site-packages/pip/_vendor/distlib/t32.exe

opensuse:
	/usr/lib/python3.10/site-packages/pip/_vendor/distlib/t32.exe
	/DataCollection/venv/lib/python3.10/site-packages/pip/_vendor/distlib/t32.exe

ubuntu:


In [129]:
# Numeric column names of the 'manjaro' table ordered by ascending column total.
# numeric_only=True leaves out the string `filename` column, which cannot be
# summed with the counts and makes a plain sum() raise on pandas >= 2.
list(DFS['manjaro'].sum(numeric_only=True).sort_values().index)

['kxorb',
 'aad',
 'mulw',
 'vmaskmovpd',
 'vcmptrueps',
 'pmovsxbd',
 'korb',
 'jb,pt',
 'lsl',
 'rorq',
 'into',
 'lfs',
 'rcrw',
 'loop,pn',
 'cmpsq',
 'jns,pt',
 'kandd',
 'loope,pt',
 'lar',
 'jnp,pt',
 'jrcxz,pt',
 'ftst',
 'rdpmc',
 'bndstx',
 'insw',
 'str',
 'wbinvd',
 'fndisi(8087',
 'popfw',
 'jbe,pt',
 'xrelease',
 'retw',
 'frstpm(287',
 'jb,pn',
 'pushw',
 'loopne,pt',
 'cvtps2pi',
 'xcrypt-ofb',
 'xadd',
 'jo,pt',
 'ud0',
 'jns,pn',
 'movntpd',
 'vbroadcastf32x2',
 'vcvtpd2dqy',
 'fldl2t',
 'fprem1',
 'wrpkru',
 'crc32',
 'vfnmsub132pd',
 'vcmpnge_uqpd',
 'vcmpnge_uqps',
 'hsubps',
 'vfnmsub132ps',
 'vcvtsi2ssq',
 'fclex',
 'vmovntpd',
 'sgdt',
 'rdtscp',
 'ud1',
 'kxorq',
 'leavew',
 'vpcmpnleb',
 'xstore-rng',
 'vsqrtpd',
 'pushfw',
 'fnop',
 'ja,pn',
 'rdpkru',
 'lgs',
 'wrmsr',
 'fldlg2',
 'crc32w',
 'vmwrite',
 'jge,pt',
 'lretq',
 'rdmsr',
 'vmread',
 'crc32l',
 'fxrstor',
 'xsave',
 'xsavec',
 'fxsave',
 'pmaxsb',
 'pminsb',
 'knotb',
 'vpmovw2m',
 'vpmovm2w',
 'l

In [161]:
# Emits a JavaScript window.open call for the reference page of 'ret'.
what_is_instruction('ret')

<IPython.core.display.Javascript object>