In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Convert FunctionSimSearch results                                         #
#                                                                            #
##############################################################################

In [2]:
import numpy as np
import os
import pandas as pd

from sklearn import metrics
from tqdm import tqdm

In [3]:
def hamming_dist(n1, n2):
    x = n1 ^ n2
    setBits = 0

    while (x > 0):
        setBits += x & 1
        x >>= 1

    return setBits


def hamming_similarity_s(t1, t2):
    diff = hamming_dist(t1[0], t2[0]) + hamming_dist(t1[1], t2[1])
    return 1 - (diff / 128.0)


def compute_fss_similarity(df_input):
    scores = list()
    for idx, row in tqdm(df_input.iterrows()):
        score = hamming_similarity_s(tuple(row[['hashes0_1', 'hashes1_1']].values),
                                     tuple(row[['hashes0_2', 'hashes1_2']].values))
        scores.append(score)
    return scores

In [4]:
def compute_fuzzy_similarity(df_pairs, df_fss, is_pos):
    df_pairs = df_pairs.merge(df_fss,
                              how='left',
                              left_on=['idb_path_1', 'fva_1'],
                              right_on=['path', 'address'])
    del df_pairs['path']
    del df_pairs['address']
    del df_pairs['num_nodes']
    del df_pairs['branching_nodes']
    df_pairs.rename(columns={'hashes0': 'hashes0_1',
                             'hashes1': 'hashes1_1'
                             }, inplace=True)
    df_pairs.rename(columns={'time': 'fss_time_1'}, inplace=True)

    df_pairs = df_pairs.merge(df_fss,
                              how='left',
                              left_on=['idb_path_2', 'fva_2'],
                              right_on=['path', 'address'])
    del df_pairs['path']
    del df_pairs['address']
    del df_pairs['num_nodes']
    del df_pairs['branching_nodes']
    df_pairs.rename(columns={'hashes0': 'hashes0_2',
                             'hashes1': 'hashes1_2'
                             }, inplace=True)
    df_pairs.rename(columns={'time': 'fss_time_2'}, inplace=True)

    df_pairs['sim'] = compute_fss_similarity(df_pairs)

    del df_pairs['hashes0_1']
    del df_pairs['hashes1_1']
    del df_pairs['hashes0_2']
    del df_pairs['hashes1_2']
    return df_pairs

### Process Dataset-1 results

In [5]:
DB1_PATH = "../../DBs/Dataset-1/pairs/testing/"
FSS_PATH = "../data/raw_results/FunctionSimSearch/Dataset-1"

for csv_name in [x for x in os.listdir(FSS_PATH) if x.endswith(".csv")]:
    csv_path = os.path.join(FSS_PATH, csv_name)
    print("[D] Processing {}".format(csv_name))

    df_pos = pd.read_csv(os.path.join(DB1_PATH, "pos_testing_Dataset-1.csv"), index_col=0)
    df_neg = pd.read_csv(os.path.join(DB1_PATH, "neg_testing_Dataset-1.csv"), index_col=0)
    df_pos_rank = pd.read_csv(os.path.join(DB1_PATH, "pos_rank_testing_Dataset-1.csv"), index_col=0)
    df_neg_rank = pd.read_csv(os.path.join(DB1_PATH, "neg_rank_testing_Dataset-1.csv"), index_col=0)

    df_fss = pd.read_csv(csv_path)
    df_fss.drop(df_fss[df_fss['branching_nodes'] == 'branching_nodes'].index, inplace=True)
    df_fss.reset_index(inplace=True, drop=True)
    df_fss = df_fss.astype({'hashes0': np.uint64, 'hashes1': np.uint64})

    df_pos = compute_fuzzy_similarity(df_pos, df_fss, is_pos=True)
    df_neg = compute_fuzzy_similarity(df_neg, df_fss, is_pos=False)
    df_pos_rank = compute_fuzzy_similarity(df_pos_rank, df_fss, is_pos=True)
    df_neg_rank = compute_fuzzy_similarity(df_neg_rank, df_fss, is_pos=False)

    df_pos.to_csv("../data/Dataset-1/pos_testing_Dataset-1_{}".format(csv_name), index=False)
    df_neg.to_csv("../data/Dataset-1/neg_testing_Dataset-1_{}".format(csv_name), index=False)
    df_pos_rank.to_csv("../data/Dataset-1/pos_rank_testing_Dataset-1_{}".format(csv_name), index=False)
    df_neg_rank.to_csv("../data/Dataset-1/neg_rank_testing_Dataset-1_{}".format(csv_name), index=False)

[D] Processing IMM:0.00_MNEM:0.00_GRAPH:1.00.csv


450000it [01:50, 4054.85it/s]
450000it [01:51, 4029.89it/s]
800it [00:00, 4107.45it/s]
80000it [00:19, 4097.15it/s]


[D] Processing IMM:0.00_MNEM:1.00_GRAPH:1.00.csv


450000it [01:49, 4105.81it/s]
450000it [01:49, 4093.81it/s]
800it [00:00, 4116.73it/s]
80000it [00:19, 4095.60it/s]


[D] Processing IMM:4.00_MNEM:0.05_GRAPH:1.00.csv


450000it [01:50, 4070.05it/s]
450000it [01:49, 4092.46it/s]
800it [00:00, 4059.93it/s]
80000it [00:19, 4086.01it/s]


[D] Processing IMM:1.00_MNEM:1.00_GRAPH:1.00.csv


450000it [01:50, 4082.72it/s]
450000it [01:51, 4030.76it/s]
800it [00:00, 3937.81it/s]
80000it [00:19, 4042.33it/s]


### Process Dataset-2 results

In [5]:
DB2_PATH = "../../DBs/Dataset-2/pairs/"
FSS_PATH = "../data/raw_results/FunctionSimSearch/Dataset-2"

for csv_name in [x for x in os.listdir(FSS_PATH) if x.endswith(".csv")]:
    csv_path = os.path.join(FSS_PATH, csv_name)
    print("[D] Processing {}".format(csv_name))

    df_pos = pd.read_csv(os.path.join(DB2_PATH, "pos_testing_Dataset-2.csv"), index_col=0)
    df_neg = pd.read_csv(os.path.join(DB2_PATH, "neg_testing_Dataset-2.csv"), index_col=0)
    df_pos_rank = pd.read_csv(os.path.join(DB2_PATH, "pos_rank_testing_Dataset-2.csv"), index_col=0)
    df_neg_rank = pd.read_csv(os.path.join(DB2_PATH, "neg_rank_testing_Dataset-2.csv"), index_col=0)

    df_fss = pd.read_csv(csv_path)
    df_fss.drop(df_fss[df_fss['branching_nodes'] == 'branching_nodes'].index, inplace=True)
    df_fss.reset_index(inplace=True, drop=True)
    df_fss = df_fss.astype({'hashes0': np.uint64, 'hashes1': np.uint64})

    df_pos = compute_fuzzy_similarity(df_pos, df_fss, is_pos=True)
    df_neg = compute_fuzzy_similarity(df_neg, df_fss, is_pos=False)
    df_pos_rank = compute_fuzzy_similarity(df_pos_rank, df_fss, is_pos=True)
    df_neg_rank = compute_fuzzy_similarity(df_neg_rank, df_fss, is_pos=False)

    df_pos.to_csv("../data/Dataset-2/pos_testing_Dataset-2_{}".format(csv_name), index=False)
    df_neg.to_csv("../data/Dataset-2/neg_testing_Dataset-2_{}".format(csv_name), index=False)
    df_pos_rank.to_csv("../data/Dataset-2/pos_rank_testing_Dataset-2_{}".format(csv_name), index=False)
    df_neg_rank.to_csv("../data/Dataset-2/neg_rank_testing_Dataset-2_{}".format(csv_name), index=False)

[D] Processing IMM:0.00_MNEM:0.00_GRAPH:1.00.csv


150000it [00:38, 3933.27it/s]
150000it [00:37, 4043.75it/s]
600it [00:00, 4009.92it/s]
60000it [00:15, 3940.88it/s]


[D] Processing IMM:0.00_MNEM:1.00_GRAPH:1.00.csv


150000it [00:37, 3976.29it/s]
150000it [00:37, 4032.24it/s]
600it [00:00, 4045.34it/s]
60000it [00:14, 4053.11it/s]


[D] Processing IMM:4.00_MNEM:0.05_GRAPH:1.00.csv


150000it [00:37, 4033.75it/s]
150000it [00:37, 3986.01it/s]
600it [00:00, 3672.56it/s]
60000it [00:15, 3959.20it/s]


[D] Processing IMM:1.00_MNEM:1.00_GRAPH:1.00.csv


150000it [00:37, 4020.47it/s]
150000it [00:37, 4004.49it/s]
600it [00:00, 3784.51it/s]
60000it [00:15, 3986.59it/s]


### Process Dataset-Vulnerability results

In [6]:
!ls "../../DBs/Dataset-Vulnerability/pairs/"

pairs_testing_Dataset-Vulnerability.csv


In [7]:
DB2_PATH = "../../DBs/Dataset-Vulnerability/pairs/"
FSS_PATH = "../data/raw_results/FunctionSimSearch/Dataset-Vulnerability/"

for csv_name in [x for x in os.listdir(FSS_PATH) if x.endswith(".csv")]:
    csv_path = os.path.join(FSS_PATH, csv_name)
    print("[D] Processing {}".format(csv_name))

    df_testing = pd.read_csv(os.path.join(DB2_PATH, "pairs_testing_Dataset-Vulnerability.csv"), index_col=0)

    df_fss = pd.read_csv(csv_path)
    df_fss.drop(df_fss[df_fss['branching_nodes'] == 'branching_nodes'].index, inplace=True)
    df_fss.reset_index(inplace=True, drop=True)
    df_fss = df_fss.astype({'hashes0': np.uint64, 'hashes1': np.uint64})

    df_testing = compute_fuzzy_similarity(df_testing, df_fss, is_pos=True)

    df_testing.to_csv("../data/Dataset-Vulnerability/testing_Dataset-Vulnerability_{}".format(csv_name), index=False)

[D] Processing IMM:0.00_MNEM:0.00_GRAPH:1.00.csv


88700it [00:21, 4039.46it/s]


[D] Processing IMM:0.00_MNEM:1.00_GRAPH:1.00.csv


88700it [00:22, 4018.47it/s]


[D] Processing IMM:4.00_MNEM:0.05_GRAPH:1.00.csv


88700it [00:22, 4000.12it/s]


[D] Processing IMM:1.00_MNEM:1.00_GRAPH:1.00.csv


88700it [00:22, 3947.73it/s]
