In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Convert FunctionSimSearch results                                         #
#                                                                            #
##############################################################################

In [2]:
import numpy as np
import os
import pandas as pd

from sklearn import metrics
from tqdm import tqdm

In [3]:
def hamming_dist(n1, n2):
    x = n1 ^ n2
    setBits = 0

    while (x > 0):
        setBits += x & 1
        x >>= 1

    return setBits


def hamming_similarity_s(t1, t2):
    diff = hamming_dist(t1[0], t2[0]) + hamming_dist(t1[1], t2[1])
    return 1 - (diff / 128.0)


def compute_fss_similarity(df_input):
    scores = list()
    for idx, row in tqdm(df_input.iterrows()):
        score = hamming_similarity_s(tuple(row[['hashes0_1', 'hashes1_1']].values),
                                     tuple(row[['hashes0_2', 'hashes1_2']].values))
        scores.append(score)
    return scores

In [4]:
def compute_fuzzy_similarity(df_pairs, df_fss):
    df_pairs = df_pairs.merge(df_fss,
                              how='left',
                              left_on=['idb_path_1', 'fva_1'],
                              right_on=['path', 'address'])
    df_pairs.rename(columns={'hashes0': 'hashes0_1',
                             'hashes1': 'hashes1_1'
                             }, inplace=True)
    df_pairs.rename(columns={'time': 'fss_time_1'}, inplace=True)

    df_pairs = df_pairs.merge(df_fss,
                              how='left',
                              left_on=['idb_path_2', 'fva_2'],
                              right_on=['path', 'address'])
    df_pairs.rename(columns={'hashes0': 'hashes0_2',
                             'hashes1': 'hashes1_2'
                             }, inplace=True)
    df_pairs.rename(columns={'time': 'fss_time_2'}, inplace=True)

    df_pairs['sim'] = compute_fss_similarity(df_pairs)
    df_pairs = df_pairs[['idb_path_1','fva_1','idb_path_2','fva_2','sim']]
    return df_pairs

### Process Dataset-1 results

In [5]:
DB1_PATH = "../../DBs/Dataset-1/pairs/testing/"
FSS_PATH = "../data/raw_results/FunctionSimSearch/Dataset-1"

for csv_name in [x for x in os.listdir(FSS_PATH) if x.endswith(".csv")]:
    csv_path = os.path.join(FSS_PATH, csv_name)
    print("[D] Processing {}".format(csv_name))

    df_pos = pd.read_csv(os.path.join(DB1_PATH, "pos_testing_Dataset-1.csv"), index_col=0)
    df_neg = pd.read_csv(os.path.join(DB1_PATH, "neg_testing_Dataset-1.csv"), index_col=0)
    df_pos_rank = pd.read_csv(os.path.join(DB1_PATH, "pos_rank_testing_Dataset-1.csv"), index_col=0)
    df_neg_rank = pd.read_csv(os.path.join(DB1_PATH, "neg_rank_testing_Dataset-1.csv"), index_col=0)

    df_fss = pd.read_csv(csv_path)
    df_fss.drop(df_fss[df_fss['branching_nodes'] == 'branching_nodes'].index, inplace=True)
    df_fss.reset_index(inplace=True, drop=True)
    df_fss = df_fss.astype({'hashes0': np.uint64, 'hashes1': np.uint64})

    df_pos = compute_fuzzy_similarity(df_pos, df_fss)
    df_neg = compute_fuzzy_similarity(df_neg, df_fss)
    df_pos_rank = compute_fuzzy_similarity(df_pos_rank, df_fss)
    df_neg_rank = compute_fuzzy_similarity(df_neg_rank, df_fss)

    df_pos.to_csv("../data/Dataset-1/pos_testing_Dataset-1_{}".format(csv_name), index=False)
    df_neg.to_csv("../data/Dataset-1/neg_testing_Dataset-1_{}".format(csv_name), index=False)
    df_pos_rank.to_csv("../data/Dataset-1/pos_rank_testing_Dataset-1_{}".format(csv_name), index=False)
    df_neg_rank.to_csv("../data/Dataset-1/neg_rank_testing_Dataset-1_{}".format(csv_name), index=False)

[D] Processing IMM:0.00_MNEM:0.00_GRAPH:1.00.csv


450000it [01:55, 3911.92it/s]
450000it [01:52, 3999.31it/s]
800it [00:00, 3970.95it/s]
80000it [00:20, 3977.39it/s]


[D] Processing IMM:0.00_MNEM:1.00_GRAPH:1.00.csv


450000it [01:53, 3959.27it/s]
450000it [01:52, 4017.14it/s]
800it [00:00, 4129.07it/s]
80000it [00:19, 4135.21it/s]


[D] Processing IMM:4.00_MNEM:0.05_GRAPH:1.00.csv


450000it [01:49, 4111.88it/s]
450000it [01:53, 3959.48it/s]
800it [00:00, 3920.85it/s]
80000it [00:20, 3909.31it/s]


[D] Processing IMM:1.00_MNEM:1.00_GRAPH:1.00.csv


450000it [01:53, 3961.70it/s]
450000it [01:51, 4030.62it/s]
800it [00:00, 4113.12it/s]
80000it [00:19, 4135.52it/s]


### Process Dataset-2 results

In [6]:
DB2_PATH = "../../DBs/Dataset-2/pairs/"
FSS_PATH = "../data/raw_results/FunctionSimSearch/Dataset-2"

for csv_name in [x for x in os.listdir(FSS_PATH) if x.endswith(".csv")]:
    csv_path = os.path.join(FSS_PATH, csv_name)
    print("[D] Processing {}".format(csv_name))

    df_pos = pd.read_csv(os.path.join(DB2_PATH, "pos_testing_Dataset-2.csv"), index_col=0)
    df_neg = pd.read_csv(os.path.join(DB2_PATH, "neg_testing_Dataset-2.csv"), index_col=0)
    df_pos_rank = pd.read_csv(os.path.join(DB2_PATH, "pos_rank_testing_Dataset-2.csv"), index_col=0)
    df_neg_rank = pd.read_csv(os.path.join(DB2_PATH, "neg_rank_testing_Dataset-2.csv"), index_col=0)

    df_fss = pd.read_csv(csv_path)
    df_fss.drop(df_fss[df_fss['branching_nodes'] == 'branching_nodes'].index, inplace=True)
    df_fss.reset_index(inplace=True, drop=True)
    df_fss = df_fss.astype({'hashes0': np.uint64, 'hashes1': np.uint64})

    df_pos = compute_fuzzy_similarity(df_pos, df_fss)
    df_neg = compute_fuzzy_similarity(df_neg, df_fss)
    df_pos_rank = compute_fuzzy_similarity(df_pos_rank, df_fss)
    df_neg_rank = compute_fuzzy_similarity(df_neg_rank, df_fss)

    df_pos.to_csv("../data/Dataset-2/pos_testing_Dataset-2_{}".format(csv_name), index=False)
    df_neg.to_csv("../data/Dataset-2/neg_testing_Dataset-2_{}".format(csv_name), index=False)
    df_pos_rank.to_csv("../data/Dataset-2/pos_rank_testing_Dataset-2_{}".format(csv_name), index=False)
    df_neg_rank.to_csv("../data/Dataset-2/neg_rank_testing_Dataset-2_{}".format(csv_name), index=False)

[D] Processing IMM:0.00_MNEM:0.00_GRAPH:1.00.csv


150000it [00:36, 4127.75it/s]
150000it [00:36, 4143.63it/s]
600it [00:00, 4129.16it/s]
60000it [00:14, 4143.75it/s]


[D] Processing IMM:0.00_MNEM:1.00_GRAPH:1.00.csv


150000it [00:36, 4109.16it/s]
150000it [00:36, 4125.44it/s]
600it [00:00, 4072.02it/s]
60000it [00:14, 4126.63it/s]


[D] Processing IMM:4.00_MNEM:0.05_GRAPH:1.00.csv


150000it [00:36, 4131.54it/s]
150000it [00:36, 4118.87it/s]
600it [00:00, 4119.01it/s]
60000it [00:14, 4124.77it/s]


[D] Processing IMM:1.00_MNEM:1.00_GRAPH:1.00.csv


150000it [00:36, 4111.05it/s]
150000it [00:36, 4109.43it/s]
600it [00:00, 4101.77it/s]
60000it [00:14, 4120.22it/s]


### Process Dataset-Vulnerability results

In [7]:
!ls "../../DBs/Dataset-Vulnerability/pairs/"

pairs_testing_Dataset-Vulnerability.csv


In [8]:
DB2_PATH = "../../DBs/Dataset-Vulnerability/pairs/"
FSS_PATH = "../data/raw_results/FunctionSimSearch/Dataset-Vulnerability/"

for csv_name in [x for x in os.listdir(FSS_PATH) if x.endswith(".csv")]:
    csv_path = os.path.join(FSS_PATH, csv_name)
    print("[D] Processing {}".format(csv_name))

    df_testing = pd.read_csv(os.path.join(DB2_PATH, "pairs_testing_Dataset-Vulnerability.csv"), index_col=0)

    df_fss = pd.read_csv(csv_path)
    df_fss.drop(df_fss[df_fss['branching_nodes'] == 'branching_nodes'].index, inplace=True)
    df_fss.reset_index(inplace=True, drop=True)
    df_fss = df_fss.astype({'hashes0': np.uint64, 'hashes1': np.uint64})

    df_testing = compute_fuzzy_similarity(df_testing, df_fss)

    df_testing.to_csv("../data/Dataset-Vulnerability/testing_Dataset-Vulnerability_{}".format(csv_name), index=False)

[D] Processing IMM:0.00_MNEM:0.00_GRAPH:1.00.csv


88700it [00:21, 4111.52it/s]


[D] Processing IMM:0.00_MNEM:1.00_GRAPH:1.00.csv


88700it [00:21, 4104.74it/s]


[D] Processing IMM:4.00_MNEM:0.05_GRAPH:1.00.csv


88700it [00:21, 4111.57it/s]


[D] Processing IMM:1.00_MNEM:1.00_GRAPH:1.00.csv


88700it [00:21, 4098.62it/s]
