In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Convert Catalog1 results                                                  #
#                                                                            #
##############################################################################

In [2]:
import numpy as np
import os
import pandas as pd

from sklearn import metrics

In [3]:
def jaccard_similarity(ff1, ff2):
    """Return the Jaccard similarity of two collections of hashable items.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Args:
        ff1: iterable of hashable items.
        ff2: iterable of hashable items.

    Returns:
        float: ``len(A & B) / len(A | B)``. When both inputs are empty the
        ratio is undefined (0/0); 0.0 is returned in that case instead of
        raising ZeroDivisionError.
    """
    # Build each set once; it is needed for both the intersection and union.
    items_1, items_2 = set(ff1), set(ff2)
    union = items_1 | items_2
    if not union:
        # Nothing to compare on either side: report "no similarity".
        return 0.0
    return len(items_1 & items_2) / len(union)


def compute_catalog1_similarity(df_input):
    """Score every row of ``df_input`` from its two hash-list columns.

    For each row, the 'catalog_1' and 'catalog_2' values are split on ';'
    and the two resulting token lists are compared with
    ``jaccard_similarity``.

    Args:
        df_input: DataFrame with 'catalog_1' and 'catalog_2' columns whose
            values are strings (``.split`` is called on each value).

    Returns:
        list: one similarity score per row, in row order.
    """
    # Iterate over the two columns directly rather than with iterrows(),
    # which builds a Series per row only to read two fields from it.
    return [
        jaccard_similarity(hashes_1.split(";"), hashes_2.split(";"))
        for hashes_1, hashes_2 in zip(df_input['catalog_1'],
                                      df_input['catalog_2'])
    ]


def compute_fuzzy_similarity(df_input, df_catalog):
    """Attach catalog data to both sides of each pair and score the pairs.

    ``df_catalog`` is left-joined twice on its ('path', 'address') columns:
    first against ('idb_path_1', 'fva_1'), then against
    ('idb_path_2', 'fva_2'). After each join the catalog's
    'catalog_hash_list' and 'time' columns are renamed with a ``_1`` /
    ``_2`` suffix so the second join does not collide with the first.

    Args:
        df_input: DataFrame of pairs with the columns 'idb_path_1', 'fva_1',
            'idb_path_2' and 'fva_2'.
        df_catalog: DataFrame with the columns 'path', 'address' and
            'catalog_hash_list' (a 'time' column is renamed when present).

    Returns:
        DataFrame restricted to 'idb_path_1', 'fva_1', 'idb_path_2', 'fva_2'
        and 'sim', where 'sim' is produced by
        ``compute_catalog1_similarity``.
    """
    catalog_keys = ['path', 'address']

    # First function of the pair.
    with_first = df_input.merge(
        df_catalog,
        how='left',
        left_on=['idb_path_1', 'fva_1'],
        right_on=catalog_keys,
    ).rename(columns={'catalog_hash_list': 'catalog_1',
                      'time': 'catalog_time_1'})

    # Second function of the pair.
    with_both = with_first.merge(
        df_catalog,
        how='left',
        left_on=['idb_path_2', 'fva_2'],
        right_on=catalog_keys,
    ).rename(columns={'catalog_hash_list': 'catalog_2',
                      'time': 'catalog_time_2'})

    with_both['sim'] = compute_catalog1_similarity(with_both)

    output_columns = ['idb_path_1', 'fva_1', 'idb_path_2', 'fva_2', 'sim']
    return with_both[output_columns]

### Process Dataset-1 results

In [4]:
DB1_PATH = "../../DBs/Dataset-1/pairs/testing/"
CATALOG1_PATH = "../data/raw_results/Catalog1/Dataset-1"
OUTPUT1_PATH = "../data/Dataset-1"

# The pair lists are the same for every Catalog1 CSV, so they are read once
# here instead of once per loop iteration. compute_fuzzy_similarity returns
# a new frame and leaves these inputs untouched.
df_pos_pairs = pd.read_csv(os.path.join(DB1_PATH, "pos_testing_Dataset-1.csv"), index_col=0)
df_neg_pairs = pd.read_csv(os.path.join(DB1_PATH, "neg_testing_Dataset-1.csv"), index_col=0)
df_pos_rank_pairs = pd.read_csv(os.path.join(DB1_PATH, "pos_rank_testing_Dataset-1.csv"), index_col=0)
df_neg_rank_pairs = pd.read_csv(os.path.join(DB1_PATH, "neg_rank_testing_Dataset-1.csv"), index_col=0)

# to_csv does not create missing directories.
os.makedirs(OUTPUT1_PATH, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a reproducible run.
for csv_name in sorted(os.listdir(CATALOG1_PATH)):
    if not csv_name.endswith(".csv"):
        continue
    csv_path = os.path.join(CATALOG1_PATH, csv_name)
    print("[D] Processing {}".format(csv_path))

    df_catalog = pd.read_csv(csv_path)
    # Discard rows whose 'catalog_hash_list' value is the column name itself
    # (presumably header lines repeated inside the raw results file).
    is_header_row = df_catalog['catalog_hash_list'] == 'catalog_hash_list'
    df_catalog = df_catalog[~is_header_row].reset_index(drop=True)

    df_pos = compute_fuzzy_similarity(df_pos_pairs, df_catalog)
    df_neg = compute_fuzzy_similarity(df_neg_pairs, df_catalog)
    df_pos_rank = compute_fuzzy_similarity(df_pos_rank_pairs, df_catalog)
    df_neg_rank = compute_fuzzy_similarity(df_neg_rank_pairs, df_catalog)

    df_pos.to_csv(os.path.join(OUTPUT1_PATH, "pos_testing_{}".format(csv_name)), index=False)
    df_neg.to_csv(os.path.join(OUTPUT1_PATH, "neg_testing_{}".format(csv_name)), index=False)
    df_pos_rank.to_csv(os.path.join(OUTPUT1_PATH, "pos_rank_testing_{}".format(csv_name)), index=False)
    df_neg_rank.to_csv(os.path.join(OUTPUT1_PATH, "neg_rank_testing_{}".format(csv_name)), index=False)

[D] Processing ../data/raw_results/Catalog1/Dataset-1/Dataset-1_catalog1_16.csv
[D] Processing ../data/raw_results/Catalog1/Dataset-1/Dataset-1_catalog1_64.csv
[D] Processing ../data/raw_results/Catalog1/Dataset-1/Dataset-1_catalog1_128.csv
[D] Processing ../data/raw_results/Catalog1/Dataset-1/Dataset-1_catalog1_32.csv


### Process Dataset-2 results

In [5]:
DB2_PATH = "../../DBs/Dataset-2/pairs/"
CATALOG1_PATH = "../data/raw_results/Catalog1/Dataset-2"
OUTPUT2_PATH = "../data/Dataset-2"

# The pair lists are the same for every Catalog1 CSV, so they are read once
# here instead of once per loop iteration. compute_fuzzy_similarity returns
# a new frame and leaves these inputs untouched.
df_pos_pairs = pd.read_csv(os.path.join(DB2_PATH, "pos_testing_Dataset-2.csv"), index_col=0)
df_neg_pairs = pd.read_csv(os.path.join(DB2_PATH, "neg_testing_Dataset-2.csv"), index_col=0)
df_pos_rank_pairs = pd.read_csv(os.path.join(DB2_PATH, "pos_rank_testing_Dataset-2.csv"), index_col=0)
df_neg_rank_pairs = pd.read_csv(os.path.join(DB2_PATH, "neg_rank_testing_Dataset-2.csv"), index_col=0)

# to_csv does not create missing directories.
os.makedirs(OUTPUT2_PATH, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a reproducible run.
for csv_name in sorted(os.listdir(CATALOG1_PATH)):
    if not csv_name.endswith(".csv"):
        continue
    csv_path = os.path.join(CATALOG1_PATH, csv_name)
    print("[D] Processing {}".format(csv_path))

    df_catalog = pd.read_csv(csv_path)
    # Discard rows whose 'catalog_hash_list' value is the column name itself
    # (presumably header lines repeated inside the raw results file).
    is_header_row = df_catalog['catalog_hash_list'] == 'catalog_hash_list'
    df_catalog = df_catalog[~is_header_row].reset_index(drop=True)

    df_pos = compute_fuzzy_similarity(df_pos_pairs, df_catalog)
    df_neg = compute_fuzzy_similarity(df_neg_pairs, df_catalog)
    df_pos_rank = compute_fuzzy_similarity(df_pos_rank_pairs, df_catalog)
    df_neg_rank = compute_fuzzy_similarity(df_neg_rank_pairs, df_catalog)

    df_pos.to_csv(os.path.join(OUTPUT2_PATH, "pos_testing_{}".format(csv_name)), index=False)
    df_neg.to_csv(os.path.join(OUTPUT2_PATH, "neg_testing_{}".format(csv_name)), index=False)
    df_pos_rank.to_csv(os.path.join(OUTPUT2_PATH, "pos_rank_testing_{}".format(csv_name)), index=False)
    df_neg_rank.to_csv(os.path.join(OUTPUT2_PATH, "neg_rank_testing_{}".format(csv_name)), index=False)

[D] Processing ../data/raw_results/Catalog1/Dataset-2/Dataset-2_catalog1_128.csv
[D] Processing ../data/raw_results/Catalog1/Dataset-2/Dataset-2_catalog1_32.csv
[D] Processing ../data/raw_results/Catalog1/Dataset-2/Dataset-2_catalog1_16.csv
[D] Processing ../data/raw_results/Catalog1/Dataset-2/Dataset-2_catalog1_64.csv


### Process Dataset-Vulnerability results

In [6]:
# Dedicated name for the Vulnerability pairs directory, so this cell no
# longer overwrites the DB2_PATH constant that belongs to Dataset-2.
DBV_PATH = "../../DBs/Dataset-Vulnerability/pairs/"
CATALOG1_PATH = "../data/raw_results/Catalog1/Dataset-Vulnerability/"
OUTPUTV_PATH = "../data/Dataset-Vulnerability"

# The pair list is the same for every Catalog1 CSV, so it is read once here
# instead of once per loop iteration. compute_fuzzy_similarity returns a new
# frame and leaves this input untouched.
df_testing_pairs = pd.read_csv(os.path.join(DBV_PATH, "pairs_testing_Dataset-Vulnerability.csv"), index_col=0)

# to_csv does not create missing directories.
os.makedirs(OUTPUTV_PATH, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a reproducible run.
for csv_name in sorted(os.listdir(CATALOG1_PATH)):
    if not csv_name.endswith(".csv"):
        continue
    csv_path = os.path.join(CATALOG1_PATH, csv_name)
    print("[D] Processing {}".format(csv_path))

    df_catalog = pd.read_csv(csv_path)
    # Discard rows whose 'catalog_hash_list' value is the column name itself
    # (presumably header lines repeated inside the raw results file).
    is_header_row = df_catalog['catalog_hash_list'] == 'catalog_hash_list'
    df_catalog = df_catalog[~is_header_row].reset_index(drop=True)

    df_testing = compute_fuzzy_similarity(df_testing_pairs, df_catalog)

    df_testing.to_csv(os.path.join(OUTPUTV_PATH, "testing_{}".format(csv_name)), index=False)

[D] Processing ../data/raw_results/Catalog1/Dataset-Vulnerability/Dataset-Vulnerability_catalog1_128.csv
[D] Processing ../data/raw_results/Catalog1/Dataset-Vulnerability/Dataset-Vulnerability_catalog1_32.csv
[D] Processing ../data/raw_results/Catalog1/Dataset-Vulnerability/Dataset-Vulnerability_catalog1_16.csv
[D] Processing ../data/raw_results/Catalog1/Dataset-Vulnerability/Dataset-Vulnerability_catalog1_64.csv
