In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Convert CodeCMR results                                                   #
#                                                                            #
##############################################################################

In [2]:
import numpy as np
import os
import pandas as pd

from scipy.spatial.distance import cosine
from tqdm import tqdm

In [3]:
def cosine_similarity(e1, e2):
    """Return the cosine similarity between the vectors ``e1`` and ``e2``.

    SciPy's ``cosine`` computes the cosine *distance*, so the similarity
    is obtained by subtracting that distance from one.
    """
    distance = cosine(e1, e2)
    return 1 - distance

def from_str_to_numpy_array(input_str):
    """Parse a textual list of numbers into a 1-D float NumPy array.

    The first and last characters of ``input_str`` are discarded
    (presumably the enclosing brackets of a serialized list -- confirm
    against the embeddings CSV) and the remainder is parsed as
    ``', '``-separated floats.
    """
    payload = input_str[1:-1]
    return np.fromstring(payload, sep=', ', dtype=float)

In [4]:
def compute_cosine_similarity(df_input):
    """Compute the cosine similarity for every row of ``df_input``.

    Args:
        df_input: DataFrame with the columns ``embeddings_1`` and
            ``embeddings_2``. Each cell holds either the string form of
            an embedding (parsed by ``from_str_to_numpy_array``) or a
            missing value.

    Returns:
        A list with one entry per row, in row order: the cosine
        similarity of the two embeddings, or ``0`` when either embedding
        is missing (a warning line is printed for each such row).
    """
    sim_list = list()
    # Passing the total lets tqdm render an actual progress bar / ETA.
    for idx, row in tqdm(df_input.iterrows(), total=len(df_input)):
        # pd.isna catches every missing marker (NaN floats, None, pd.NA).
        # An identity test against np.nan only matches that one singleton
        # object and lets other NaN instances fall through to the parser.
        if pd.isna(row['embeddings_1']) or pd.isna(row['embeddings_2']):
            print("[!] Missing value in (idx:{})".format(idx))
            sim_list.append(0)
            continue

        e1 = from_str_to_numpy_array(row['embeddings_1'])
        e2 = from_str_to_numpy_array(row['embeddings_2'])
        sim_list.append(cosine_similarity(e1, e2))
    return sim_list

In [5]:
def compute_embedding_similarity(df_pairs, df_emb):
    """Attach embeddings to both sides of each pair and score the pairs.

    Args:
        df_pairs: DataFrame of function pairs with the columns
            ``idb_path_1``, ``fva_1``, ``idb_path_2`` and ``fva_2``.
        df_emb: DataFrame of embeddings with the columns ``idb_path``,
            ``fva`` and ``embedding``.

    Returns:
        A DataFrame with the columns ``idb_path_1``, ``fva_1``,
        ``idb_path_2``, ``fva_2`` and ``sim``, where ``sim`` is produced
        by ``compute_cosine_similarity``.
    """
    emb_keys = ['idb_path', 'fva']
    output_columns = ['idb_path_1', 'fva_1', 'idb_path_2', 'fva_2', 'sim']

    merged = df_pairs
    # Same left join for each side of the pair; a pair whose function has
    # no embedding is kept and gets a missing value in that column.
    for side in ('1', '2'):
        merged = merged.merge(df_emb,
                              how='left',
                              left_on=['idb_path_' + side, 'fva_' + side],
                              right_on=emb_keys)
        merged = merged.rename(columns={'embedding': 'embeddings_' + side})

    merged['sim'] = compute_cosine_similarity(merged)
    return merged[output_columns]

In [6]:
# Input locations: the testing pairs of Dataset-1-CodeCMR and the raw
# CodeCMR embeddings.
DB1_PATH = "../../DBs/Dataset-1-CodeCMR/pairs/testing/"
EMBEDDINGS_CSV = "../data/raw_results/CodeCMR/Dataset-1-CodeCMR.csv"

# Output location: one CSV per pairs file, written under the same file name.
OUTPUT_DIR = "../data/Dataset-1-CodeCMR/"

POS_CSV = "pos_testing_Dataset-1-CodeCMR.csv"
NEG_CSV = "neg_testing_Dataset-1-CodeCMR.csv"
POS_RANK_CSV = "pos_rank_testing_Dataset-1-CodeCMR.csv"
NEG_RANK_CSV = "neg_rank_testing_Dataset-1-CodeCMR.csv"

# DataFrame.to_csv does not create parent directories; make sure the
# destination exists before spending time on the similarity computation.
os.makedirs(OUTPUT_DIR, exist_ok=True)

df_emb = pd.read_csv(EMBEDDINGS_CSV)

df_pos = pd.read_csv(os.path.join(DB1_PATH, POS_CSV), index_col=0)
df_neg = pd.read_csv(os.path.join(DB1_PATH, NEG_CSV), index_col=0)
df_pos_rank = pd.read_csv(os.path.join(DB1_PATH, POS_RANK_CSV), index_col=0)
df_neg_rank = pd.read_csv(os.path.join(DB1_PATH, NEG_RANK_CSV), index_col=0)

# Replace each pairs table with its scored version
# (idb_path_1, fva_1, idb_path_2, fva_2, sim).
df_pos = compute_embedding_similarity(df_pos, df_emb)
df_neg = compute_embedding_similarity(df_neg, df_emb)
df_pos_rank = compute_embedding_similarity(df_pos_rank, df_emb)
df_neg_rank = compute_embedding_similarity(df_neg_rank, df_emb)

df_pos.to_csv(os.path.join(OUTPUT_DIR, POS_CSV), index=False)
df_neg.to_csv(os.path.join(OUTPUT_DIR, NEG_CSV), index=False)
df_pos_rank.to_csv(os.path.join(OUTPUT_DIR, POS_RANK_CSV), index=False)
df_neg_rank.to_csv(os.path.join(OUTPUT_DIR, NEG_RANK_CSV), index=False)

200000it [00:24, 8021.55it/s]
200000it [00:24, 8028.75it/s]
800it [00:00, 7859.95it/s]
80000it [00:09, 8008.25it/s]
