In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Dataset-1-CodeCMR sanity check                                            #
#                                                                            #
##############################################################################

In [2]:
## Requirements
# pandas==1.4.2

In [3]:
import json
import os
import pandas as pd
import pickle
import unittest

In [4]:
# Expected size of each dataset split, used as ground truth by the tests below.
# The *_FUNCS_* values are compared against the number of rows and the number
# of unique (idb_path, fva) pairs in the split CSV; the *_BINARIES_* values are
# compared against the number of unique idb_path entries and against the number
# of binary / pickle files found on disk.
N_SELECTED_FUNCS_TRAINING = 181907
N_SELECTED_BINARIES_TRAINING = 2114

N_SELECTED_FUNCS_VALIDATION = 8233
N_SELECTED_BINARIES_VALIDATION = 562

N_SELECTED_FUNCS_TESTING = 286950
N_SELECTED_BINARIES_TESTING = 600

In [5]:
!ls .

Dataset-1-CodeCMR example.ipynb      [34mpairs[m[m
Dataset-1-CodeCMR plots.ipynb        testing_Dataset-1-CodeCMR.csv
Dataset-1-CodeCMR sanity check.ipynb training_Dataset-1-CodeCMR.csv
[34mfeatures[m[m                             validation_Dataset-1-CodeCMR.csv


In [6]:
class TestDataset1(unittest.TestCase):
    
    def setUp(self):
        training_dataset = pd.read_csv("training_Dataset-1-CodeCMR.csv", index_col=0)
        self.fva_set_training = set([tuple(x) for x in training_dataset[['idb_path', 'fva']].values])
        self.idb_set_training = set(training_dataset['idb_path'].values)
        self.pickle_set_training = set(training_dataset['pickle_path'].values)
        
        validation_dataset = pd.read_csv("validation_Dataset-1-CodeCMR.csv", index_col=0)
        self.fva_set_validation = set([tuple(x) for x in validation_dataset[['idb_path', 'fva']].values])
        self.idb_set_validation = set(validation_dataset['idb_path'].values)
        self.pickle_set_validation = set(validation_dataset['pickle_path'].values)

        testing_dataset = pd.read_csv("testing_Dataset-1-CodeCMR.csv", index_col=0)
        self.fva_set_testing = set([tuple(x) for x in testing_dataset[['idb_path', 'fva']].values])
        self.idb_set_testing = set(testing_dataset['idb_path'].values)
        self.pickle_set_testing = set(testing_dataset['pickle_path'].values)
    
    def test_training_dataset(self):
        training_dataset = pd.read_csv("training_Dataset-1-CodeCMR.csv", index_col=0)

        # Test null values
        self.assertFalse(training_dataset.isnull().values.any())

        # Test number of unique functions
        self.assertEqual(training_dataset.shape[0], N_SELECTED_FUNCS_TRAINING)
        fva_set_training = set([tuple(x) for x in training_dataset[['idb_path', 'fva']].values])
        self.assertEqual(len(fva_set_training), N_SELECTED_FUNCS_TRAINING)

        # Test the number of unique binaries
        idb_set_training = set(training_dataset['idb_path'].values)
        self.assertEqual(len(idb_set_training), N_SELECTED_BINARIES_TRAINING)
    
    def test_validation_dataset(self):
        validation_dataset = pd.read_csv("validation_Dataset-1-CodeCMR.csv", index_col=0)
        
        # Test null values
        self.assertFalse(validation_dataset.isnull().values.any())
        
        # Test number of unique functions
        self.assertEqual(validation_dataset.shape[0], N_SELECTED_FUNCS_VALIDATION)
        fva_set_validation = set([tuple(x) for x in validation_dataset[['idb_path', 'fva']].values])
        self.assertEqual(len(fva_set_validation), N_SELECTED_FUNCS_VALIDATION)
        
        # Test the number of unique binaries
        idb_set_validation = set(validation_dataset['idb_path'].values)
        self.assertEqual(len(idb_set_validation), N_SELECTED_BINARIES_VALIDATION)
    
    def test_testing_dataset(self):
        testing_dataset = pd.read_csv("testing_Dataset-1-CodeCMR.csv", index_col=0)

        # Test null values
        self.assertFalse(testing_dataset.isnull().values.any())

        # Test number of unique functions
        self.assertEqual(testing_dataset.shape[0], N_SELECTED_FUNCS_TESTING)
        fva_set_testing = set([tuple(x) for x in testing_dataset[['idb_path', 'fva']].values])
        self.assertEqual(len(fva_set_testing), N_SELECTED_FUNCS_TESTING)

        # Test the number of unique binaries
        idb_set_testing = set(testing_dataset['idb_path'].values)
        self.assertEqual(len(idb_set_testing), N_SELECTED_BINARIES_TESTING)

    def test_pairs_dataset_testing(self):
        for pair_name in os.listdir("pairs/testing/"):
            if not pair_name.endswith(".csv"):
                continue
            pair_path = os.path.join("pairs/testing/", pair_name)

            pairs_dataset = pd.read_csv(pair_path, index_col=0)

            # Test null values
            self.assertFalse(pairs_dataset.isnull().values.any())

            # Test pairs size            
            dt = dict(pairs_dataset.groupby("db_type").count()['idb_path_1'].items())
            if pair_name.startswith("pos_rank"):
                self.assertDictEqual(dt, {'XA': 200, 'XC': 200, 'XC+XB': 200, 'XM': 200})
            elif pair_name.startswith("neg_rank"):
                self.assertDictEqual(dt, {'XA': 20000, 'XC': 20000, 'XC+XB': 20000, 'XM': 20000})
            else:
                self.assertDictEqual(dt, {'XA': 50000, 'XC': 50000, 'XC+XB': 50000, 'XM': 50000})

            # Test overlapping functions
            p_fva_set = set([tuple(x) for x in pairs_dataset[['idb_path_1', 'fva_1']].values])
            p_fva_set |= set([tuple(x) for x in pairs_dataset[['idb_path_2', 'fva_2']].values])
            self.assertEqual(p_fva_set & self.fva_set_testing, p_fva_set)
    
    def test_pairs_dataset_validation(self):
        for pair_name in os.listdir("pairs/validation/"):
            if not pair_name.endswith(".csv"):
                continue
            pair_path = os.path.join("pairs/validation/", pair_name)

            pairs_dataset = pd.read_csv(pair_path, index_col=0)

            # Test null values
            self.assertFalse(pairs_dataset.isnull().values.any())

            # Test pairs size            
            dt = dict(pairs_dataset.groupby("db_type").count()['idb_path_1'].items())
            self.assertDictEqual(dt, {'XA': 10000, 'XC': 10000, 'XC+XB': 10000, 'XM': 10000})

            # Test overlapping functions
            p_fva_set = set([tuple(x) for x in pairs_dataset[['idb_path_1', 'fva_1']].values])
            p_fva_set |= set([tuple(x) for x in pairs_dataset[['idb_path_2', 'fva_2']].values])
            self.assertEqual(p_fva_set & self.fva_set_validation, p_fva_set)
    
    def test_selected(self):
        with open("features/training/selected_training_Dataset-1-CodeCMR.json") as f_in:
            selected = json.load(f_in)
            
            # Test overlapping functions
            entries_s = set([(k, hex(v)) for k in selected for v in selected[k]])
            self.assertEqual(entries_s & self.fva_set_training, entries_s | self.fva_set_training)
        
        with open("features/validation/selected_validation_Dataset-1-CodeCMR.json") as f_in:
            selected = json.load(f_in)
            
            # Test overlapping functions
            entries_s = set([(k, hex(v)) for k in selected for v in selected[k]])
            self.assertEqual(entries_s & self.fva_set_validation, entries_s | self.fva_set_validation)
        
        with open("features/testing/selected_testing_Dataset-1-CodeCMR.json") as f_in:
            selected = json.load(f_in)
            
            # Test overlapping functions
            entries_s = set([(k, hex(v)) for k in selected for v in selected[k]])
            self.assertEqual(entries_s & self.fva_set_testing, entries_s | self.fva_set_testing)
            
    def test_binary_files(self):
        a_list = [self.idb_set_training, self.idb_set_validation, self.idb_set_testing]
        b_list = [N_SELECTED_BINARIES_TRAINING, N_SELECTED_BINARIES_VALIDATION, N_SELECTED_BINARIES_TESTING]
        for bin_list, n_binaries in zip (a_list, b_list):
            binary_counter = 0
            for path in bin_list:
                npath = path.replace("IDBs/", "../../Binaries/")
                npath = npath.replace(".i64", "")
                if os.path.isfile(npath):
                    binary_counter += 1
            self.assertEqual(binary_counter, n_binaries)
            
    def test_pickle_files(self):
        a_list = [self.pickle_set_training, self.pickle_set_validation, self.pickle_set_testing]
        b_list = [N_SELECTED_BINARIES_TRAINING, N_SELECTED_BINARIES_VALIDATION, N_SELECTED_BINARIES_TESTING]
        
        for pickle_list, n_binaries in zip (a_list, b_list):
            pickle_counter = 0
            for pickle_path in pickle_list:
                if os.path.isfile(pickle_path):
                    pickle_counter += 1
            self.assertEqual(pickle_counter, n_binaries)
        
# Run the test case inside the notebook: argv=[''] keeps unittest from parsing
# the kernel's own command-line arguments, and exit=False stops it from calling
# sys.exit() once the run is over.
unittest.main(argv=[''], verbosity=2, exit=False)

test_binary_files (__main__.TestDataset1) ... ok
test_pairs_dataset_testing (__main__.TestDataset1) ... ok
test_pairs_dataset_validation (__main__.TestDataset1) ... ok
test_pickle_files (__main__.TestDataset1) ... ok
test_selected (__main__.TestDataset1) ... ok
test_testing_dataset (__main__.TestDataset1) ... ok
test_training_dataset (__main__.TestDataset1) ... ok
test_validation_dataset (__main__.TestDataset1) ... ok

----------------------------------------------------------------------
Ran 8 tests in 15.825s

OK


<unittest.main.TestProgram at 0x117ab95b0>