In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Dataset-Vulnerability sanity check                                        #
#                                                                            #
##############################################################################

In [2]:
## Requirements
# pandas==1.4.2

In [3]:
import json
import os
import pandas as pd
import unittest

In [4]:
# Expected number of rows, and of unique (idb_path, fva) pairs, in the
# testing CSV.
N_SELECTED_FUNCS = 3485
# Expected number of distinct idb_path values in the testing CSV; also the
# expected number of binaries and of per-binary feature files on disk.
N_SELECTED_BINARIES = 6
# Expected number of rows with db_type "XM" in the pairs CSV.
N_SELECTED_PAIRS = 88700

In [5]:
class TestDatasetVulnerability(unittest.TestCase):
    
    def setUp(self):
        testing_dataset = pd.read_csv("testing_Dataset-Vulnerability.csv", index_col=0)
        self.fva_set = set([tuple(x) for x in testing_dataset[['idb_path', 'fva']].values])
        self.idb_set = set(testing_dataset['idb_path'].values)

    def test_testing_dataset(self):
        testing_dataset = pd.read_csv("testing_Dataset-Vulnerability.csv", index_col=0)

        # Test null values
        self.assertFalse(testing_dataset.isnull().values.any())

        # Test number of unique functions
        self.assertEqual(testing_dataset.shape[0], N_SELECTED_FUNCS)
        self.assertEqual(len(self.fva_set), N_SELECTED_FUNCS)

        # Test the number of unique binaries
        self.assertEqual(len(self.idb_set), N_SELECTED_BINARIES)
        
    def test_flowchart_dataset(self):
        flowchart_dataset = pd.read_csv("features/flowchart_Dataset-Vulnerability.csv")

        # Test null values
        self.assertFalse(flowchart_dataset.isnull().values.any())

        # Functions in testing_dataset must a subset of those in flowchart
        fc_fva_set = set([tuple(x) for x in flowchart_dataset[['idb_path', 'fva']].values])
        self.assertEqual(self.fva_set & fc_fva_set, self.fva_set)
                
    def test_pairs_dataset(self):
        pairs_dataset = pd.read_csv("pairs/pairs_testing_Dataset-Vulnerability.csv", index_col=0)
        
        # Test null values
        self.assertFalse(pairs_dataset.isnull().values.any())

        # Test pairs size
        t, c = list(pairs_dataset.groupby("db_type").count()['idb_path_1'].items())[0]
        self.assertEqual(t, "XM")
        self.assertEqual(c, N_SELECTED_PAIRS)

        # Test overlapping functions
        p_fva_set = set([tuple(x) for x in pairs_dataset[['idb_path_1', 'fva_1']].values])
        p_fva_set |= set([tuple(x) for x in pairs_dataset[['idb_path_2', 'fva_2']].values])
        self.assertEqual(p_fva_set & self.fva_set, p_fva_set | self.fva_set)
        
    def test_selected(self):
        with open("features/selected_Dataset-Vulnerability.json") as f_in:
            selected = json.load(f_in)
            
            # Test overlapping functions
            entries_s = set([(k, hex(v)) for k in selected for v in selected[k]])
            self.assertEqual(entries_s & self.fva_set, entries_s | self.fva_set)
            
    def test_binary_files(self):
        binary_counter = 0
        for path in self.idb_set:
            npath = path.replace("IDBs/", "../../Binaries/")
            npath = npath.replace(".i64", "")
            if os.path.isfile(npath):
                binary_counter += 1
        self.assertEqual(binary_counter, N_SELECTED_BINARIES)
        
    def test_acfg_disasm(self):
        j_counter = 0
        for path in self.idb_set:
            path = os.path.join(
                "features/acfg_disasm_Dataset-Vulnerability",
                os.path.basename(path).replace(".i64", "_acfg_disasm.json"))
            if os.path.isfile(path):
                j_counter += 1
        self.assertEqual(j_counter, N_SELECTED_BINARIES)
    
    def test_acfg_features(self):
        j_counter = 0
        for path in self.idb_set:
            path = os.path.join(
                "features/acfg_features_Dataset-Vulnerability",
                os.path.basename(path).replace(".i64", "_acfg_features.json"))
            if os.path.isfile(path):
                j_counter += 1
        self.assertEqual(j_counter, N_SELECTED_BINARIES)
    
    def test_catalog1(self):
        j_counter = 0
        sigs = [16, 32, 64, 128]
        for x in sigs:
            if os.path.isfile(
                os.path.join(
                    "features/catalog1_Dataset-Vulnerability/",
                    "Dataset-Vulnerability_catalog1_{}.csv".format(x))):
                j_counter += 1
        self.assertEqual(j_counter, len(sigs))

    def test_fss(self):
        j_counter = 0
        for path in self.idb_set:
            path = os.path.join(
                "features/fss_Dataset-Vulnerability",
                os.path.basename(path).replace(".i64", "_Capstone_True_fss.json"))
            if os.path.isfile(path):
                j_counter += 1
        self.assertEqual(j_counter, N_SELECTED_BINARIES)

# Run the suite inside the notebook kernel. argv=[''] keeps unittest from
# parsing the kernel's own command line, and exit=False stops it from calling
# sys.exit() when the run finishes. The result is bound to a name so the cell
# does not echo the TestProgram repr and the outcome stays inspectable.
test_program = unittest.main(argv=[''], verbosity=2, exit=False)

test_acfg_disasm (__main__.TestDatasetVulnerability) ... ok
test_acfg_features (__main__.TestDatasetVulnerability) ... ok
test_binary_files (__main__.TestDatasetVulnerability) ... ok
test_catalog1 (__main__.TestDatasetVulnerability) ... ok
test_flowchart_dataset (__main__.TestDatasetVulnerability) ... ok
test_fss (__main__.TestDatasetVulnerability) ... ok
test_pairs_dataset (__main__.TestDatasetVulnerability) ... ok
test_selected (__main__.TestDatasetVulnerability) ... ok
test_testing_dataset (__main__.TestDatasetVulnerability) ... ok

----------------------------------------------------------------------
Ran 9 tests in 0.375s

OK


<unittest.main.TestProgram at 0x103a24b50>