In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Dataset-Vulnerability creation                                            #
#                                                                            #
##############################################################################

In [2]:
## Requirements
# tqdm==4.64.0
# pandas==1.4.2

In [3]:
import json
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

**Read the flowchart CSV**

In [4]:
# Load the flowchart features of Dataset-Vulnerability into a DataFrame
flowchart = pd.read_csv("features/flowchart_Dataset-Vulnerability.csv")
# Sanity check: print the (rows, columns) of the loaded table
print(flowchart.shape)

(11274, 8)


In [5]:
# Preview the first rows to check the column layout
flowchart.head()

Unnamed: 0,idb_path,fva,func_name,start_ea,end_ea,bb_num,bb_list,hashopcodes
0,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x3d908,.init_proc,0x3d908,0x3d98c,7,0x3d908;0x3d92c;0x3d938;0x3d93c;0x3d940;0x3d96...,db718c1f7f7e69f453df9a31a7f691b6197293b5d53d82...
1,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x3d990,_ftext,0x3d990,0x3da84,8,0x3d990;0x3d9ec;0x3d9f4;0x3da08;0x3da10;0x3da2...,bdf4e44bd33566ff51e8d779c84a93d59722d5c5876ffd...
2,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x3db14,__do_global_dtors_aux,0x3db14,0x3dbf4,8,0x3db14;0x3db4c;0x3db54;0x3db64;0x3db94;0x3dbc...,710f717269efae65b5ea975beb7209188dfcfc802da7a3...
3,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x3dc10,CRYPTO_get_new_lockid,0x3dc10,0x3dd18,9,0x3dc10;0x3dc40;0x3dc58;0x3dc70;0x3dc88;0x3dcb...,7039527b97befa4ba3a820b3a549664c71da4f938adb5c...
4,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x3dea8,CRYPTO_THREADID_current,0x3dea8,0x3df34,6,0x3dea8;0x3ded4;0x3dee0;0x3def0;0x3df04;0x3df18,eca1910e54cd579a25cdc20b1e8952bb465ddecc46cb4c...


In [6]:
# Count the rows available for each IDB
flowchart.groupby(['idb_path']).count()

Unnamed: 0_level_0,fva,func_name,start_ea,end_ea,bb_num,bb_list,hashopcodes
idb_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_NETGEAR_R7000_1.0.2h_arm32.i64,1766,1766,1766,1766,1766,1766,1766
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_TP-Link_Deco-M4_1.0.2d_mips32.i64,1679,1679,1679,1679,1679,1679,1679
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,1886,1886,1886,1886,1886,1886,1886
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_mips32.i64,1952,1952,1952,1952,1952,1952,1952
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_x64.i64,2062,2062,2062,2062,2062,2062,2062
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_x86.i64,1929,1929,1929,1929,1929,1929,1929


**Create pairs of functions for the vulnerability test.**

In [7]:
# Names of the query functions for the NETGEAR R7000 target.
# The trailing comment on each entry gives the related CVE identifier.
vulnerable_functions_netgear = [
    'BN_bn2dec',                # CVE-2016-2182
    'CMS_decrypt',              # CVE-2019-1563
    'MDC2_Update',              # CVE-2016-6303
    'PKCS7_dataDecode',         # CVE-2019-1563
]

In [8]:
# Names of the query functions for the TP-Link Deco-M4 target.
# The trailing comment on each entry gives the related CVE identifier.
vulnerable_functions_tplink = [
    'BN_bn2dec',                # CVE-2016-2182
    'BN_dec2bn',                # CVE-2016-0797
    'BN_hex2bn',                # CVE-2016-0797
    'CMS_decrypt',              # CVE-2019-1563
    'EVP_EncodeUpdate',         # CVE-2016-2105
    'EVP_EncryptUpdate',        # CVE-2016-2106
    'PKCS7_dataDecode',         # CVE-2019-1563
    'SRP_VBASE_get_by_user',    # CVE-2016-0798
    'X509_NAME_oneline',        # CVE-2016-2176
]

In [9]:
# Per-target configuration of the vulnerability test:
# - 'functions': names of the functions to query for that target
# - 'idb_path': IDB of the target firmware library, whose functions are the candidates
vuln_dict = {
    'NETGEAR_R7000': {
        'functions': vulnerable_functions_netgear,
        'idb_path': 'IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_NETGEAR_R7000_1.0.2h_arm32.i64'
    },
    'TP-Link_Deco-M4': {
        'functions': vulnerable_functions_tplink,
        'idb_path': 'IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_TP-Link_Deco-M4_1.0.2d_mips32.i64'
    }
}

In [10]:
# IDBs from which the query functions (left-hand side of each pair) are taken
source_idb_path_list = [
    'IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64',
    'IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_mips32.i64',
    'IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_x64.i64',
    'IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_x86.i64'
]

In [11]:
selected_columns = ['idb_path', 'fva', 'func_name', 'hashopcodes']

# Store the new function pairs: each entry is the selected columns of the
# source function followed by the selected columns of one target function
comparison_list = list()

# Iterate over each target of the vulnerability test
for target_fw, target_info in vuln_dict.items():

    # All the functions of the target IDB are candidates. They do not depend
    # on the source function, so they are extracted only once per target.
    target_mask = flowchart['idb_path'] == target_info['idb_path']
    right_rows = flowchart.loc[target_mask, selected_columns].values.tolist()

    # Iterate over each function in the list
    for source_func in tqdm(target_info['functions']):

        # Iterate over the selected OpenSSL binaries
        for source_path in source_idb_path_list:

            # Select the source function
            left_row = flowchart[(flowchart['idb_path'] == source_path) & (flowchart['func_name'] == source_func)]
            if left_row.empty:
                # Same exception type raised by an out-of-range `.values[0]`,
                # but with a message that tells which function is missing
                raise IndexError(
                    "Function {} not found in {}".format(source_func, source_path))
            left = left_row[selected_columns].values[0].tolist()

            # Pair the source function with every function of the target
            comparison_list.extend(left + right for right in right_rows)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.09s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:09<00:00,  1.05s/it]


In [12]:
# Inspect the first two pairs
comparison_list[:2]

[['IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64',
  '0x77224',
  'BN_bn2dec',
  '5c0dcea8ff350f2ff9753866220adb248f225dfddf193fd7bfd9a104a940bc57',
  'IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_NETGEAR_R7000_1.0.2h_arm32.i64',
  '0x3e82c',
  'start',
  'f7adc553e38316b76e058693ac8b6abee2f3a2988c2e0ffe9903f2678ba96ce8'],
 ['IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64',
  '0x77224',
  'BN_bn2dec',
  '5c0dcea8ff350f2ff9753866220adb248f225dfddf193fd7bfd9a104a940bc57',
  'IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_NETGEAR_R7000_1.0.2h_arm32.i64',
  '0x3e8a8',
  'sub_3E8A8',
  'e7e26db952ab6c95744f957025825d25cace4bcf53da40b7abcdc4756f89ca18']]

In [13]:
# Build the DataFrame of pairs: the columns of the first function of each
# pair get the "_1" suffix, the columns of the second one get "_2"
columns = [name + suffix for suffix in ("_1", "_2") for name in selected_columns]

# Every pair is labelled with the same db_type value
testing = pd.DataFrame(comparison_list, columns=columns).assign(db_type='XM')
print(testing.shape)

(88700, 9)


In [14]:
# Order the pairs by the first function, then by the second one.
# The index is rebuilt so that it follows the new order.
testing = (
    testing
    .sort_values(by=['idb_path_1', 'fva_1', 'idb_path_2', 'fva_2'])
    .reset_index(drop=True)
)
print(testing.shape)

(88700, 9)


In [15]:
# Check that the hashopcodes of the functions to compare are different.
# The two columns are compared as a whole, and only the offending rows
# (if any) are visited to be reported.
same_hash = testing['hashopcodes_1'] == testing['hashopcodes_2']
for i, row in testing[same_hash].iterrows():
    print("MATCH!")
    print(row)

In [16]:
# Paranoid check: remove duplicated pairs (if any) and rebuild the index.
# The printed shape shows whether any row has been removed.
testing = testing.drop_duplicates().reset_index(drop=True)
print(testing.shape)

(88700, 9)


In [17]:
# Drop both hashopcodes columns from the pairs
testing = testing.drop(columns=['hashopcodes_1', 'hashopcodes_2'])

In [18]:
# Preview the first pairs
testing.head()

Unnamed: 0,idb_path_1,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
0,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x11d874,PKCS7_dataDecode,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x10018c,X509_load_cert_crl_file,XM
1,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x11d874,PKCS7_dataDecode,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x1002dc,sub_1002DC,XM
2,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x11d874,PKCS7_dataDecode,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x1003e0,sub_1003E0,XM
3,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x11d874,PKCS7_dataDecode,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x100454,sub_100454,XM
4,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x11d874,PKCS7_dataDecode,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0x1009a0,sub_1009A0,XM


In [19]:
# Preview the last pairs
testing.tail()

Unnamed: 0,idb_path_1,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
88695,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xc5090,EVP_EncryptUpdate,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xffca4,TS_TST_INFO_set_msg_imprint,XM
88696,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xc5090,EVP_EncryptUpdate,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xffd50,TS_TST_INFO_set_serial,XM
88697,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xc5090,EVP_EncryptUpdate,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xffdfc,TS_TST_INFO_set_time,XM
88698,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xc5090,EVP_EncryptUpdate,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xffea8,TS_TST_INFO_set_accuracy,XM
88699,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xc5090,EVP_EncryptUpdate,IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_...,0xfff54,TS_ACCURACY_set_seconds,XM


In [20]:
# Count the pairs for each (source IDB, source function) combination
testing.groupby(['idb_path_1', 'func_name_1']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,fva_1,idb_path_2,fva_2,func_name_2,db_type
idb_path_1,func_name_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,BN_bn2dec,3445,3445,3445,3445,3445
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,BN_dec2bn,1679,1679,1679,1679,1679
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,BN_hex2bn,1679,1679,1679,1679,1679
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,CMS_decrypt,3445,3445,3445,3445,3445
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,EVP_EncodeUpdate,1679,1679,1679,1679,1679
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,EVP_EncryptUpdate,1679,1679,1679,1679,1679
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,MDC2_Update,1766,1766,1766,1766,1766
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,PKCS7_dataDecode,3445,3445,3445,3445,3445
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,SRP_VBASE_get_by_user,1679,1679,1679,1679,1679
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,X509_NAME_oneline,1679,1679,1679,1679,1679


In [21]:
# Count the pairs for each source IDB
testing.groupby(['idb_path_1']).count()

Unnamed: 0_level_0,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
idb_path_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_arm32.i64,22175,22175,22175,22175,22175,22175
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_mips32.i64,22175,22175,22175,22175,22175,22175
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_x64.i64,22175,22175,22175,22175,22175,22175
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_openssl_1.0.2d_x86.i64,22175,22175,22175,22175,22175,22175


In [22]:
# Count the pairs for each target IDB
testing.groupby(['idb_path_2']).count()

Unnamed: 0_level_0,idb_path_1,fva_1,func_name_1,fva_2,func_name_2,db_type
idb_path_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_NETGEAR_R7000_1.0.2h_arm32.i64,28256,28256,28256,28256,28256,28256
IDBs/Dataset-Vulnerability/libcrypto.so.1.0.0_TP-Link_Deco-M4_1.0.2d_mips32.i64,60444,60444,60444,60444,60444,60444


In [23]:
# Save the DataFrame of pairs to file.
# With the default to_csv arguments the index is written as the first column.
testing.to_csv("pairs/pairs_testing_Dataset-Vulnerability.csv")

In [24]:
# Save the "selected functions" to a JSON.
# This is useful to limit the IDA analysis to some functions only.

# Collect the unique (idb_path, fva) tuples from both sides of the pairs
testing_functions = set(zip(testing['idb_path_1'], testing['fva_1']))
testing_functions |= set(zip(testing['idb_path_2'], testing['fva_2']))
print("Found {} unique functions".format(len(testing_functions)))

# Map each IDB path to the addresses of its selected functions. The fva is
# a hexadecimal string and it is stored as an integer.
# The set is visited in sorted order (by IDB path, then by address) because
# the iteration order of a set of strings can change from run to run:
# sorting makes the content of the JSON file reproducible.
selected_functions = defaultdict(list)
for idb_path, fva in sorted(testing_functions, key=lambda t: (t[0], int(t[1], 16))):
    selected_functions[idb_path].append(int(fva, 16))

# Test: every unique function must appear exactly once in the mapping
assert sum(len(v) for v in selected_functions.values()) == len(testing_functions)

# Save to file
with open("features/selected_Dataset-Vulnerability.json", "w") as f_out:
    json.dump(selected_functions, f_out)

Found 3485 unique functions


In [25]:
# Save the "selected functions" to a CSV.
# This will be useful to post-process the results.

# Start from the flowchart data without the bb_list column
dataset = flowchart.drop(columns=['bb_list'])

# Keep only the functions that are used for the testing: a function is
# identified by its (idb_path, fva) tuple. The mask is computed in a single
# pass over the two columns instead of visiting the rows one by one.
is_selected = [
    key in testing_functions
    for key in zip(dataset['idb_path'], dataset['fva'])
]
dataset = dataset.loc[is_selected].reset_index(drop=True)
print(dataset.shape)

# Save to file
dataset.to_csv("testing_Dataset-Vulnerability.csv")

(3485, 7)
