# Generating ILP Testing Set

Finds a group of small buffer overflow examples from the Juliet dataset, then generates their Prolog representations from the code property graph generated by Joern. These are saved to `../data/ilp_test_data.csv.gz` and `../data/ilp_prolog_test_data.csv.gz` respectively.


## Find a nice subset

Find a small number of short buffer overflow examples from the Juliet dataset, and save them into the `../data/ilp_test_data.csv.gz` dataframe. 

In [1]:
import pandas as pd

In [2]:
# Load the Juliet buffer overflow dataset from the project's data directory.
buffer_overflow_juliet = pd.read_csv("../data/buffer_overflow_data.csv.gz")

In [3]:
# Length (in characters) of each row's source text; used to rank test cases by size.
code_sizes = buffer_overflow_juliet['code'].map(len)
buffer_overflow_juliet['code_length'] = code_sizes
buffer_overflow_juliet

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
0,984,984,62516,000/062/516/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,112,False,5108
1,985,985,62517,000/062/517/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,117,False,9668
2,986,986,62518,000/062/518/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,117,False,9710
3,987,987,62519,000/062/519/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,123,False,10162
4,988,988,62520,000/062/520/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,123,False,10084
5,989,989,62521,000/062/521/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,122,False,10123
6,990,990,62522,000/062/522/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,122,False,10054
7,991,991,62523,000/062/523/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,130,False,10190
8,992,992,62524,000/062/524/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,117,False,9897
9,993,993,62525,000/062/525/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,117,False,9799


Pick the next 5 smallest test cases (the 20 smallest were already chosen for our training set):

In [4]:
# Total source length of every test case (summed over all of its rows),
# listed from the smallest test case to the largest.
(
    buffer_overflow_juliet
    .groupby('testcase_ID')['code_length']
    .sum()
    .sort_values()
)

testcase_ID
-62852      1570
-62900      1576
-62869      1616
-62804      1618
-232012     1619
-62853      1621
-62917      1622
-232086     1625
-62901      1627
-62854      1630
-62867      1632
-62902      1636
-62915      1638
-62868      1643
-62861      1649
-62916      1649
-62909      1655
-62865      1657
-62913      1663
-62821      1664
-232029     1665
-231979     1667
-62805      1669
-62860      1670
-232103     1671
 62852      1674
-232013     1674
-62908      1676
-62862      1676
-62806      1678
           ...  
 232342    11388
 67724     11389
 232346    11425
 232341    11486
 232347    11501
 232196    11511
 67718     11512
 232343    11514
 232345    11523
 67716     11542
 70687     11546
 67717     11581
 67759     11584
 67715     11620
 232339    11643
 67719     11648
 232337    11673
 232364    11690
 62727     11694
 232338    11712
 232336    11751
 232340    11779
 232184    11843
 232352    12397
 67552     12449
 67744     12616
 62548     13047
 6

In [4]:
# Base IDs of the five test cases picked from the ranking above.
selected_ids = [232029, 231979, 62805, 62860, 232103]

# Every selected case is stored under both its positive and its negated ID
# (in the rows shown below, negative IDs carry bug == True and positive IDs
# bug == False), so both signs are kept.
ilp_test_data = buffer_overflow_juliet[
    buffer_overflow_juliet['testcase_ID'].isin(
        selected_ids + [-testcase_id for testcase_id in selected_ids]
    )
]

ilp_test_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
517,1501,1501,62805,000/062/805/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,36,False,2502
589,1573,1573,62860,000/062/860/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2495
15137,80988,80988,231979,000/231/979/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,34,False,1771
15196,81047,81047,232029,000/232/029/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,33,False,1809
15298,81149,81149,232103,000/232/103/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,33,False,1815
18667,106684,1501,-62805,000/062/805/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,36,True,1669
18739,106756,1573,-62860,000/062/860/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,True,1670
33287,186171,80988,-231979,000/231/979/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,34,True,1667
33346,186230,81047,-232029,000/232/029/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,33,True,1665
33448,186332,81149,-232103,000/232/103/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,33,True,1671


In [7]:
# Drop the two "Unnamed" columns (presumably indexes left over from earlier CSV round-trips).
ilp_test_data = ilp_test_data.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])

In [8]:
# Display the selection again to check that the "Unnamed" columns are gone.
ilp_test_data

Unnamed: 0,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
517,62805,000/062/805/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,36,False,2502
589,62860,000/062/860/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2495
15137,231979,000/231/979/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,34,False,1771
15196,232029,000/232/029/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,33,False,1809
15298,232103,000/232/103/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,33,False,1815
18667,-62805,000/062/805/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,36,True,1669
18739,-62860,000/062/860/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,True,1670
33287,-231979,000/231/979/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,34,True,1667
33346,-232029,000/232/029/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,33,True,1665
33448,-232103,000/232/103/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,33,True,1671


In [27]:
# Save the selected test cases. The DataFrame index is written too (to_csv
# default), so reading this file back without `index_col` yields an extra
# "Unnamed: 0" column.
ilp_test_data.to_csv("../data/ilp_test_data.csv.gz")

## Generating Prolog Representations

Now that we have a set of examples, we want to generate a Prolog representation of each one. To do this we use Joern to derive a code property graph from each source file. Then, using our `../code/ILP-joern-cfg-to-prolog.scala` script, we convert a subset of this graph into a set of Prolog facts.

In [9]:
import os
import subprocess
import tempfile

In [10]:
# Accumulators filled by generate_prolog: one entry per processed test case.
testcase_IDs, flaws, bugs, code_lengths, trees = [], [], [], [], []

In [13]:
def generate_prolog(testcase):
    """Run Joern on one test case and record its Prolog representation.

    Every row of ``testcase`` is written as a source file into a temporary
    directory, ``joern-parse`` builds a code property graph from that
    directory, and ``joern-query`` runs ``ILP-joern-cfg-to-prolog.scala`` on
    the graph.  The script's standard output is stored as the test case's tree.

    Args:
        testcase: DataFrame holding the rows of a single test case; the
            columns ``filename``, ``code``, ``testcase_ID``, ``flaw``, ``bug``
            and ``code_length`` are read.

    Returns:
        None.  Results are appended to the module-level lists
        ``testcase_IDs``, ``flaws``, ``bugs``, ``code_lengths`` and ``trees``.

    Raises:
        ValueError: if ``testcase`` has no rows.
        subprocess.CalledProcessError: if either Joern command exits non-zero.
    """
    files = list(testcase.itertuples())
    if not files:
        raise ValueError("generate_prolog() needs at least one source file")

    # The context manager removes the directory even when Joern fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for file in files:
            # Only the final path component is kept, so all files of the
            # test case end up side by side in the temporary directory.
            short_filename = file.filename.split("/")[-1]
            with open(os.path.join(tmp_dir, short_filename), 'w') as f:
                f.write(file.code)

        cpg_path = os.path.join(tmp_dir, "cpg.bin.zip")
        subprocess.check_call(["/joern/joern-parse", "--out", cpg_path, tmp_dir])

        # Argument list plus cwd replaces the former `cd /joern && ...` shell
        # string, so the temporary path is never interpreted by a shell.
        tree = subprocess.check_output(
            [
                "/joern/joern-query",
                "--cpg", cpg_path,
                "-f", "/project/code/ILP-joern-cfg-to-prolog.scala",
            ],
            cwd="/joern",
            universal_newlines=True,
        )

    # Metadata is taken from the last row of the group.
    last_file = files[-1]
    testcase_IDs.append(last_file.testcase_ID)
    flaws.append(last_file.flaw)
    bugs.append(last_file.bug)
    code_lengths.append(last_file.code_length)
    trees.append(tree)

In [14]:
# Called for its side effects only: generate_prolog returns nothing and
# appends one entry per call to testcase_IDs, flaws, bugs, code_lengths and trees.
ilp_test_data.groupby('testcase_ID').apply(generate_prolog)

In [15]:
prolog_test = pd.DataFrame({
    'testcase_ID': testcase_IDs,
    'flaw': flaws,
    'bug': bugs,
    'code_length': code_lengths,
    'tree': trees,
})
# Older pandas releases run the groupby.apply callback twice on the first
# group, which leaves a duplicate first entry in the accumulator lists.
# Dropping duplicates by test case ID (keeping the last occurrence) removes
# that entry when it exists and keeps every test case when it does not,
# unlike a fixed [1:11] slice.
prolog_test = prolog_test.drop_duplicates(subset='testcase_ID', keep='last')
prolog_test

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232103,CWE-122,True,1671,% START: Generated Prolog\n% NODE PROPERTIES \...
2,-232029,CWE-122,True,1665,% START: Generated Prolog\n% NODE PROPERTIES \...
3,-231979,CWE-122,True,1667,% START: Generated Prolog\n% NODE PROPERTIES \...
4,-62860,CWE-121,True,1670,% START: Generated Prolog\n% NODE PROPERTIES \...
5,-62805,CWE-121,True,1669,% START: Generated Prolog\n% NODE PROPERTIES \...
6,62805,CWE-121,False,2502,% START: Generated Prolog\n% NODE PROPERTIES \...
7,62860,CWE-121,False,2495,% START: Generated Prolog\n% NODE PROPERTIES \...
8,231979,CWE-122,False,1771,% START: Generated Prolog\n% NODE PROPERTIES \...
9,232029,CWE-122,False,1809,% START: Generated Prolog\n% NODE PROPERTIES \...
10,232103,CWE-122,False,1815,% START: Generated Prolog\n% NODE PROPERTIES \...


In [16]:
# print() renders the newlines of the generated Prolog text of the first test case.
print(prolog_test['tree'].iloc[0])

% START: Generated Prolog
% NODE PROPERTIES 
alloc(id_57).
sizeOf(id_66).
writeToPointer(id_69).
compMemberAccess(id_76).
assignment(id_81).
compMemberAccess(id_133).
compMemberAccess(id_134).
sizeOf(id_138).
sizeOf(id_139).
writeToPointer(id_145).
writeToPointer(id_146).
assignment(id_148).
alloc(id_163).
alloc(id_164).
assignment(id_168).
assignment(id_173).
assignment(id_174).
% METHOD 
pointer(id_7).
voidPointer(id_121).
pointer(id_123).
pointer(id_128).
pointer(id_132).
sizeOfInt(id_143).
array10(id_143).
pointer(id_144).
sizeOfInt(id_147).
array10(id_147).
sizeOfInt(id_149).
array10(id_149).
voidPointer(id_150).
voidPointer(id_156).
pointer(id_158).
pointer(id_167).
pointer(id_172).
pointer(id_175).
voidPointer(id_176).
voidPointer(id_177).
% CODE
source_code(id_1, "p2").
source_code(id_2, "p1").
source_code(id_3, "p2").
source_code(id_4, "p1").
source_code(id_5, "p1").
source_code(id_6, "p1").
source_code(id_7, "char * argv[]").
source_code(id_8, "int argc").
source_code(id_9, "

In [17]:
import re

In [18]:
def _node_id_prefix(testcase):
    """Return the ``bad_<id>_`` or ``good_<id>_`` prefix for a test case.

    ``bad`` is used when ``testcase.bug`` is truthy, ``good`` otherwise; the
    absolute value of ``testcase.testcase_ID`` supplies ``<id>``.
    """
    return '{bug}_{testcase_id}_'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )


def fix_single_rules(testcase):
    """Prefix the node id of every one-argument fact, e.g. ``alloc(id_57).``.

    Args:
        testcase: row with ``bug``, ``testcase_ID`` and ``tree`` entries.

    Returns:
        The ``tree`` text with ``(<id>).`` rewritten to ``(<prefix><id>).``.
    """
    find_node_ids = re.compile(r'\((\w+)\)\.')
    replacement_node_ids = r'({prefix}\1).'.format(prefix=_node_id_prefix(testcase))
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])


def fix_tree_rules(testcase):
    """Prefix both node ids of every two-argument fact ``(<id>, <id>).``.

    Args:
        testcase: row with ``bug``, ``testcase_ID`` and ``tree`` entries.

    Returns:
        The ``tree`` text with both ids of each matching fact prefixed.
    """
    find_node_ids = re.compile(r'\((\w+), (\w+)\)\.')
    replacement_node_ids = r'({prefix}\1, {prefix}\2).'.format(
        prefix=_node_id_prefix(testcase),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])


def fix_code_rules(testcase):
    """Prefix the node id of every fact of the form ``(<id>, "<text>").``.

    The quoted text is copied through unchanged.

    Args:
        testcase: row with ``bug``, ``testcase_ID`` and ``tree`` entries.

    Returns:
        The ``tree`` text with the id of each matching fact prefixed.
    """
    # NOTE(review): `.` does not match newlines, so a fact whose quoted text
    # spans several lines is left untouched -- confirm this is intended.
    find_node_ids = re.compile(r'\((\w+), "(.*)"\)\.')
    replacement_node_ids = r'({prefix}\1, "\2").'.format(
        prefix=_node_id_prefix(testcase),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

In [19]:
# Prefix node ids in one-argument facts first, then two-argument facts,
# then the quoted source_code facts; each pass reads the previous result.
for fix_rules in (fix_single_rules, fix_tree_rules, fix_code_rules):
    prolog_test['tree'] = prolog_test.apply(fix_rules, axis=1)

In [20]:
# Display the frame after the node ids in `tree` have been prefixed.
prolog_test

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232103,CWE-122,True,1671,% START: Generated Prolog\n% NODE PROPERTIES \...
2,-232029,CWE-122,True,1665,% START: Generated Prolog\n% NODE PROPERTIES \...
3,-231979,CWE-122,True,1667,% START: Generated Prolog\n% NODE PROPERTIES \...
4,-62860,CWE-121,True,1670,% START: Generated Prolog\n% NODE PROPERTIES \...
5,-62805,CWE-121,True,1669,% START: Generated Prolog\n% NODE PROPERTIES \...
6,62805,CWE-121,False,2502,% START: Generated Prolog\n% NODE PROPERTIES \...
7,62860,CWE-121,False,2495,% START: Generated Prolog\n% NODE PROPERTIES \...
8,231979,CWE-122,False,1771,% START: Generated Prolog\n% NODE PROPERTIES \...
9,232029,CWE-122,False,1809,% START: Generated Prolog\n% NODE PROPERTIES \...
10,232103,CWE-122,False,1815,% START: Generated Prolog\n% NODE PROPERTIES \...


In [21]:
def _partition_source_map(prolog_src):
    """Split Prolog text into (source-map lines, all other lines).

    The source-map section starts at a line equal to ``% CODE`` (that line is
    part of the section) and ends just before a line equal to ``% AST`` (that
    line belongs to the remainder).
    """
    section_lines = []
    other_lines = []
    in_source_code_section = False

    for line in prolog_src.split("\n"):
        if line == "% CODE":
            in_source_code_section = True
        elif line == "% AST":
            in_source_code_section = False

        if in_source_code_section:
            section_lines.append(line)
        else:
            other_lines.append(line)

    return section_lines, other_lines


def extract_source_map(prolog_src):
    """Return only the ``% CODE`` section of ``prolog_src``.

    Args:
        prolog_src: generated Prolog text.

    Returns:
        The lines from ``% CODE`` up to (not including) ``% AST``, joined
        with newlines; an empty string if there is no ``% CODE`` line.
    """
    section_lines, _ = _partition_source_map(prolog_src)
    return '\n'.join(section_lines)


def remove_source_map(prolog_src):
    """Return ``prolog_src`` without its ``% CODE`` section.

    Args:
        prolog_src: generated Prolog text.

    Returns:
        Every line outside the ``% CODE`` section (the ``% AST`` line is
        kept), joined with newlines.
    """
    _, other_lines = _partition_source_map(prolog_src)
    return '\n'.join(other_lines)

In [23]:
# Both new columns are derived from the same, still unsplit, tree text.
tree_with_source = prolog_test['tree']
prolog_test['source_map'] = tree_with_source.map(extract_source_map)
prolog_test['tree'] = tree_with_source.map(remove_source_map)

In [25]:
# Save the Prolog representations. The DataFrame index is written as the
# first, unnamed CSV column (to_csv default).
prolog_test.to_csv("../data/ilp_prolog_test_data.csv.gz")