<a href="https://colab.research.google.com/github/Ebenx007/compchem-Compsci-shared-rep/blob/main/2_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
from pathlib import Path
import sys
import os
import shutil
import tarfile
import zipfile
import subprocess
import pickle
import re
import glob

In [None]:
drive.mount('/content/drive/')

Mounted at /content/drive/



[**1.Data-Acquisition is here**](https://github.com/Ebenx007/compchem-Compsci-shared-rep/blob/main/1_data_acquisition.ipynb)


# 2.   PROCESS & CURATE DATA FOR USE IN MODELS

---



*   GENERATE GRAPHS FROM COMPILED CODE & BINARIES.
---
---


 >  * 
1.   Use pycparser to extract ASTs from C code and serialize the C AST graphs
2.   Serialize to JSON files
3.   Use angr to extract CFGs from C binaries
4.   Archive processed data for use in vector generation for ML models 



    Store JSON graphs in Google Drive for use in generating vectors for the ML models.


**3.1 Using pycparser and Json serializing script to generate serialized ASTs from compilable Competition C code submissions** 
----

> *   pycparser and Json serializing script for single file C programs submitted in programming competition 

In [None]:
%%writefile c_json.py

# Original script from:
#-----------------------------------------------------------------
# pycparser: serialize_ast.py
#
# Simple example of serializing AST
#
# Hart Chu [https://github.com/CtheSky]
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
## Adjusted for use here by Valentine Eben.

import json
import sys
import re



from pycparser import parse_file, c_ast
from pycparser.plyparser import Coord


RE_CHILD_ARRAY = re.compile(r'(.*)\[(.*)\]')
RE_INTERNAL_ATTR = re.compile('__.*__')


class CJsonError(Exception):
    """Raised when the AST cannot be converted to its dict/JSON form."""


def memodict(fn):
    """Memoize a single-argument function with a dict-backed cache.

    The returned callable is the cache's bound ``__getitem__``: a repeated
    call with the same (hashable) argument is a plain dict lookup, and
    ``fn`` only runs the first time a given argument is seen.
    """
    class _Memo(dict):
        def __missing__(self, arg):
            # dict.__getitem__ calls this on a miss: compute, store, return.
            computed = fn(arg)
            self[arg] = computed
            return computed

    cache = _Memo()
    return cache.__getitem__


@memodict
def child_attrs_of(klass):
    """
    Return the set of child attribute names of a Node class.

    These are the entries of ``klass.__slots__`` that neither look like
    dunder names (``__x__``) nor appear in ``klass.attr_names``.
    The result is memoized per class by ``memodict``.
    """
    scalar_attrs = set(klass.attr_names)
    slot_attrs = {
        slot for slot in klass.__slots__
        if RE_INTERNAL_ATTR.match(slot) is None
    }
    return slot_attrs - scalar_attrs


def to_dict(node):
    """Recursively convert an AST node into a plain dict.

    The dict holds the node's class name under ``'_nodetype'``, every name
    in the class's ``attr_names``, ``'coord'`` (as a string or None) and one
    entry per child. Children named like ``name[i]`` are gathered into a
    list under ``name``; child attributes with no value are set to None.

    Raises:
        CJsonError: if indexed children do not arrive in order 0, 1, 2, ...
    """
    node_cls = node.__class__

    # Metadata first, so the node type can be recovered when deserializing.
    result = {'_nodetype': node_cls.__name__}

    # Scalar (non-child) attributes of the node.
    for attr_name in node_cls.attr_names:
        result[attr_name] = getattr(node, attr_name)

    # Coord objects are stored in their string form.
    result['coord'] = str(node.coord) if node.coord else None

    for child_name, child in node.children():
        # Child names are either simple ('value') or indexed ('block_items[1]').
        array_match = RE_CHILD_ARRAY.match(child_name)
        if array_match is None:
            result[child_name] = to_dict(child)
            continue

        array_name, raw_index = array_match.groups()
        index = int(raw_index)
        siblings = result.setdefault(array_name, [])
        # Indexed children must come in order, so the next index always
        # equals the number of siblings collected so far.
        if index != len(siblings):
            raise CJsonError('Internal ast error. Array {} out of order. '
                'Expected index {}, got {}'.format(
                array_name, len(siblings), index))
        siblings.append(to_dict(child))

    # Child attributes that never showed up still need an explicit None.
    for absent_attr in child_attrs_of(node_cls):
        result.setdefault(absent_attr, None)

    return result


def to_json(node, **kwargs):
    """Serialize an AST node to a JSON string.

    Keyword arguments are forwarded unchanged to ``json.dumps``.
    """
    as_dict = to_dict(node)
    return json.dumps(as_dict, **kwargs)


def file_to_dict(filename):
    """Parse a C source file and return the dict form of its AST.

    The file is preprocessed with ``gcc -E`` before parsing.
    """
    # The -I path points at pycparser's fake libc headers and is a relative
    # path, so it depends on the directory the script is run from.
    preprocessor_args = ['-E', r'-Iutils/fake_libc_include']
    tree = parse_file(
        filename,
        use_cpp=True,
        cpp_path='gcc',
        cpp_args=preprocessor_args,
    )
    return to_dict(tree)


def file_to_json(filename, **kwargs):
    """Parse a C source file and return its AST as a JSON string.

    The file is preprocessed with ``gcc -E`` before parsing; keyword
    arguments are passed through to ``to_json``.
    """
    # The -I path points at pycparser's fake libc headers and is a relative
    # path, so it depends on the directory the script is run from.
    preprocessor_args = ['-E', r'-Iutils/fake_libc_include']
    tree = parse_file(
        filename,
        use_cpp=True,
        cpp_path='gcc',
        cpp_args=preprocessor_args,
    )
    return to_json(tree, **kwargs)


def _parse_coord(coord_str):
    """Turn a ``file:line[:column]`` string back into a Coord (None stays None).

    Missing trailing fields are passed to ``Coord`` as None; line and column
    are passed through as the strings found in ``coord_str``.
    """
    if coord_str is None:
        return None

    # Pad with None so the unpacking works when fewer than three
    # colon-separated fields are present.
    fields = coord_str.split(':') + [None] * 3
    filename, line, column = fields[:3]
    return Coord(filename, line, column)


def _convert_to_obj(value):
    """
    Convert a value from the dict representation back into AST form.

    Dicts become AST nodes (via ``from_dict``), lists are converted item by
    item, and anything else (strings, None, ...) is returned unchanged.
    ``isinstance`` is used so that dict/list subclasses such as
    ``collections.OrderedDict`` are converted too.
    Note: Mutually recursive with from_dict.
    """
    if isinstance(value, dict):
        return from_dict(value)
    if isinstance(value, list):
        return [_convert_to_obj(item) for item in value]
    # Scalar leaf value: keep as is.
    return value


def from_dict(node_dict):
    """Recursively build an AST node from its dict representation.

    ``node_dict`` must contain ``'_nodetype'`` naming a class in ``c_ast``;
    every other key is passed to that class's constructor as a keyword
    argument. The input dict is left unmodified, so the same dict can be
    deserialized more than once.

    Raises:
        KeyError: if ``'_nodetype'`` is missing.
        AttributeError: if ``c_ast`` has no class with that name.
    """
    # Read the type without pop(): popping would strip '_nodetype' from the
    # caller's dict (and from every nested dict) as a side effect.
    node_cls = getattr(c_ast, node_dict['_nodetype'])

    # Collect the keyword arguments for the node constructor.
    ctor_kwargs = {}
    for key, value in node_dict.items():
        if key == '_nodetype':
            continue
        if key == 'coord':
            ctor_kwargs[key] = _parse_coord(value)
        else:
            ctor_kwargs[key] = _convert_to_obj(value)

    # Node initializers accept all of their fields as keyword parameters.
    return node_cls(**ctor_kwargs)


def from_json(ast_json):
    """Rebuild an AST from the JSON string produced by ``to_json``."""
    decoded = json.loads(ast_json)
    return from_dict(decoded)


#------------------------------------------------------------------------------
if __name__ == "__main__":
    # Usage: python3 c_json.py <source.c>
    # Round-trips C -> ast -> dict -> ast -> json, prints the JSON and also
    # writes it next to the input file with a ".json" extension.
    if len(sys.argv) > 1:
        import os.path  # only needed by this entry point

        source_path = sys.argv[1]
        ast_dict = file_to_dict(source_path)
        ast = from_dict(ast_dict)
        json_object = to_json(ast, sort_keys=True, indent=4)
        print(json_object)
        # Replace whatever extension the input has (normally ".c") instead
        # of blindly chopping the last two characters of the path.
        output_path = os.path.splitext(source_path)[0] + ".json"
        with open(output_path, "w") as outfile:
            outfile.write(json_object)
    else:
        print("Please provide a filename as argument")

Writing c_json.py


In [None]:
!ls

C			       juliet_dataset_CWE_testcases_paths_ls_file
c_compiled_ls_file	       juliet_dataset_ls_file
c_json.py		       Juliet_Test_Suite_v1.3_for_C_Cpp.zip
cpp_compiled_ls_file	       ProgramData
decodable_submisisons_ls_file  programs.tar.gz
drive			       sample_data
juliet_dataset_CWE_ls_file     submisisons_ls_file


In [None]:
#Test AST generating script 
!python3 c_json.py /content/ProgramData/1/1076.c

In [None]:
#Check that the json AST (1076.json) was created in addition to the stdout pretty-print 
!ls ./ProgramData/1/

In [None]:
!ls

C			       juliet_dataset_CWE_testcases_paths_ls_file
c_compiled_ls_file	       juliet_dataset_ls_file
c_json.py		       Juliet_Test_Suite_v1.3_for_C_Cpp.zip
cpp_compiled_ls_file	       ProgramData
decodable_submisisons_ls_file  programs.tar.gz
drive			       sample_data
juliet_dataset_CWE_ls_file     submisisons_ls_file


In [None]:
#Generate the rest of the json ASTs from the Programming Competition submissions 
# NOTE(review): compiled_c_code is not defined anywhere in this notebook; it
# presumably has to be restored from an earlier stage before this cell runs - confirm.
failed_c_sources = []
for c_source in compiled_c_code:
    completed = subprocess.run(["python3", "c_json.py", c_source])
    # An uncaught exception in c_json.py gives a non-zero exit status, so
    # remember those files instead of silently ignoring the failure.
    if completed.returncode != 0:
        failed_c_sources.append(c_source)
print('Done generating json ASTs. Verify with ls cmd')  
if failed_c_sources:
    print('{} file(s) failed to convert; see failed_c_sources'.format(len(failed_c_sources)))

Done generating json ASTs.  Use list files to verify success


In [None]:
#Verify success of json generating script
paths = Path('./ProgramData').glob('**/*.json')
# glob yields Path objects; the list stores their plain string form
programming_competition_c_ast_jsons = [str(json_path) for json_path in paths]
print("Generated {} jsons of C ASTs from the programming submissions".format(len(programming_competition_c_ast_jsons)))
print(programming_competition_c_ast_jsons)
print('\n')

In [None]:
#check for json ASTs in  ProgramData subdirectories e.g. ProgramData/1/
!ls ./ProgramData/1/ 

In [None]:
# Pickle the list of JSON AST paths to a file in the current directory
with open('programming_competition_c_ast_jsons_ls_file', 'wb') as pickle_out:
    pickle.dump(programming_competition_c_ast_jsons, pickle_out)

**3.2 Using pycparser and Json serializing script to generate serialized ASTs from juliet C code**
---- 

In [None]:
##################### copy partially processed files for further processing ########### 
!cp "/content/drive/My Drive/2020/Fall2020/individual_lab_members/ebenx007/phase1_processed_juliet_dataset.zip" . 

In [None]:
!ls

C  drive  phase1_processed_juliet_dataset.zip  sample_data


In [None]:
# Unpack the partially processed Juliet archive into the current directory
with zipfile.ZipFile('phase1_processed_juliet_dataset.zip', 'r') as juliet_archive:
    juliet_archive.extractall()
    print('Done Extracting processed juliet dataset')

Done Extracting processed juliet dataset


In [None]:
!ls

C  drive  phase1_processed_juliet_dataset.zip  sample_data


In [None]:
#cd to the directory where the files were saved before being archived
%cd C

/content/C


In [None]:
#loading pickled data back CWE_makefile_dir
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load a pickle that was produced by a trusted earlier stage.
CWE_makefile_dir = []
with open("CWE_makefile_dir_ls_file", "rb") as fld:
  CWE_makefile_dir = pickle.load(fld)

In [None]:
#change back to home dir
%cd ..

/content


In [None]:
print(len(CWE_makefile_dir))
print(CWE_makefile_dir)

> *   pycparser and Json serializing script for multi file programs 

In [None]:
#Start by cloning pycparser to ease access to pycparser's fake headers, needed for AST generation, instead of trying to access those that come with the pycparser package. Eli's idea and it works  
!git clone  https://github.com/eliben/pycparser.git 


Cloning into 'pycparser'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 2548 (delta 25), reused 38 (delta 13), pack-reused 2489[K
Receiving objects: 100% (2548/2548), 1.15 MiB | 2.90 MiB/s, done.
Resolving deltas: 100% (1702/1702), done.


In [None]:
%%writefile multi_file_c_json.py

# Original script from:
#-----------------------------------------------------------------
# pycparser: serialize_ast.py
#
# Simple example of serializing AST
#
# Hart Chu [https://github.com/CtheSky]
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
## Adjusted for use here by Valentine Eben. Note the addition of the include paths for this specific Juliet dataset ('-IC/testcasesupport/') and for pycparser's fake libc headers.

import json
import sys
import re


from pycparser import parse_file, c_ast
from pycparser.plyparser import Coord


RE_CHILD_ARRAY = re.compile(r'(.*)\[(.*)\]')
RE_INTERNAL_ATTR = re.compile('__.*__')


class CJsonError(Exception):
    """Raised when the AST cannot be converted to its dict/JSON form."""


def memodict(fn):
    """Memoize a single-argument function with a dict-backed cache.

    The returned callable is the cache's bound ``__getitem__``: a repeated
    call with the same (hashable) argument is a plain dict lookup, and
    ``fn`` only runs the first time a given argument is seen.
    """
    class _Memo(dict):
        def __missing__(self, arg):
            # dict.__getitem__ calls this on a miss: compute, store, return.
            computed = fn(arg)
            self[arg] = computed
            return computed

    cache = _Memo()
    return cache.__getitem__


@memodict
def child_attrs_of(klass):
    """
    Return the set of child attribute names of a Node class.

    These are the entries of ``klass.__slots__`` that neither look like
    dunder names (``__x__``) nor appear in ``klass.attr_names``.
    The result is memoized per class by ``memodict``.
    """
    scalar_attrs = set(klass.attr_names)
    slot_attrs = {
        slot for slot in klass.__slots__
        if RE_INTERNAL_ATTR.match(slot) is None
    }
    return slot_attrs - scalar_attrs


def to_dict(node):
    """Recursively convert an AST node into a plain dict.

    The dict holds the node's class name under ``'_nodetype'``, every name
    in the class's ``attr_names``, ``'coord'`` (as a string or None) and one
    entry per child. Children named like ``name[i]`` are gathered into a
    list under ``name``; child attributes with no value are set to None.

    Raises:
        CJsonError: if indexed children do not arrive in order 0, 1, 2, ...
    """
    node_cls = node.__class__

    # Metadata first, so the node type can be recovered when deserializing.
    result = {'_nodetype': node_cls.__name__}

    # Scalar (non-child) attributes of the node.
    for attr_name in node_cls.attr_names:
        result[attr_name] = getattr(node, attr_name)

    # Coord objects are stored in their string form.
    result['coord'] = str(node.coord) if node.coord else None

    for child_name, child in node.children():
        # Child names are either simple ('value') or indexed ('block_items[1]').
        array_match = RE_CHILD_ARRAY.match(child_name)
        if array_match is None:
            result[child_name] = to_dict(child)
            continue

        array_name, raw_index = array_match.groups()
        index = int(raw_index)
        siblings = result.setdefault(array_name, [])
        # Indexed children must come in order, so the next index always
        # equals the number of siblings collected so far.
        if index != len(siblings):
            raise CJsonError('Internal ast error. Array {} out of order. '
                'Expected index {}, got {}'.format(
                array_name, len(siblings), index))
        siblings.append(to_dict(child))

    # Child attributes that never showed up still need an explicit None.
    for absent_attr in child_attrs_of(node_cls):
        result.setdefault(absent_attr, None)

    return result


def to_json(node, **kwargs):
    """Serialize an AST node to a JSON string.

    Keyword arguments are forwarded unchanged to ``json.dumps``.
    """
    as_dict = to_dict(node)
    return json.dumps(as_dict, **kwargs)


def file_to_dict(filename):
    """Parse a C source file and return the dict form of its AST.

    The file is preprocessed with ``gcc -E`` before parsing.
    """
    # -nostdinc keeps gcc away from the real system headers; the two -I
    # paths (Juliet support headers and pycparser's fake libc headers) are
    # relative, so they depend on the directory the script is run from.
    preprocessor_args = [
        '-nostdinc',
        '-E',
        '-IC/testcasesupport/',
        r'-Ipycparser/utils/fake_libc_include',
    ]
    tree = parse_file(
        filename,
        use_cpp=True,
        cpp_path='gcc',
        cpp_args=preprocessor_args,
    )
    return to_dict(tree)


def file_to_json(filename, **kwargs):
    """Parse a C source file and return its AST as a JSON string.

    The file is preprocessed with ``gcc -E`` before parsing; keyword
    arguments are passed through to ``to_json``.
    """
    # -nostdinc keeps gcc away from the real system headers; the two -I
    # paths (Juliet support headers and pycparser's fake libc headers) are
    # relative, so they depend on the directory the script is run from.
    preprocessor_args = [
        '-nostdinc',
        '-E',
        '-IC/testcasesupport/',
        r'-Ipycparser/utils/fake_libc_include',
    ]
    tree = parse_file(
        filename,
        use_cpp=True,
        cpp_path='gcc',
        cpp_args=preprocessor_args,
    )
    return to_json(tree, **kwargs)


def _parse_coord(coord_str):
    """Turn a ``file:line[:column]`` string back into a Coord (None stays None).

    Missing trailing fields are passed to ``Coord`` as None; line and column
    are passed through as the strings found in ``coord_str``.
    """
    if coord_str is None:
        return None

    # Pad with None so the unpacking works when fewer than three
    # colon-separated fields are present.
    fields = coord_str.split(':') + [None] * 3
    filename, line, column = fields[:3]
    return Coord(filename, line, column)


def _convert_to_obj(value):
    """
    Convert a value from the dict representation back into AST form.

    Dicts become AST nodes (via ``from_dict``), lists are converted item by
    item, and anything else (strings, None, ...) is returned unchanged.
    ``isinstance`` is used so that dict/list subclasses such as
    ``collections.OrderedDict`` are converted too.
    Note: Mutually recursive with from_dict.
    """
    if isinstance(value, dict):
        return from_dict(value)
    if isinstance(value, list):
        return [_convert_to_obj(item) for item in value]
    # Scalar leaf value: keep as is.
    return value


def from_dict(node_dict):
    """Recursively build an AST node from its dict representation.

    ``node_dict`` must contain ``'_nodetype'`` naming a class in ``c_ast``;
    every other key is passed to that class's constructor as a keyword
    argument. The input dict is left unmodified, so the same dict can be
    deserialized more than once.

    Raises:
        KeyError: if ``'_nodetype'`` is missing.
        AttributeError: if ``c_ast`` has no class with that name.
    """
    # Read the type without pop(): popping would strip '_nodetype' from the
    # caller's dict (and from every nested dict) as a side effect.
    node_cls = getattr(c_ast, node_dict['_nodetype'])

    # Collect the keyword arguments for the node constructor.
    ctor_kwargs = {}
    for key, value in node_dict.items():
        if key == '_nodetype':
            continue
        if key == 'coord':
            ctor_kwargs[key] = _parse_coord(value)
        else:
            ctor_kwargs[key] = _convert_to_obj(value)

    # Node initializers accept all of their fields as keyword parameters.
    return node_cls(**ctor_kwargs)


def from_json(ast_json):
    """Rebuild an AST from the JSON string produced by ``to_json``."""
    decoded = json.loads(ast_json)
    return from_dict(decoded)


#------------------------------------------------------------------------------
if __name__ == "__main__":
    # Usage: python3 multi_file_c_json.py <source.c>
    # Round-trips C -> ast -> dict -> ast -> json, prints the JSON and also
    # writes it next to the input file with a ".json" extension.
    if len(sys.argv) > 1:
        import os.path  # only needed by this entry point

        source_path = sys.argv[1]
        ast_dict = file_to_dict(source_path)
        ast = from_dict(ast_dict)
        json_object = to_json(ast, sort_keys=True, indent=4)
        print(json_object)
        # Replace whatever extension the input has (normally ".c") instead
        # of blindly chopping the last two characters of the path.
        output_path = os.path.splitext(source_path)[0] + ".json"
        with open(output_path, "w") as outfile:
            outfile.write(json_object)
    else:
        print("Please provide a filename as argument")


Writing multi_file_c_json.py


In [None]:
#check script was created
!ls

C      multi_file_c_json.py		    pycparser
drive  phase1_processed_juliet_dataset.zip  sample_data


In [None]:
#Test the multiple-file C program JSON generating script
!python3 multi_file_c_json.py C/testcases/CWE675_Duplicate_Operations_on_Resource/CWE675_Duplicate_Operations_on_Resource__fopen_52a.c

In [None]:
#cd to check creation of json AST (C/testcases/CWE675_Duplicate_Operations_on_Resource/CWE675_Duplicate_Operations_on_Resource__fopen_52a.json )
%cd C/testcases/CWE675_Duplicate_Operations_on_Resource/

/content/C/testcases/CWE675_Duplicate_Operations_on_Resource


In [None]:
!ls

In [None]:
#back to home dir from which path begins
%cd ..

/content


In [None]:
#Create list of all compiled juliet code along with c and cpp compiled code for easy parsing with appropriate tools


def collect_compilable_juliet(testcases_root):
    """Find compiled Juliet object files and their C / C++ sources.

    Searches ``testcases_root`` recursively for ``*.o`` files whose file
    name contains ``CWE``. For each one, a sibling file with the same stem
    and a ``.c`` or ``.cpp`` suffix is recorded when it exists on disk.

    Returns:
        A tuple ``(object_files, c_sources, cpp_sources)`` of lists of path
        strings, ordered by the sorted object-file paths.
    """
    object_files, c_sources, cpp_sources = [], [], []
    # sorted() makes the lists reproducible; glob order depends on the filesystem
    for obj_path in sorted(Path(testcases_root).glob('**/*.o')):
        # Only the file name is checked, not the directories above it
        if 'CWE' not in obj_path.name:
            continue
        object_files.append(str(obj_path))
        c_path = obj_path.with_suffix('.c')
        if c_path.exists():
            c_sources.append(str(c_path))
        cpp_path = obj_path.with_suffix('.cpp')
        if cpp_path.exists():
            cpp_sources.append(str(cpp_path))
    return object_files, c_sources, cpp_sources


compilable_juliet_code, compilable_juliet_c_code, compilable_juliet_cpp_code = (
    collect_compilable_juliet('./C/testcases'))

# Show one example per list, but do not crash when a list is empty
print("Total of {} compilable code:".format(len(compilable_juliet_code)))
print(compilable_juliet_code[0] if compilable_juliet_code else '(none found)')
print("\n{} of the compilable Juliet code is C:".format(len(compilable_juliet_c_code)))
print(compilable_juliet_c_code[0] if compilable_juliet_c_code else '(none found)')
print("\n{} of the compilable Juliet code is C++:".format(len(compilable_juliet_cpp_code)))
print(compilable_juliet_cpp_code[0] if compilable_juliet_cpp_code else '(none found)')

# Sanity check computed from the actual lists instead of hard-coded counts
juliet_source_total = len(compilable_juliet_c_code) + len(compilable_juliet_cpp_code)
juliet_match = 'matching' if juliet_source_total == len(compilable_juliet_code) else 'NOT matching'
print('\n{} + {} = {} {} the total compilable code in the Juliet dataset'.format(
    len(compilable_juliet_c_code), len(compilable_juliet_cpp_code),
    juliet_source_total, juliet_match))
print('\n')

In [None]:
#Generate the rest of the json ASTs from the Juliet Dataset 
failed_juliet_sources = []
for juliet_source in compilable_juliet_c_code:
    completed = subprocess.run(["python3", "multi_file_c_json.py", juliet_source])
    # An uncaught exception in multi_file_c_json.py gives a non-zero exit
    # status, so remember those files instead of silently ignoring the failure.
    if completed.returncode != 0:
        failed_juliet_sources.append(juliet_source)
print('Done generating Juliet C json ASTs.  Verify with ls cmd')  
if failed_juliet_sources:
    print('{} file(s) failed to convert; see failed_juliet_sources'.format(len(failed_juliet_sources)))

Done generating Juliet C json ASTs.  Verify with ls cmd


In [None]:
#cd to testcases and use ls to verify the presence of the json ASTs
!ls

C      multi_file_c_json.py		    pycparser
drive  phase1_processed_juliet_dataset.zip  sample_data


In [None]:
%cd C/testcases/

/content/C/testcases


In [None]:
!ls

In [None]:
print(CWE_makefile_dir)

In [None]:
%cd CWE484_Omitted_Break_Statement_in_Switch/

In [None]:
!ls

In [None]:
%cd ..

/content/C/testcases


In [None]:
!ls

In [None]:
%cd CWE124_Buffer_Underwrite/

/content/C/testcases/CWE124_Buffer_Underwrite


In [None]:
!ls

s01  s02  s03  s04


In [None]:
%cd s01/

/content/C/testcases/CWE124_Buffer_Underwrite/s01


In [None]:
!ls

In [None]:
%cd ..

/content


In [None]:
#create a list of the Juliet C Json ASTs
juliet_json_paths = Path('./C/testcases').glob('**/*.json')
# glob yields Path objects; the list stores their plain string form
juliet_c_ast_jsons = [str(json_path) for json_path in juliet_json_paths]
# These are the Juliet ASTs, not the programming-competition ones
print("{} jsons of C ASTs from the Juliet test cases:".format(len(juliet_c_ast_jsons)))
# Show one example path, but do not crash when nothing was generated
if juliet_c_ast_jsons:
    print(juliet_c_ast_jsons[0])
print('\n')   

In [None]:
# Pickle the list of compiled Juliet object-file paths
with open('compilable_juliet_code_ls_file', 'wb') as pickle_out:
    pickle.dump(compilable_juliet_code, pickle_out)

In [None]:
# Pickle the list of Juliet C source paths
with open('compilable_juliet_c_code_ls_file', 'wb') as pickle_out:
    pickle.dump(compilable_juliet_c_code, pickle_out)

In [None]:
# Pickle the list of Juliet C++ source paths
with open('compilable_juliet_cpp_code_ls_file', 'wb') as pickle_out:
    pickle.dump(compilable_juliet_cpp_code, pickle_out)

In [None]:
# Pickle the list of Juliet JSON AST paths
with open('juliet_c_ast_jsons_ls_file', 'wb') as pickle_out:
    pickle.dump(juliet_c_ast_jsons, pickle_out)

In [None]:
!ls

C				    juliet_c_ast_jsons_ls_file
compilable_juliet_c_code_ls_file    multi_file_c_json.py
compilable_juliet_code_ls_file	    phase1_processed_juliet_dataset.zip
compilable_juliet_cpp_code_ls_file  pycparser
drive				    sample_data


    Curating the compiled files and metadata for further processing and use in  building models     

**3.3 Curating processed Programming Competition submissions** 
----

In [None]:
#copy pickled metadata files into the folder for compression and curating for PHASE 2 of the Big Data Cleaning Pipeline
%cp decodable_submisisons_ls_file submisisons_ls_file cpp_compiled_ls_file c_compiled_ls_file programming_competition_c_ast_jsons_ls_file c_json.py ProgramData/

In [None]:
#Creating tar archive of processed Programming Competition dataset and metadata 
shutil.make_archive('phase1_programing_competition','tar','/content/','ProgramData')

'/content/phase1_programing_competition.tar'

In [None]:
#Verify tarball of processed files
check_processed_archive = []
with tarfile.open('phase1_programing_competition.tar', 'r') as submissions:
    for entry in submissions:
        # A '/' in the name means the directory sits below the archive root,
        # so the root directory itself is not printed
        if entry.isdir() and '/' in entry.name:
            print(entry.name)
        if entry.isfile():
            check_processed_archive.append(entry.name)
print("\n Number of files in tarball for export {0}".format(len(check_processed_archive)))

In [None]:
!cp phase1_programing_competition.tar "/content/drive/My Drive/2020/Fall2020/individual_lab_members/ebenx007/"

**3.4 Curating processed juliet source code** 
----

In [None]:
!ls

C				    juliet_c_ast_jsons_ls_file
compilable_juliet_c_code_ls_file    multi_file_c_json.py
compilable_juliet_code_ls_file	    phase1_processed_juliet_dataset.zip
compilable_juliet_cpp_code_ls_file  pycparser
drive				    sample_data


In [None]:
#Copy of files processed on Dec 3, 2020 
%cp -r compilable_juliet_code_ls_file compilable_juliet_c_code_ls_file compilable_juliet_cpp_code_ls_file juliet_c_ast_jsons_ls_file multi_file_c_json.py pycparser/  /content/C/

In [None]:
#NOT DONE move pickled metadata file into folders for compression and curating for PHASE 2 of Big Data Cleaning Pipelne NOT DONE
%cp juliet_dataset_CWE_testcases_paths_ls_file juliet_dataset_CWE_ls_file juliet_dataset_ls_file CWE_makefile_dir_ls_file /content/C/

In [None]:
#zipping processed juliet dataset and metadata
shutil.make_archive('curated_processed_juliet_dataset','zip','/content/','C')

'/content/curated_processed_juliet_dataset.zip'

In [None]:
#Verify zipped archive of processed juliet dataset
processed_juliet_dataset_ls = []
with zipfile.ZipFile('curated_processed_juliet_dataset.zip', 'r') as jz:
    for member in jz.namelist():
        # More than two '/' means the entry sits inside a testcases
        # sub-directory; shallower entries (root files/dirs) are ignored
        if member.count('/') <= 2:
            continue
        if member.endswith('/'):
            # Directory entry: just list it
            print(member)
        else:
            # File entry: counted exactly once (the previous unconditional
            # append counted these twice and also counted every other member)
            processed_juliet_dataset_ls.append(member)
print("Number of processed Juliet C/C++ test files and metadata {0}".format(len(processed_juliet_dataset_ls)))

In [None]:
!cp curated_processed_juliet_dataset.zip "/content/drive/My Drive/2020/Fall2020/individual_lab_members/ebenx007/"