This notebook shows an example of extracting and plotting an AST from a source code.

In this notebook some functions are written with some help from the following documentations as well as tutorials: 

Clang documentation:

https://libclang.readthedocs.io/en/latest/index.html

An example of parsing C ++ code using libclang in Python:

https://sudonull.com/post/907-An-example-of-parsing-C-code-using-libclang-in-Python

Clang tutorial:

https://jonasdevlieghere.com/understanding-the-clang-ast/#cursors

In [36]:
import clang.cindex
import pandas as pd
import logging

import graphviz

logging.basicConfig(format='[%(levelname)s@%(name)s] %(message)s', level=logging.DEBUG)

graphviz.__version__

'0.16'

In [37]:
vdisc = pd.read_csv("./Dataset/vdisc_buffer_examples.csv.gz")

In [38]:
vdisc.head()

Unnamed: 0.1,Unnamed: 0,testCase_ID,filename,code,bug,type
0,0,0,0_vdisc.c,"create_NUBspline_1d_z (NUgrid* x_grid, BCtype_...",True,CWE_120
1,1,1,1_vdisc.c,m92_startup(void)\r\n{\r\n\tunsigned char *RAM...,True,CWE_120
2,2,2,2_vdisc.c,"chirp_multi_lookup(const char *volume, const c...",True,CWE_120
3,3,3,3_vdisc.c,"_parse(VisuConfigFileEntry *entry, gchar **tok...",True,CWE_120
4,4,4,4_vdisc.c,CDE_create_path_symlink_dirs() {\n char *p;\n...,True,CWE_120


In [39]:
vdisc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95320 entries, 0 to 95319
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   95320 non-null  int64 
 1   testCase_ID  95320 non-null  int64 
 2   filename     95320 non-null  object
 3   code         95320 non-null  object
 4   bug          95320 non-null  bool  
 5   type         95320 non-null  object
dtypes: bool(1), int64(2), object(3)
memory usage: 3.7+ MB


In [78]:
vdisc_example = vdisc.iloc[95300]

In [79]:
print(vdisc_example.code)

GetFileType(gpointer handle)
{
	WapiHandleType type;

	if (!_WAPI_PRIVATE_HAVE_SLOT (handle)) {
		SetLastError (ERROR_INVALID_HANDLE);
		return(FILE_TYPE_UNKNOWN);
	}

	type = _wapi_handle_type (handle);
	
	if (io_ops[type].getfiletype == NULL) {
		SetLastError (ERROR_INVALID_HANDLE);
		return(FILE_TYPE_UNKNOWN);
	}
	
	return(io_ops[type].getfiletype ());
}


Parsing Source code and extracting AST using Clang 

In [24]:
index = clang.cindex.Index.create()
translation_unit = index.parse(path=vdisc_example.filename, unsaved_files=[(vdisc_example.filename, vdisc_example.code)])
ast_root = translation_unit.cursor

To see the tokens

In [25]:
for i in translation_unit.get_tokens(extent=translation_unit.cursor.extent):
    print (i.kind)

TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.KEYWORD
TokenKind.PUNCTUATION
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.PUNCTUATION
TokenKind.LITERAL
TokenKind.PUNCTUATION
TokenKind.LITERAL
TokenKind.PUNCTUATION
TokenKind.PUNCTUATION
TokenKind.KEYWORD
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.KEYWORD
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.PUNCTUATION
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.PUNCTUATION
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.IDENTIFIER
TokenKind.PUNCTUATION
TokenKind.LITERAL
TokenKind.PUNCTUATION
TokenKind.PUNCTUATION
TokenKind.LITERAL
TokenKind.PUNCTUATION
TokenKi

# Print AST

In [26]:
def print_ast(cursor, deep=0):
    print(' '.join((deep*'    ', str(cursor.kind), str(cursor.spelling))))
    for child in cursor.get_children():
        print_ast(child, deep+1)

print_ast(ast_root)

 CursorKind.TRANSLATION_UNIT 10_vdisc.c
     CursorKind.FUNCTION_DECL test_date_rfc2822__format_rfc2822_positive_offset
         CursorKind.COMPOUND_STMT 
             CursorKind.DECL_STMT 
                 CursorKind.VAR_DECL buf


# Visualizing AST'S

In [27]:
def get_ast(example):
    index = clang.cindex.Index.create()
    translation_unit = index.parse(path=example.filename, unsaved_files=[(example.filename, example.code)])
    ast_root = translation_unit.cursor
    save_ast(ast_root)
    numbering_ast_nodes(ast_root)

    return ast_root

In [28]:
def save_ast(node):
   
    node.children = list(node.get_children())

    for child in node.children:
        counter = save_ast(child)

In [29]:
def numbering_ast_nodes(node, counter=1):
  
    node.identifier = counter
    counter += 1

    node.children = list(node.get_children())
    for child in node.children:
        counter = numbering_ast_nodes(child, counter)

    return counter

In [30]:
def generate_edgelist(ast_root):
 
    edges = []

    def walk_tree_and_add_edges(node):
        for child in node.children:
            edges.append([node.identifier, child.identifier])
            walk_tree_and_add_edges(child)

    walk_tree_and_add_edges(ast_root)

    return edges

In [31]:
def generate_features(ast_root):
    features = {}

    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree

        features[node.identifier] = [degree, str(node.kind), node.displayname]

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return features

In [32]:
def ast_to_dot(ast_root):
    ast_edgelist = '\n                '.join([
        make_dot_edge((str(start), str(end)))
        for start, end in generate_edgelist(ast_root)
    ])
    node_labels = '\n                '.join([
        str(node_id)
        + ' [label="' + str(node_features[1].replace("CursorKind.", "")) + " "+ str(node_features[2]) + '"]'
        for node_id, node_features in generate_features(ast_root).items()
    ])
    return """
        digraph g {
            {  # NODE LABELS
                
                """ + node_labels + """
            }
            {  # Edges
               
                """ + ast_edgelist + """
            }
        }
    """

In [33]:
def make_dot_edge(edge):
    start, end = edge
    return start + " -> " + end

In [34]:
vdisc_ast_example = get_ast(vdisc_example)

In [35]:
vdisc_ast_dot= ast_to_dot(vdisc_ast_example)
print(vdisc_ast_dot)


        digraph g {
            {  # NODE LABELS
                
                1 [label="TRANSLATION_UNIT 10_vdisc.c"]
                2 [label="FUNCTION_DECL test_date_rfc2822__format_rfc2822_positive_offset()"]
                3 [label="COMPOUND_STMT "]
                4 [label="DECL_STMT "]
                5 [label="VAR_DECL buf"]
            }
            {  # Edges
               
                1 -> 2
                2 -> 3
                3 -> 4
                4 -> 5
            }
        }
    


In [196]:
dot = graphviz.Digraph()


dot.node('1', 'TRANSLATION_UNIT 95300_vdisc.c"')
dot.node('2', 'FUNCTION_DECL GetFileType(int)')
dot.node('3', 'PARM_DECL handle')
dot.node('4', 'COMPOUND_STMT')
dot.node('5', 'IF_STMT')
dot.node('6', 'UNEXPOSED_EXPR')
dot.node('7', 'COMPOUND_STMT')
dot.node('8', 'IF_STMT')
dot.node('9', 'UNEXPOSED_EXPR')
dot.node('10', 'COMPOUND_STMT')



dot.edge('1', '2', constraint='true')
dot.edge('2', '3', constraint='true')
dot.edge('2', '4', constraint='true')
dot.edge('4', '5', constraint='true')
dot.edge('5', '6', constraint='true')
dot.edge('5', '7', constraint='true')
dot.edge('4', '8', constraint='true')
dot.edge('8', '9', constraint='true')
dot.edge('8', '10', constraint='true')


dot
dot.save('test2.dot','./Dataset/')

[DEBUG@graphviz.files] write 532 bytes to './Dataset/test2.dot'


'./Dataset/test2.dot'