In [1]:
import os
import json
from tree_sitter import Language, Parser

# Load the languages
Language.build_library(
  'build/experimental-phraser.so',
  ['tree-sitter-c', 'tree-sitter-cpp', 'tree-sitter-python']
)

C_LANGUAGE = Language('build/experimental-phraser.so', 'c')
CPP_LANGUAGE = Language('build/experimental-phraser.so', 'cpp')
PYTHON_LANGUAGE = Language('build/experimental-phraser.so', 'python')

In [2]:
file_path='FlushDenormal.cpp'
with open(file_path, 'r', encoding='utf-8') as f:
    code = f.read()

parser = Parser()
parser.set_language(CPP_LANGUAGE)

tree = parser.parse(bytes(code, "utf8"))

# Get the root node of the syntax tree
root_node = tree.root_node

def extract(node, field):
    # This function extracts the text given a node
    return code[node.start_byte:node.end_byte]
'''
    for child in root_node.children:
        if child.type == 'function_definition':
            # The function name is no longer the first child of the function definition
            for node in child.children:
                if node.type == 'identifier':
                    functions.append(extract(node, code))
                    break  # We break after finding the first identifier
        elif child.type in ['class_specifier', 'class_definition']:
            # The class name is still the first child of the class specifier or definition
            for node in child.children:
                if node.type == 'identifier':
                    classes.append(extract(node, code))
                    break  # We break after finding the first identifier
'''

"\n    for child in root_node.children:\n        if child.type == 'function_definition':\n            # The function name is no longer the first child of the function definition\n            for node in child.children:\n                if node.type == 'identifier':\n                    functions.append(extract(node, code))\n                    break  # We break after finding the first identifier\n        elif child.type in ['class_specifier', 'class_definition']:\n            # The class name is still the first child of the class specifier or definition\n            for node in child.children:\n                if node.type == 'identifier':\n                    classes.append(extract(node, code))\n                    break  # We break after finding the first identifier\n"

In [7]:
root_node.children

[<Node type=preproc_include, start_point=(0, 0), end_point=(2, 0)>,
 <Node type=preproc_include, start_point=(2, 0), end_point=(3, 0)>,
 <Node type=preproc_include, start_point=(3, 0), end_point=(5, 0)>,
 <Node type=namespace_definition, start_point=(5, 0), end_point=(31, 2)>,
 <Node type=comment, start_point=(31, 4), end_point=(31, 24)>]

In [8]:
root_node.children[3].children

[<Node type="namespace", start_point=(5, 0), end_point=(5, 9)>,
 <Node type=namespace_identifier, start_point=(5, 10), end_point=(5, 12)>,
 <Node type=declaration_list, start_point=(5, 13), end_point=(31, 2)>]

In [9]:
root_node.children[3].children[2]

<Node type=declaration_list, start_point=(5, 13), end_point=(31, 2)>

In [10]:
root_node.children[3].children[2].children

[<Node type="{", start_point=(5, 13), end_point=(5, 14)>,
 <Node type=namespace_definition, start_point=(5, 15), end_point=(31, 1)>,
 <Node type="}", start_point=(31, 1), end_point=(31, 2)>]

In [11]:
root_node.children[3].children[2].children[1].children

[<Node type="namespace", start_point=(5, 15), end_point=(5, 24)>,
 <Node type=namespace_identifier, start_point=(5, 25), end_point=(5, 28)>,
 <Node type=declaration_list, start_point=(5, 29), end_point=(31, 1)>]

In [12]:
root_node.children[3].children[2].children[1].children[2].children

[<Node type="{", start_point=(5, 29), end_point=(5, 30)>,
 <Node type=declaration, start_point=(7, 0), end_point=(7, 54)>,
 <Node type=declaration, start_point=(8, 0), end_point=(8, 50)>,
 <Node type=function_definition, start_point=(10, 0), end_point=(29, 1)>,
 <Node type="}", start_point=(31, 0), end_point=(31, 1)>]

In [23]:
def extract_info_v0(file_path, language):
    with open(file_path, 'r') as file:
        code = file.read().encode('utf-8')

    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_declarator
        declarator: (identifier) @function.def
    )

    (
        class_specifier
        name: (type_identifier) @class.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    functions = [node.text(code) for node, capture in captures if capture == 'function.def']
    classes = [node.text(code) for node, capture in captures if capture == 'class.def']

    return {'file_path': file_path, 'functions': functions, 'classes': classes}



In [24]:
extract_info_v0(file_path,CPP_LANGUAGE)

TypeError: 'bytes' object is not callable

In [25]:
def extract_info_cpp(file_path, language):
    with open(file_path, 'r') as file:
        code = file.read().encode('utf-8')

    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_declarator
        declarator: (identifier) @function.def
    )

    (
        class_specifier
        name: (type_identifier) @class.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    functions = [code[node.start_byte:node.end_byte].decode() for node, capture in captures if capture == 'function.def']
    classes = [code[node.start_byte:node.end_byte].decode() for node, capture in captures if capture == 'class.def']

    return {'file_path': file_path, 'functions': functions, 'classes': classes}


In [26]:
extract_info_cpp(file_path,CPP_LANGUAGE)

{'file_path': 'FlushDenormal.cpp',
 'functions': ['set_flush_denormal'],
 'classes': []}

In [30]:
def extract_info_py(file_path, language):
    with open(file_path, 'r') as file:
        code = file.read().encode('utf-8')

    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_definition
        name: (identifier) @function.def
    )

    (
        class_definition
        name: (identifier) @class.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    functions = [code[node.start_byte:node.end_byte].decode() for node, capture in captures if capture == 'function.def']
    classes = [code[node.start_byte:node.end_byte].decode() for node, capture in captures if capture == 'class.def']

    return {'file_path': file_path, 'functions': functions, 'classes': classes}


In [31]:
extract_info_py('tensor.py',PYTHON_LANGUAGE)

{'file_path': 'tensor.py',
 'functions': ['_wrap_type_error_to_not_implemented',
  'wrapped',
  '_rebuild_from_type',
  '__deepcopy__',
  '__reduce_ex__',
  '_reduce_ex_internal',
  '__setstate__',
  '__repr__',
  'backward',
  'register_hook',
  'reinforce',
  'trim',
  'retain_grad',
  'retain_grad_hook',
  'is_shared',
  'share_memory_',
  '__reversed__',
  'norm',
  'lu',
  'stft',
  'istft',
  'resize',
  'resize_as',
  'split',
  'unique',
  'unique_consecutive',
  '__rsub__',
  '__rdiv__',
  '__format__',
  '__ipow__',
  '__rpow__',
  '__floordiv__',
  '__rfloordiv__',
  '__len__',
  '__iter__',
  '__hash__',
  '__dir__',
  '__array__',
  '__array_wrap__',
  '__contains__',
  '__cuda_array_interface__',
  'refine_names',
  'align_to',
  'unflatten',
  'rename_',
  'rename',
  '_update_names',
  'grad',
  'grad',
  'grad',
  '__torch_function__',
  '_convert'],
 'classes': ['Tensor']}

In [34]:
def extract_info_c(file_path, language):
    with open(file_path, 'r') as file:
        code = file.read().encode('utf-8')

    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_declarator
        declarator: (identifier) @function.def
    )

    (
        struct_specifier
        name: (type_identifier) @struct.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    functions = [code[node.start_byte:node.end_byte].decode() for node, capture in captures if capture == 'function.def']
    structs = [code[node.start_byte:node.end_byte].decode() for node, capture in captures if capture == 'struct.def']

    return {'file_path': file_path, 'functions': functions, 'structs': structs}



In [35]:
extract_info_c('miniz.c',C_LANGUAGE)

{'file_path': 'miniz.c',
 'functions': ['mz_adler32',
  'mz_crc32',
  'mz_crc32',
  'mz_crc32',
  'mz_free',
  'miniz_def_alloc_func',
  'miniz_def_free_func',
  'miniz_def_realloc_func',
  'mz_version',
  'mz_deflateInit',
  'mz_deflateInit2',
  'mz_deflateReset',
  'mz_deflate',
  'mz_deflateEnd',
  'mz_deflateBound',
  'mz_compress2',
  'mz_compress',
  'mz_compressBound',
  'mz_inflateInit2',
  'mz_inflateInit',
  'mz_inflate',
  'mz_inflateEnd',
  'mz_uncompress',
  'mz_error',
  'tdefl_radix_sort_syms',
  'tdefl_calculate_minimum_redundancy',
  'tdefl_huffman_enforce_max_code_size',
  'tdefl_optimize_huffman_table',
  'tdefl_start_dynamic_block',
  'tdefl_start_static_block',
  'tdefl_compress_lz_codes',
  'tdefl_compress_lz_codes',
  'tdefl_compress_block',
  'tdefl_flush_block',
  'TDEFL_READ_UNALIGNED_WORD',
  'TDEFL_READ_UNALIGNED_WORD2',
  'tdefl_find_match',
  'tdefl_find_match',
  'tdefl_compress_fast',
  'tdefl_record_literal',
  'tdefl_record_match',
  'tdefl_compress_no

In [14]:
def extract_info_c_def(file_path, language):
    with open(file_path, 'r') as file:
        code = file.read().encode('utf-8')

    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_declarator
        declarator: (identifier) @function.def
    )

    (
        struct_specifier
        name: (type_identifier) @struct.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    functions = [{'name': code[node.start_byte:node.end_byte].decode(), 'definition': code[node.parent.parent.start_byte:node.parent.parent.end_byte].decode()} for node, capture in captures if capture == 'function.def']
    structs = [{'name': code[node.start_byte:node.end_byte].decode(), 'definition': code[node.parent.start_byte:node.parent.end_byte].decode()} for node, capture in captures if capture == 'struct.def']

    return {'file_path': file_path, 'functions': functions, 'structs': structs}



In [15]:
extract_info_c_def('miniz.c',C_LANGUAGE)

{'file_path': 'miniz.c',
 'functions': [{'name': 'mz_adler32',
   'definition': 'mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf_len)\n{\n    mz_uint32 i, s1 = (mz_uint32)(adler & 0xffff), s2 = (mz_uint32)(adler >> 16);\n    size_t block_len = buf_len % 5552;\n    if (!ptr)\n        return MZ_ADLER32_INIT;\n    while (buf_len)\n    {\n        for (i = 0; i + 7 < block_len; i += 8, ptr += 8)\n        {\n            s1 += ptr[0], s2 += s1;\n            s1 += ptr[1], s2 += s1;\n            s1 += ptr[2], s2 += s1;\n            s1 += ptr[3], s2 += s1;\n            s1 += ptr[4], s2 += s1;\n            s1 += ptr[5], s2 += s1;\n            s1 += ptr[6], s2 += s1;\n            s1 += ptr[7], s2 += s1;\n        }\n        for (; i < block_len; ++i)\n            s1 += *ptr++, s2 += s1;\n        s1 %= 65521U, s2 %= 65521U;\n        buf_len -= block_len;\n        block_len = 5552;\n    }\n    return (s2 << 16) + s1;\n}'},
  {'name': 'mz_crc32',
   'definition': 'mz_ulong mz_

In [12]:
def extract_info_cpp_def(file_path, language):
    with open(file_path, 'r') as file:
        code = file.read().encode('utf-8')
    parser = Parser()
    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_declarator
        declarator: (identifier) @function.def
    )

    (
        class_specifier
        name: (type_identifier) @class.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    functions = [{'name': code[node.start_byte:node.end_byte].decode(), 'definition': code[node.parent.parent.start_byte:node.parent.parent.end_byte].decode()} for node, capture in captures if capture == 'function.def']
    classes = [{'name': code[node.start_byte:node.end_byte].decode(), 'definition': code[node.parent.start_byte:node.parent.end_byte].decode()} for node, capture in captures if capture == 'class.def']

    return {'file_path': file_path, 'functions': functions, 'structs': classes}


In [13]:
extract_info_cpp_def('FlushDenormal.cpp',CPP_LANGUAGE)

{'file_path': 'FlushDenormal.cpp',
 'functions': [{'name': 'set_flush_denormal',
   'definition': 'bool set_flush_denormal(bool on) {\n  // Compile if we have SSE support (GCC), x86-64 (MSVC), or x86 with SSE (MSVC)\n#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)\n  // Denormals-Are-Zero is supported by most SSE2 processors, with the exception\n  // of some early Pentium 4 processors. We guard it with a runtime check.\n  // Flush-To-Zero (FTZ) only requires SSE.\n  if (cpuinfo_has_x86_daz()) {\n    unsigned int csr = _mm_getcsr();\n    csr &= ~DENORMALS_ZERO;\n    csr &= ~FLUSH_ZERO;\n    if (on) {\n      csr |= DENORMALS_ZERO;\n      csr |= FLUSH_ZERO;\n    }\n    _mm_setcsr(csr);\n    return true;\n  }\n#endif\n  return false;\n}'}],
 'structs': []}

In [20]:
def extract_info_py_def(file_path, language):
    with open(file_path, 'r') as file:
        code = file.read().encode('utf-8')

    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_definition
        name: (identifier) @function.def
    )

    (
        class_definition
        name: (identifier) @class.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    functions = [{'name': code[node.start_byte:node.end_byte].decode(), 'definition': code[node.parent.start_byte:node.parent.end_byte].decode()} for node, capture in captures if capture == 'function.def']
    classes = [{'name': code[node.start_byte:node.end_byte].decode(), 'definition': code[node.parent.start_byte:node.parent.end_byte].decode()} for node, capture in captures if capture == 'class.def']

    return {'file_path': file_path, 'functions': functions, 'structs': classes}


In [21]:
extract_info_py_def('tensor.py',PYTHON_LANGUAGE)

{'file_path': 'tensor.py',
 'functions': [{'name': '_wrap_type_error_to_not_implemented',
   'definition': "def _wrap_type_error_to_not_implemented(f):\n    # functools.wraps doesn't work well with methods in python 2\n    method_assignments = ('__name__', '__doc__')\n    assigned = functools.WRAPPER_ASSIGNMENTS\n\n    @functools.wraps(f, assigned=assigned)\n    def wrapped(*args, **kwargs):\n        if has_torch_function(args):\n            return handle_torch_function(wrapped, args, *args, **kwargs)\n        try:\n            return f(*args, **kwargs)\n        except TypeError:\n            return NotImplemented\n    return wrapped"},
  {'name': 'wrapped',
   'definition': 'def wrapped(*args, **kwargs):\n        if has_torch_function(args):\n            return handle_torch_function(wrapped, args, *args, **kwargs)\n        try:\n            return f(*args, **kwargs)\n        except TypeError:\n            return NotImplemented'},
  {'name': '_rebuild_from_type',
   'definition': 'def 