In [None]:
'''
To use the parser, we first need to build the tool. Clone the grammar repositories it is built from:

git clone https://github.com/tree-sitter/tree-sitter-c.git

git clone https://github.com/tree-sitter/tree-sitter-cpp.git

git clone https://github.com/tree-sitter/tree-sitter-python.git


'''

In [1]:
import os
import json
from tree_sitter import Language, Parser

# Build one shared library from the three grammar directories, then load a
# Language handle per grammar out of that same library file.
_PHRASER_LIBRARY = 'build/experimental-phraser.so'

Language.build_library(
    _PHRASER_LIBRARY,
    ['tree-sitter-c', 'tree-sitter-cpp', 'tree-sitter-python'],
)

C_LANGUAGE, CPP_LANGUAGE, PYTHON_LANGUAGE = (
    Language(_PHRASER_LIBRARY, grammar_name)
    for grammar_name in ('c', 'cpp', 'python')
)

In [7]:
def extract_info_c_def(file_path, language):
    """Collect the functions and structs named in a C source file.

    The file is parsed with tree-sitter and queried for identifiers that are
    the direct declarator of a ``function_declarator`` and for the names of
    ``struct_specifier`` nodes.

    Args:
        file_path: Path of the C file to read.
        language: tree-sitter ``Language`` used both to parse and to query.

    Returns:
        A dict with the keys ``'file_path'``, ``'functions'`` and
        ``'structs'``. The last two are lists of
        ``{'name': ..., 'definition': ...}`` dicts, where ``'definition'`` is
        the source text of the captured name's grandparent node (functions)
        or parent node (structs).
    """
    # Decode explicitly as UTF-8 and substitute undecodable bytes. Relying on
    # the platform default encoding raised UnicodeDecodeError (or silently
    # mis-decoded) on source files that are not valid in that encoding.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        code = file.read().encode('utf-8')
    parser = Parser()
    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_declarator
        declarator: (identifier) @function.def
    )

    (
        struct_specifier
        name: (type_identifier) @struct.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    def node_text(node):
        # Node offsets are byte offsets into `code`, so slice the bytes first
        # and decode afterwards.
        return code[node.start_byte:node.end_byte].decode()

    functions = []
    structs = []
    for node, capture in captures:
        if capture == 'function.def':
            # identifier -> function_declarator -> enclosing node.
            # NOTE(review): if the declarator is wrapped (e.g. a pointer
            # return type) the grandparent is presumably that wrapper rather
            # than the whole definition -- confirm against the grammar.
            functions.append({
                'name': node_text(node),
                'definition': node_text(node.parent.parent),
            })
        elif capture == 'struct.def':
            structs.append({
                'name': node_text(node),
                'definition': node_text(node.parent),
            })

    return {'file_path': file_path, 'functions': functions, 'structs': structs}



In [3]:
def extract_info_cpp_def(file_path, language):
    """Collect the functions and classes named in a C++ source file.

    The file is parsed with tree-sitter and queried for identifiers that are
    the direct declarator of a ``function_declarator`` and for the names of
    ``class_specifier`` nodes.

    Args:
        file_path: Path of the C++ file to read.
        language: tree-sitter ``Language`` used both to parse and to query.

    Returns:
        A dict with the keys ``'file_path'``, ``'functions'`` and
        ``'structs'``. Classes are reported under ``'structs'``. Both lists
        hold ``{'name': ..., 'definition': ...}`` dicts, where
        ``'definition'`` is the source text of the captured name's
        grandparent node (functions) or parent node (classes).
    """
    # Decode explicitly as UTF-8 and substitute undecodable bytes. Relying on
    # the platform default encoding raised UnicodeDecodeError (or silently
    # mis-decoded) on source files that are not valid in that encoding.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        code = file.read().encode('utf-8')
    parser = Parser()
    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_declarator
        declarator: (identifier) @function.def
    )

    (
        class_specifier
        name: (type_identifier) @class.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    def node_text(node):
        # Node offsets are byte offsets into `code`, so slice the bytes first
        # and decode afterwards.
        return code[node.start_byte:node.end_byte].decode()

    functions = []
    classes = []
    for node, capture in captures:
        if capture == 'function.def':
            # identifier -> function_declarator -> enclosing node.
            # NOTE(review): if the declarator is wrapped (e.g. a pointer or
            # reference return type) the grandparent is presumably that
            # wrapper rather than the whole definition -- confirm.
            functions.append({
                'name': node_text(node),
                'definition': node_text(node.parent.parent),
            })
        elif capture == 'class.def':
            classes.append({
                'name': node_text(node),
                'definition': node_text(node.parent),
            })

    # The result key stays 'structs' so every extractor returns the same shape.
    return {'file_path': file_path, 'functions': functions, 'structs': classes}


In [4]:
def extract_info_py_def(file_path, language):
    """Collect the functions and classes defined in a Python source file.

    The file is parsed with tree-sitter and queried for the ``name``
    identifier of every ``function_definition`` and ``class_definition``.

    Args:
        file_path: Path of the Python file to read.
        language: tree-sitter ``Language`` used both to parse and to query.

    Returns:
        A dict with the keys ``'file_path'``, ``'functions'`` and
        ``'structs'``. Classes are reported under ``'structs'``. Both lists
        hold ``{'name': ..., 'definition': ...}`` dicts, where
        ``'definition'`` is the source text of the captured name's parent
        node.
    """
    # Decode explicitly as UTF-8 and substitute undecodable bytes. Relying on
    # the platform default encoding raised UnicodeDecodeError (or silently
    # mis-decoded) on source files that are not valid in that encoding.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        code = file.read().encode('utf-8')
    parser = Parser()
    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_definition
        name: (identifier) @function.def
    )

    (
        class_definition
        name: (identifier) @class.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    def describe(node):
        # Node offsets are byte offsets into `code`, so slice the bytes first
        # and decode afterwards. The parent of the name node is the
        # definition node matched by the query.
        owner = node.parent
        return {
            'name': code[node.start_byte:node.end_byte].decode(),
            'definition': code[owner.start_byte:owner.end_byte].decode(),
        }

    functions = []
    classes = []
    for node, capture in captures:
        if capture == 'function.def':
            functions.append(describe(node))
        elif capture == 'class.def':
            classes.append(describe(node))

    # The result key stays 'structs' so every extractor returns the same shape.
    return {'file_path': file_path, 'functions': functions, 'structs': classes}


In [5]:
def record_code_elements(root_dir, output_path='code_elements_tree_phraser_with_code.json'):
    """Walk a directory tree and dump the extracted code elements to JSON.

    Every ``.c``, ``.cpp`` and ``.py`` file below ``root_dir`` is passed to
    the matching ``extract_info_*_def`` function; all other files are
    ignored. The collected results are written as one JSON list.

    Args:
        root_dir: Directory to walk recursively.
        output_path: File the JSON list is written to. Defaults to the file
            name this function has always used, relative to the current
            working directory.

    Returns:
        None. The result is only written to ``output_path``.
    """
    data = []

    for dir_name, _sub_dirs, files in os.walk(root_dir):
        for file in files:
            path = os.path.join(dir_name, file)
            try:
                if file.endswith('.c'):
                    info = extract_info_c_def(path, C_LANGUAGE)
                elif file.endswith('.cpp'):
                    info = extract_info_cpp_def(path, CPP_LANGUAGE)
                elif file.endswith('.py'):
                    info = extract_info_py_def(path, PYTHON_LANGUAGE)
                else:
                    continue
            except (OSError, UnicodeDecodeError) as exc:
                # One unreadable or undecodable file should not throw away
                # the results already gathered from the rest of the tree.
                print('Skipping {}: {}'.format(path, exc))
                continue
            # The extractors always return a dict, so no emptiness check is
            # needed before recording it.
            data.append(info)

    with open(output_path, 'w') as f:
        json.dump(data, f, indent=4)


In [8]:
# Walk the directory tree at ../pytorch and write the extracted elements to JSON.
record_code_elements('../pytorch')