In [None]:
'''
To use the parser, we first need to fetch the tree-sitter grammars that the parser library is built from:

git clone https://github.com/tree-sitter/tree-sitter-c.git

git clone https://github.com/tree-sitter/tree-sitter-cpp.git

git clone https://github.com/tree-sitter/tree-sitter-python.git


'''

In [4]:
import os
import json
from tree_sitter import Language, Parser

# Build the grammar library, then load one language handle per grammar.
# build_library receives the output path of the shared library and the three
# grammar directories (the repositories listed in the setup note above). It has
# to run before the Language(...) calls below, which read that same .so file.
# NOTE(review): build_library is reportedly unavailable in recent py-tree-sitter
# releases — confirm the version this notebook is pinned to.
Language.build_library(
  'build/experimental-phraser.so',
  ['tree-sitter-c', 'tree-sitter-cpp', 'tree-sitter-python']
)

# Each handle is looked up by grammar name inside the library built above.
C_LANGUAGE = Language('build/experimental-phraser.so', 'c')
CPP_LANGUAGE = Language('build/experimental-phraser.so', 'cpp')
PYTHON_LANGUAGE = Language('build/experimental-phraser.so', 'python')

def extract_info_v0(file_path, language):
    """Collect top-level function and class names from one source file.

    Only the direct children of the syntax-tree root are inspected. For each
    ``function_definition``, ``class_specifier`` or ``class_definition`` node,
    the first direct child of type ``identifier`` is taken as its name.

    NOTE(review): definitions whose name is not a direct ``identifier`` child
    (for example a name nested under a declarator, or a ``type_identifier``)
    yield nothing here — confirm which grammars this version is meant for.

    Args:
        file_path: Path of the file to parse; it is read as UTF-8 text.
        language: tree-sitter ``Language`` the parser is configured with.

    Returns:
        A dict with the keys ``'file_path'``, ``'functions'`` and
        ``'classes'``, or ``None`` when no function or class name was found.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        code = f.read()

    # Node positions are byte offsets into the buffer handed to the parser,
    # so names must be sliced out of these bytes. Slicing the decoded ``str``
    # with byte offsets returns the wrong text as soon as a multi-byte
    # character appears earlier in the file.
    source_bytes = code.encode('utf-8')

    parser = Parser()
    parser.set_language(language)
    tree = parser.parse(source_bytes)

    # Get the root node of the syntax tree
    root_node = tree.root_node

    functions = []
    classes = []

    def node_text(node):
        # Source text covered by ``node``, decoded back to ``str``.
        return source_bytes[node.start_byte:node.end_byte].decode('utf-8')

    def first_identifier(definition):
        # Text of the first direct ``identifier`` child, or None if there is none.
        for node in definition.children:
            if node.type == 'identifier':
                return node_text(node)
        return None

    for child in root_node.children:
        if child.type == 'function_definition':
            name = first_identifier(child)
            if name is not None:
                functions.append(name)
        elif child.type in ('class_specifier', 'class_definition'):
            name = first_identifier(child)
            if name is not None:
                classes.append(name)

    # Files without any match fall through and implicitly return None.
    if functions or classes:
        return {'file_path': file_path, 'functions': functions, 'classes': classes}


In [24]:
def extract_info_cpp(file_path, language):
    """Extract function and class names from a C++ file with a tree-sitter query.

    The query captures the ``identifier`` declarator of every
    ``function_declarator`` and the ``type_identifier`` name of every
    ``class_specifier`` anywhere in the tree.

    NOTE(review): the function pattern is not restricted to definitions and
    only accepts a plain ``identifier`` as the declarator, so prototypes are
    presumably included and qualified/member names presumably are not —
    confirm against the grammar.

    Args:
        file_path: Path of the file to parse; it is read as UTF-8 text.
        language: tree-sitter ``Language`` used to parse and to build the query.

    Returns:
        A dict with the keys ``'file_path'``, ``'functions'`` and
        ``'classes'``; the lists are empty when nothing matched.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so the bytes handed to the parser do not depend on the locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        code = file.read().encode('utf-8')
    parser = Parser()
    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_declarator
        declarator: (identifier) @function.def
    )

    (
        class_specifier
        name: (type_identifier) @class.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    # One pass over the captures, routing each node's text by capture name.
    functions = []
    classes = []
    for node, capture in captures:
        if capture == 'function.def':
            functions.append(code[node.start_byte:node.end_byte].decode())
        elif capture == 'class.def':
            classes.append(code[node.start_byte:node.end_byte].decode())

    return {'file_path': file_path, 'functions': functions, 'classes': classes}


In [18]:
def extract_info_py(file_path, language):
    """Extract function and class names from a Python file with a tree-sitter query.

    The query captures the ``name`` identifier of every
    ``function_definition`` and every ``class_definition`` anywhere in the
    tree, so nested functions and methods are included as well.

    Args:
        file_path: Path of the file to parse; it is read as UTF-8 text.
        language: tree-sitter ``Language`` used to parse and to build the query.

    Returns:
        A dict with the keys ``'file_path'``, ``'functions'`` and
        ``'classes'``; the lists are empty when nothing matched.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so the bytes handed to the parser do not depend on the locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        code = file.read().encode('utf-8')
    parser = Parser()
    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_definition
        name: (identifier) @function.def
    )

    (
        class_definition
        name: (identifier) @class.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    # One pass over the captures, routing each node's text by capture name.
    functions = []
    classes = []
    for node, capture in captures:
        if capture == 'function.def':
            functions.append(code[node.start_byte:node.end_byte].decode())
        elif capture == 'class.def':
            classes.append(code[node.start_byte:node.end_byte].decode())

    return {'file_path': file_path, 'functions': functions, 'classes': classes}


In [25]:
def extract_info_c(file_path, language):
    """Extract function and struct names from a C file with a tree-sitter query.

    The query captures the ``identifier`` declarator of every
    ``function_declarator`` and the ``type_identifier`` name of every
    ``struct_specifier`` anywhere in the tree.

    NOTE(review): the patterns are not restricted to definitions, so
    prototypes and plain ``struct name`` references are presumably captured
    too — confirm against the grammar.

    Args:
        file_path: Path of the file to parse; it is read as UTF-8 text.
        language: tree-sitter ``Language`` used to parse and to build the query.

    Returns:
        A dict with the keys ``'file_path'``, ``'functions'`` and
        ``'structs'``; the lists are empty when nothing matched.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so the bytes handed to the parser do not depend on the locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        code = file.read().encode('utf-8')
    parser = Parser()
    parser.set_language(language)
    tree = parser.parse(code)

    query_string = """
    (
        function_declarator
        declarator: (identifier) @function.def
    )

    (
        struct_specifier
        name: (type_identifier) @struct.def
    )
    """
    query = language.query(query_string)
    captures = query.captures(tree.root_node)

    # One pass over the captures, routing each node's text by capture name.
    functions = []
    structs = []
    for node, capture in captures:
        if capture == 'function.def':
            functions.append(code[node.start_byte:node.end_byte].decode())
        elif capture == 'struct.def':
            structs.append(code[node.start_byte:node.end_byte].decode())

    return {'file_path': file_path, 'functions': functions, 'structs': structs}



In [26]:
def record_code_elements(root_dir, output_path='code_elements_tree_phraser.json'):
    """Walk ``root_dir`` and write the names found in its source files to JSON.

    Files are dispatched by suffix: ``.c`` goes to ``extract_info_c``,
    ``.cpp`` to ``extract_info_cpp`` and ``.py`` to ``extract_info_py``, each
    with the matching module-level language handle. All other files are
    skipped. Falsy extractor results are not recorded.

    Args:
        root_dir: Directory to walk recursively.
        output_path: File the JSON list is written to. Defaults to the file
            name that used to be hard-coded, so existing calls are unchanged.

    Returns:
        None. The collected records are written to ``output_path`` as a JSON
        list indented by four spaces.
    """
    data = []

    for dir_name, _sub_dirs, files in os.walk(root_dir):
        for file in files:
            path = os.path.join(dir_name, file)
            if file.endswith('.c'):
                info = extract_info_c(path, C_LANGUAGE)
            elif file.endswith('.cpp'):
                info = extract_info_cpp(path, CPP_LANGUAGE)
            elif file.endswith('.py'):
                info = extract_info_py(path, PYTHON_LANGUAGE)
            else:
                # Not a recognised source suffix.
                continue
            if info:
                data.append(info)

    with open(output_path, 'w') as f:
        json.dump(data, f, indent=4)


In [27]:
# Scan the directory at ../pytorch (relative to the working directory);
# record_code_elements writes the collected names to a JSON file.
record_code_elements('../pytorch')