# Graphing module

## Mapping and syntax processing (AST)

### Syntax processing (AST) printing the AST
- Here there are 3 main options available (that I currently know of)
    - ANTLR (more community support)
    - tree-sitter (speed, lazy-evaluation)
    - Bison (more for config files)

Because most of our existing work is already built on tree-sitter, the focus will stay on it; it is always possible to switch later or to write a custom AST parser, since our use case is quite unique.

Printing the base AST with nice performance (~0.5 s for entire repo-review repo)
521 ms ± 15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [90]:
from dataclasses import dataclass
import sys
from enum import unique, Enum
from unicodedata import name

# This is used to be able to import from src directory
sys.path.append("../")


from src.repository_processing import files_from_repository
from tree_sitter_languages import get_language, get_parser
from tree_sitter import Language, Parser, Node
from abc import ABCMeta, abstractmethod, abstractproperty


# TODO: Add Result monad to properties to handle errors
# @property is not cached, if more performance needed use @functools.cached_property
@dataclass
class LanguageInfo(metaclass=ABCMeta):
    """Abstract base class holding language information and parsers for the AST.

    Concrete languages supply ``name`` and ``extensions``; ``language`` and
    ``parser`` are derived from ``name`` on every access (plain ``@property``,
    nothing is cached here).
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Language identifier handed to ``get_language`` and ``get_parser``."""
        ...

    @property
    @abstractmethod
    def extensions(self) -> tuple[str, ...]:
        """File suffixes that belong to this language."""
        ...

    @property
    def language(self) -> Language:
        """Return ``get_language(self.name)``."""
        return get_language(self.name)

    @property
    def parser(self) -> Parser:
        """Return ``get_parser(self.name)``."""
        return get_parser(self.name)


class Python(LanguageInfo):
    """Language information for Python sources."""

    name: str = "python"
    # A real one-element tuple, so the value matches the declared type;
    # ``str.endswith`` accepts a tuple of suffixes just as well as a string.
    extensions: tuple[str, ...] = (".py",)

class Java(LanguageInfo):
    """Language information for Java sources."""

    name: str = "java"
    # A real one-element tuple, so the value matches the declared type;
    # ``str.endswith`` accepts a tuple of suffixes just as well as a string.
    extensions: tuple[str, ...] = (".java",)


#TODO: add error handling with Result monad
def get_language_info(file_path: str) -> LanguageInfo:
    """Return an instance of the first language class whose extension matches.

    Only the direct subclasses of ``LanguageInfo`` are considered, in the
    order reported by ``__subclasses__()``.

    Raises:
        ValueError: when no subclass matches the end of ``file_path``.
    """
    matching_classes = (
        candidate
        for candidate in LanguageInfo.__subclasses__()
        if file_path.endswith(candidate.extensions)
    )
    chosen = next(matching_classes, None)
    if chosen is None:
        raise ValueError(f"Language for extension found in {file_path} not implemented")
    return chosen()


def print_tree_structure(node: Node, depth: int = 0):
    """Print one line per AST node, parents before children.

    Each line holds the depth, an indentation of two spaces per level, then
    the node's type, id and text.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped (and printed) in their original order.
    pending = [(node, depth)]
    while pending:
        current, level = pending.pop()
        print(f"{level}{'  ' * level}{current.type} {current.id} {current.text}")
        pending.extend((child, level + 1) for child in reversed(current.children))


# Fetch the repository's files, then dump the AST of every file whose
# extension maps to a known language.
files = files_from_repository("https://github.com/Foxicution/repo-review")

for file in files:
    print(file.path)
    try:
        language_info = get_language_info(file.path)
    except ValueError:
        # No language class handles this extension: skip the file.
        continue
    else:
        tree = language_info.parser.parse(file.content)
        root_node = tree.root_node
        print_tree_structure(root_node)
    

/.flake8
/.gitignore
/.pre-commit-config.yaml
/README.md
/main.py
0module 94630775751856 b'import json\nimport os\nimport pickle\nimport re\nfrom functools import wraps\nfrom typing import Any, AnyStr, Callable, Optional, Pattern\n\nimport streamlit as st\nfrom github import Github, Repository\nfrom google.cloud import firestore\nfrom google.oauth2 import service_account\nfrom networkx import Graph\nfrom pyvis.network import Network\nfrom toolz.functoolz import pipe\n\nfrom python_components.large_lang_model import ai_magic\nfrom python_components.networkx_graphing import get_graphs\nfrom python_components.old_types import Package, T\nfrom src.streamlit_components.graph_visualizer import my_component\n\nst.set_page_config(layout=\'wide\')\nif \'init\' not in st.session_state:\n    st.session_state.init = True\n\n\ndef replace_semicolons_with_new_line(code: str) -> str:\n    return code.replace(\';\', \'\\n\')\n\n\ndef extract_and_remove_pattern(\n    pattern: Pattern[AnyStr], code: str

In [145]:
## Simplification of the code files

# Small in-memory Python snippet (as bytes) used to try out tree-sitter queries.
code = b"""
def add(a, b):
    return a + b

a = 3
add(a, 4)
"""

# get_language_info matches on the end of the given string, so passing the bare
# extension is enough to select the Python language class.
language_info = get_language_info(('.py'))
tree = language_info.parser.parse(code)
root_node = tree.root_node

# Captures each function definition (@function) and its name (@function_name).
function_query = """
(function_definition
    name: (identifier) @function_name) @function
"""

# Captures each assignment statement whose left-hand side is a plain identifier.
assignment_query = """
(expression_statement
    (assignment
        left: (identifier) @variable_name)) @assignment
"""

# Both patterns are concatenated into one query source and run in a single pass.
query = f"""
{function_query}
{assignment_query}"""

# NOTE(review): the output recorded for this cell is
# "TypeError: First argument to captures must be a Node"; the cause is not
# visible from this cell alone.
capture_nodes = language_info.language.query(query).captures(root_node)
print(capture_nodes)
# Each capture is indexed with [0] to reach the captured node.
print([node[0].text for node in capture_nodes])
print()

# Dump every top-level statement: its source text and its S-expression.
for child in root_node.children:
    print(child.text)
    print(child.sexp())

TypeError: First argument to captures must be a Node

### Processing the AST for a single file -> single file graph building
There are a few possible approaches for processing the AST:
1. Pruning
2. Statement by statement
3. Keyword based (current approach)

Algorithm for processing the file:

Take a statement -> write identifier into a dict pointing to the statement in file -> process the statement -> if identifier is in dict, add edge to the statement in the dict -> repeat
The main open question is how to compress the AST correctly.

Step 1: Get AST and a list of logic connection edges missing from it

In [70]:
# TODO: add a simple file for development and testing

from tree_sitter import Node
from networkx import Graph
from pyvis.network import Network
from tree_sitter import Language, Parser, Node
from typing import Tuple


def command_set(node: Node, set: set = set()) -> set:
    """print the command set of the AST"""
    set.add(node.type)
    for child in node.children:
        command_set(child)
    return set


def print_tree_structure(node: Node, depth: int = 0):
    """Print the AST in pre-order, tagging ``import_statement`` nodes.

    Each line holds the depth, an indentation of two spaces per level, then
    the node's type, id and text. Lines for ``import_statement`` nodes end
    with `` HANDLED``.

    Args:
        node: root of the subtree to print.
        depth: nesting level of ``node``; drives the prefix and indentation.
    """
    # Only the trailing marker differs between node types, so the line is
    # formatted once.
    marker = " HANDLED" if node.type == "import_statement" else ""
    print(f"{depth}{'  ' * depth}{node.type} {node.id} {node.text}{marker}")
    for child in node.children:
        print_tree_structure(child, depth + 1)


# TODO: handle interfile dependencies
def handle_import_statement(node: Node, pointers: dict):
    """Print the name bound by the second child of an import statement.

    For a ``dotted_name`` child its text is printed; for an
    ``aliased_import`` child the text of its last child is printed. Any
    other child type prints nothing. ``pointers`` is currently unused.
    """
    target = node.children[1]
    kind = target.type
    if kind == "dotted_name":
        print(target.text)
    elif kind == "aliased_import":
        print(target.children[-1].text)


def handle_import_from_statement(node: Node, pointers: dict):
    """Print the text of every ``dotted_name`` child from index 3 onwards.

    The first three children are skipped. ``pointers`` is currently unused.
    """
    imported_names = (
        candidate.text
        for candidate in node.children[3:]
        if candidate.type == "dotted_name"
    )
    for imported in imported_names:
        print(imported)


def handle_expression_statement(node: Node, pointers: dict):
    """Print the type of each child of a leading ``call`` expression.

    Nothing is printed when the first child of ``node`` is not a ``call``.
    ``pointers`` is currently unused.
    """
    expression = node.children[0]
    if expression.type != "call":
        return
    for part in expression.children:
        print(part.type)


for file in files:
    # print(file.path)
    try:
        language_info = get_language_info(file.path)
    except ValueError as e:
        continue
    tree = language_info.parser.parse(file.content)
    root_node = tree.root_node
    import_query = """
    (import_statement
                name: (dotted_name)* @glob)
    """
    aliased_import_query = """
    (import_statement
        (aliased_import alias: (identifier) @glob))
    """
    import_from_query = """
    (import_from_statement
                name: (dotted_name) @glob)
    """
    expression_query = """
    (expression_statement
        (call function: (attribute object: (identifier) @call)
    """
    captured_nodes = language_info.language.query(
        f"""
        {import_from_query}
        """
    ).captures(root_node)

    pointers = {}
    for node, name in captured_nodes:
        print(node.text, name)
        

    print("=" * 20)
    for child in root_node.children:
        match child.type:
            case "import_statement":
                ...
                # handle_import_statement(child, {})
                # print(child.text)
                # print(child.sexp())
            case "import_from_statement":
                ...
                # handle_import_from_statement(child, {})
                # print(child.sexp())
            case "expression_statement":
                # handle_expression_statement(child, {})
                print(child.text)
                print(child.sexp())
                print("=" * 20)
            case _:
                print(child.type)
                print(child.sexp())
    # handle_logic(root_node)
    # print_tree_structure(root_node)
    # print(command_set(root_node))
    break


b'wraps' glob
b'Any' glob
b'AnyStr' glob
b'Callable' glob
b'Optional' glob
b'Pattern' glob
b'Github' glob
b'Repository' glob
b'firestore' glob
b'service_account' glob
b'Graph' glob
b'Network' glob
b'pipe' glob
b'ai_magic' glob
b'get_graphs' glob
b'Package' glob
b'T' glob
b'my_component' glob
b"st.set_page_config(layout='wide')"
(expression_statement (call function: (attribute object: (identifier) attribute: (identifier)) arguments: (argument_list (keyword_argument name: (identifier) value: (string)))))
if_statement
(if_statement condition: (comparison_operator (string) (attribute object: (identifier) attribute: (identifier))) consequence: (block (expression_statement (assignment left: (attribute object: (attribute object: (identifier) attribute: (identifier)) attribute: (identifier)) right: (true)))))
function_definition
(function_definition name: (identifier) parameters: (parameters (typed_parameter (identifier) type: (type (identifier)))) return_type: (type (identifier)) body: (block

In [None]:
# Language helper used by the loop at the end of this cell to select files by
# extension and to parse them.
python_language = Python()

import networkx as nx
from pyvis.network import Network


def print_tree_structure(node: Node, graph: nx.Graph):
    """Print the subtree rooted at ``node`` and mirror it into ``graph``.

    Every AST node becomes a graph node keyed by its id, with the string form
    of its text as ``title``; every parent/child pair becomes an edge.
    """
    node_id = node.id
    print(node_id, node.type, node.text)
    graph.add_node(node_id, title=str(node.text))

    for child in node.children:
        # Descend first, then link: the child exists in the graph by the
        # time the edge to it is added.
        print_tree_structure(child, graph)
        graph.add_edge(node_id, child.id)


def process_dependancy_logic(node: Node, graph: nx.Graph):
    """Add the subtree rooted at ``node`` to ``graph`` with selected edges.

    Every AST node becomes a graph node keyed by its id. An edge to the first
    child is added only for the dependency-like and logic-like node types
    listed below; all other parent/child pairs stay unconnected.
    """
    dependency_types = ("import_declaration", "function_call", "variable_reference")
    logic_types = ("if_statement", "for_statement", "while_statement")

    graph.add_node(node.id, title=str(node.text))

    # Both families are linked the same way: one edge to the first child.
    kind = node.type
    if kind in dependency_types or kind in logic_types:
        first_child = node.children[0]
        graph.add_edge(node.id, first_child.id)

    # Walk the rest of the tree regardless of the node's type.
    for child in node.children:
        process_dependancy_logic(child, graph)


# Build and render the graph of the first Python file found in the repository.
files = files_from_repository("https://github.com/Foxicution/repo-review")
for file in files:
    if not file.path.endswith(python_language.extensions):
        continue
    print(file.path)

    # Parse the file and keep the root of its syntax tree.
    tree = python_language.parser.parse(file.content)
    root_node = tree.root_node

    # Fill a fresh graph from the tree.
    graph = nx.Graph()
    # print_tree_structure(root_node, graph)
    process_dependancy_logic(root_node, graph)

    # Render with pyvis, physics disabled, and write the result to nx.html.
    nt = Network("1000px", "2000px")
    nt.options.physics.enabled = False
    nt.show_buttons()
    nt.from_nx(graph)
    nt.show("nx.html")

    # Only the first matching file is rendered.
    break