diff --git a/.gitignore b/.gitignore index ea782b07..89ef3acf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ venv/ +.pytest_cache/ .ruff_cache/ .vscode/ +.coverage __pycache__/ \ No newline at end of file diff --git a/prometheus/knowledge_graph/__init__.py b/prometheus/graph/__init__.py similarity index 100% rename from prometheus/knowledge_graph/__init__.py rename to prometheus/graph/__init__.py diff --git a/prometheus/graph/file_graph_builder.py b/prometheus/graph/file_graph_builder.py new file mode 100644 index 00000000..88fdcfcf --- /dev/null +++ b/prometheus/graph/file_graph_builder.py @@ -0,0 +1,136 @@ +from collections import deque +from pathlib import Path +from typing import Sequence, Tuple + +from langchain_core.documents import Document +from langchain_text_splitters import MarkdownHeaderTextSplitter + +from prometheus.graph.graph_types import ( + ASTNode, + KnowledgeGraphEdge, + KnowledgeGraphEdgeType, + KnowledgeGraphNode, + TextNode, +) +from prometheus.parser import tree_sitter_parser + + +class FileGraphBuilder: + def __init__(self, max_ast_depth: int): + self.max_ast_depth = max_ast_depth + + def supports_file(self, file: Path) -> bool: + if tree_sitter_parser.supports_file(file): + return True + + if file.suffix == ".md": + return True + + return False + + def build_file_graph( + self, parent_node: KnowledgeGraphNode, file: Path, next_node_id: int + ) -> Tuple[int, Sequence[KnowledgeGraphNode], Sequence[KnowledgeGraphEdge]]: + if tree_sitter_parser.supports_file(file): + return self._tree_sitter_file_graph(parent_node, file, next_node_id) + + if file.suffix == ".md": + return self._markdown_file_graph(parent_node, file, next_node_id) + + def _tree_sitter_file_graph( + self, parent_node: KnowledgeGraphNode, file: Path, next_node_id: int + ) -> Tuple[int, Sequence[KnowledgeGraphNode], Sequence[KnowledgeGraphEdge]]: + tree_sitter_nodes = [] + tree_sitter_edges = [] + + tree = tree_sitter_parser.parse(file) + if tree.root_node.has_error or tree.root_node.child_count == 0: + return next_node_id, tree_sitter_nodes, tree_sitter_edges + + ast_root_node = ASTNode( + type=tree.root_node.type, + start_line=tree.root_node.start_point[0], + end_line=tree.root_node.end_point[0], + text=tree.root_node.text.decode("utf-8"), + ) + kg_ast_root_node = KnowledgeGraphNode(next_node_id, ast_root_node) + next_node_id += 1 + tree_sitter_nodes.append(kg_ast_root_node) + tree_sitter_edges.append( + KnowledgeGraphEdge(parent_node, kg_ast_root_node, KnowledgeGraphEdgeType.has_ast) + ) + + node_stack = deque() + node_stack.append((tree.root_node, kg_ast_root_node, 1)) + while node_stack: + tree_sitter_node, kg_node, depth = node_stack.pop() + + if depth > self.max_ast_depth: + continue + + for tree_sitter_child_node in tree_sitter_node.children: + child_ast_node = ASTNode( + type=tree_sitter_child_node.type, + start_line=tree_sitter_child_node.start_point[0], + end_line=tree_sitter_child_node.end_point[0], + text=tree_sitter_child_node.text.decode("utf-8"), + ) + kg_child_ast_node = KnowledgeGraphNode(next_node_id, child_ast_node) + next_node_id += 1 + + tree_sitter_nodes.append(kg_child_ast_node) + tree_sitter_edges.append( + KnowledgeGraphEdge( + kg_node, kg_child_ast_node, KnowledgeGraphEdgeType.parent_of + ) + ) + + node_stack.append((tree_sitter_child_node, kg_child_ast_node, depth + 1)) + return next_node_id, tree_sitter_nodes, tree_sitter_edges + + def _markdown_file_graph( + self, parent_node: KnowledgeGraphNode, file: Path, next_node_id: int + ) -> Tuple[int, Sequence[KnowledgeGraphNode], Sequence[KnowledgeGraphEdge]]: + headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + ("###", "Header 3"), + ] + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on + ) + text = file.open(encoding="utf-8").read() + documents = markdown_splitter.split_text(text) + return self._documents_to_file_graph(documents, parent_node, next_node_id) + + def _documents_to_file_graph( + self, + documents: Sequence[Document], + parent_node: KnowledgeGraphNode, + next_node_id: int, + ) -> Tuple[int, Sequence[KnowledgeGraphNode], Sequence[KnowledgeGraphEdge]]: + document_nodes = [] + document_edges = [] + + previous_node = None + for document in documents: + text_node = TextNode( + text=document.page_content, + metadata=str(document.metadata) if document.metadata else "", + ) + kg_text_node = KnowledgeGraphNode(next_node_id, text_node) + next_node_id += 1 + document_nodes.append(kg_text_node) + document_edges.append( + KnowledgeGraphEdge(parent_node, kg_text_node, KnowledgeGraphEdgeType.has_text) + ) + + if previous_node: + document_edges.append( + KnowledgeGraphEdge( + previous_node, kg_text_node, KnowledgeGraphEdgeType.next_chunk + ) + ) + + previous_node = kg_text_node + return next_node_id, document_nodes, document_edges diff --git a/prometheus/knowledge_graph/types.py b/prometheus/graph/graph_types.py similarity index 95% rename from prometheus/knowledge_graph/types.py rename to prometheus/graph/graph_types.py index b3ef25d5..b60bed80 100644 --- a/prometheus/knowledge_graph/types.py +++ b/prometheus/graph/graph_types.py @@ -1,6 +1,5 @@ """Type definition for nodes and edges in the knowledge graph.""" - import dataclasses import enum from typing import TypedDict, Union @@ -42,9 +41,11 @@ class TextNode: Attributes: text: A string. + metadata: The metadata about the string. """ text: str + metadata: str @dataclasses.dataclass @@ -77,7 +78,9 @@ def to_neo4j_node(self) -> Union["Neo4jFileNode", "Neo4jASTNode", "Neo4jTextNode text=self.node.text, ) case TextNode(): - return Neo4jTextNode(node_id=self.node_id, text=self.node.text) + return Neo4jTextNode( + node_id=self.node_id, text=self.node.text, metadata=self.node.metadata + ) case _: raise ValueError("Unknown KnowledgeGraphNode.node type") @@ -163,6 +166,7 @@ class Neo4jASTNode(TypedDict): class Neo4jTextNode(TypedDict): node_id: int text: str + metadata: str class Neo4jHasFileEdge(TypedDict): diff --git a/prometheus/graph/knowledge_graph.py b/prometheus/graph/knowledge_graph.py new file mode 100644 index 00000000..df612e76 --- /dev/null +++ b/prometheus/graph/knowledge_graph.py @@ -0,0 +1,66 @@ +from collections import deque +import logging +from pathlib import Path + +from prometheus.graph.file_graph_builder import FileGraphBuilder +from prometheus.graph.graph_types import ( + FileNode, + KnowledgeGraphEdge, + KnowledgeGraphEdgeType, + KnowledgeGraphNode, +) + + +class KnowledgeGraph: + def __init__(self, root_dir: Path, max_ast_depth: int): + self.max_ast_depth = max_ast_depth + self._next_node_id = 0 + self._root_node = None + self._knowledge_graph_nodes = [] + self._knowledge_graph_edges = [] + self._file_graph_builder = FileGraphBuilder(max_ast_depth) + self._logger = logging.getLogger("prometheus.graph.knowledge_graph") + + self._build_graph(root_dir) + + def _build_graph(self, root_dir: Path): + root_dir_node = FileNode(basename=root_dir.name, relative_path=".") + kg_root_dir_node = KnowledgeGraphNode(self._next_node_id, root_dir_node) + self._next_node_id += 1 + self._knowledge_graph_nodes.append(kg_root_dir_node) + self._root_node = kg_root_dir_node + + file_stack = deque() + file_stack.append((root_dir, kg_root_dir_node)) + + while file_stack: + file, kg_file_path_node = file_stack.pop() + + if file.is_dir(): + for child_file in sorted(file.iterdir()): + child_file_node = FileNode( + basename=child_file.name, + relative_path=str(child_file.relative_to(root_dir)), + ) + kg_child_file_node = KnowledgeGraphNode(self._next_node_id, child_file_node) + self._next_node_id += 1 + self._knowledge_graph_nodes.append(kg_child_file_node) + self._knowledge_graph_edges.append( + KnowledgeGraphEdge( + kg_file_path_node, kg_child_file_node, KnowledgeGraphEdgeType.has_file + ) + ) + + file_stack.append((child_file, kg_child_file_node)) + continue + + if self._file_graph_builder.supports_file(file): + next_node_id, kg_nodes, kg_edges = self._file_graph_builder.build_file_graph( + kg_file_path_node, file, self._next_node_id + ) + self._next_node_id = next_node_id + self._knowledge_graph_nodes.extend(kg_nodes) + self._knowledge_graph_edges.extend(kg_edges) + continue + + self._logger.info(f"Skip parsing {file} because it is not supported.") diff --git a/tests/knowledge_graph/__init__.py b/prometheus/parser/__init__.py similarity index 100% rename from tests/knowledge_graph/__init__.py rename to prometheus/parser/__init__.py diff --git a/prometheus/parser/file_types.py b/prometheus/parser/file_types.py new file mode 100644 index 00000000..c42a9665 --- /dev/null +++ b/prometheus/parser/file_types.py @@ -0,0 +1,50 @@ +import enum +from pathlib import Path + + +class FileType(enum.StrEnum): + """Enum of all tree-sitter supported file types""" + + BASH = "bash" + C = "c" + CSHARP = "csharp" + CPP = "cpp" + GO = "go" + JAVA = "java" + JAVASCRIPT = "javascript" + KOTLIN = "kotlin" + PHP = "php" + PYTHON = "python" + SQL = "sql" + YAML = "yaml" + UNKNOWN = "UNKNOWN" + + @classmethod + def from_path(cls, path: Path): + match path.suffix: + case ".sh": + return cls.BASH + case ".c": + return cls.C + case ".cs": + return cls.CSHARP + case ".cpp" | ".cc" | ".cxx": + return cls.CPP + case ".go": + return cls.GO + case ".java": + return cls.JAVA + case ".js": + return cls.JAVASCRIPT + case ".kt": + return cls.KOTLIN + case ".php": + return cls.PHP + case ".py": + return cls.PYTHON + case ".sql": + return cls.SQL + case ".yaml" | ".yml": + return cls.YAML + case _: + return cls.UNKNOWN diff --git a/prometheus/parser/tree_sitter_parser.py b/prometheus/parser/tree_sitter_parser.py new file mode 100644 index 00000000..3f3940a8 --- /dev/null +++ b/prometheus/parser/tree_sitter_parser.py @@ -0,0 +1,44 @@ +from pathlib import Path + +from tree_sitter._binding import Tree +from tree_sitter_languages import get_parser + +from prometheus.parser.file_types import FileType + + +class FileNotSupportedError(Exception): + pass + + +FILE_TYPE_TO_LANG = { + FileType.BASH: "bash", + FileType.C: "c", + FileType.CSHARP: "c_sharp", + FileType.CPP: "cpp", + FileType.GO: "go", + FileType.JAVA: "java", + FileType.JAVASCRIPT: "javascript", + FileType.KOTLIN: "kotlin", + FileType.PHP: "php", + FileType.PYTHON: "python", + FileType.SQL: "sql", + FileType.YAML: "yaml", +} + + +def supports_file(file: Path) -> bool: + file_type = FileType.from_path(file) + return file_type in FILE_TYPE_TO_LANG + + +def parse(file: Path) -> Tree: + file_type = FileType.from_path(file) + lang = FILE_TYPE_TO_LANG.get(file_type, None) + if lang is None: + raise FileNotSupportedError( + f"{file_type.value} is not supported by tree_sitter_parser" + ) + + lang_parser = get_parser(lang) + with file.open("rb") as f: + return lang_parser.parse(f.read()) diff --git a/pyproject.toml b/pyproject.toml index 7354aa06..f0b31d20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,11 @@ build-backend = "hatchling.build" [project] name = "Prometheus" version = "0.0.1" +dependencies = [ + "langchain==0.3.3", + "tree-sitter==0.21.3", + "tree-sitter-languages==1.10.2", +] requires-python = ">= 3.11" [project.optional-dependencies] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2fe5b588..00000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pyink -isort \ No newline at end of file diff --git a/tests/graph/__init__.py b/tests/graph/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/graph/test_file_graph_builder.py b/tests/graph/test_file_graph_builder.py new file mode 100644 index 00000000..75e11a99 --- /dev/null +++ b/tests/graph/test_file_graph_builder.py @@ -0,0 +1,107 @@ +from prometheus.graph.file_graph_builder import FileGraphBuilder +from prometheus.graph.graph_types import ( + ASTNode, + KnowledgeGraphEdgeType, + KnowledgeGraphNode, + TextNode, +) +from tests.test_utils import test_project_paths + + +def test_supports_file(): + file_graph_builder = FileGraphBuilder(0) + + assert file_graph_builder.supports_file(test_project_paths.C_FILE) + assert file_graph_builder.supports_file(test_project_paths.JAVA_FILE) + assert file_graph_builder.supports_file(test_project_paths.MD_FILE) + assert file_graph_builder.supports_file(test_project_paths.PYTHON_FILE) + + assert file_graph_builder.supports_file(test_project_paths.DUMMY_FILE) is False + + +def test_build_python_file_graph(): + file_graph_builder = FileGraphBuilder(1000) + + parent_kg_node = KnowledgeGraphNode(0, None) + next_node_id, kg_nodes, kg_edges = file_graph_builder.build_file_graph( + parent_kg_node, test_project_paths.PYTHON_FILE, 0 + ) + + assert next_node_id == 11 + assert len(kg_nodes) == 11 + assert len(kg_edges) == 11 + + # Test if some of the nodes exists + argument_list_ast_node = ASTNode( + type="argument_list", start_line=0, end_line=0, text='("Hello world!")' + ) + string_ast_node = ASTNode( + type="string", start_line=0, end_line=0, text='"Hello world!"' + ) + + found_argument_list_ast_node = False + for kg_node in kg_nodes: + if kg_node.node == argument_list_ast_node: + found_argument_list_ast_node = True + assert found_argument_list_ast_node + + found_string_ast_node = False + for kg_node in kg_nodes: + if kg_node.node == string_ast_node: + found_string_ast_node = True + assert found_string_ast_node + + # Test if some of the edges exists + found_edge = False + for kg_edge in kg_edges: + if ( + kg_edge.source.node == argument_list_ast_node + and kg_edge.target.node == string_ast_node + and kg_edge.type == KnowledgeGraphEdgeType.parent_of + ): + found_edge = True + assert found_edge + + +def test_build_markdown_file_graph(): + file_graph_builder = FileGraphBuilder(1000) + + parent_kg_node = KnowledgeGraphNode(0, None) + next_node_id, kg_nodes, kg_edges = file_graph_builder.build_file_graph( + parent_kg_node, test_project_paths.MD_FILE, 0 + ) + + assert next_node_id == 4 + assert len(kg_nodes) == 4 + assert len(kg_edges) == 7 + + # Test if some of the nodes exists + header_b_text_node = TextNode( + text="Text under header B.", metadata="{'Header 1': 'A', 'Header 2': 'B'}" + ) + header_c_text_node = TextNode( + text="Text under header C.", metadata="{'Header 1': 'A', 'Header 2': 'C'}" + ) + + found_header_b_text_node = False + for kg_node in kg_nodes: + if kg_node.node == header_b_text_node: + found_header_b_text_node = True + assert found_header_b_text_node + + found_header_c_text_node = False + for kg_node in kg_nodes: + if kg_node.node == header_c_text_node: + found_header_c_text_node = True + assert found_header_c_text_node + + # Test if some of the edges exists + found_edge = False + for kg_edge in kg_edges: + if ( + kg_edge.source.node == header_b_text_node + and kg_edge.target.node == header_c_text_node + and kg_edge.type == KnowledgeGraphEdgeType.next_chunk + ): + found_edge = True + assert found_edge diff --git a/tests/knowledge_graph/test_types.py b/tests/graph/test_graph_types.py similarity index 93% rename from tests/knowledge_graph/test_types.py rename to tests/graph/test_graph_types.py index 6a3d1be8..a7059ea9 100644 --- a/tests/knowledge_graph/test_types.py +++ b/tests/graph/test_graph_types.py @@ -1,4 +1,4 @@ -from prometheus.knowledge_graph.types import ( +from prometheus.graph.graph_types import ( ASTNode, FileNode, KnowledgeGraphEdge, @@ -56,9 +56,10 @@ def test_to_neo4j_ast_node(): def test_to_neo4j_text_node(): text = "Hello world" + metadata = "metadata" node_id = 1 - text_node = TextNode(text) + text_node = TextNode(text, metadata) knowldege_graph_node = KnowledgeGraphNode(node_id, text_node) neo4j_text_node = knowldege_graph_node.to_neo4j_node() @@ -66,9 +67,11 @@ def test_to_neo4j_text_node(): assert "node_id" in neo4j_text_node assert "text" in neo4j_text_node + assert "metadata" in neo4j_text_node assert neo4j_text_node["node_id"] == node_id assert neo4j_text_node["text"] == text + assert neo4j_text_node["metadata"] == metadata def test_to_neo4j_has_file_edge(): @@ -172,11 +175,12 @@ def test_to_neo4j_has_text_edge(): source_relative_path = "foo/bar/source.py" source_node_id = 1 target_text = "Hello world" + target_metadata = "metadata" target_node_id = 10 source_file_node = FileNode(source_basename, source_relative_path) source_knowledge_graph_node = KnowledgeGraphNode(source_node_id, source_file_node) - target_text_node = TextNode(target_text) + target_text_node = TextNode(target_text, target_metadata) target_knowledge_graph_node = KnowledgeGraphNode(target_node_id, target_text_node) knowledge_graph_edge = KnowledgeGraphEdge( source_knowledge_graph_node, @@ -196,13 +200,15 @@ def test_to_neo4j_has_text_edge(): def test_to_neo4j_next_chunk_edge(): source_text = "Hello" + source_metadata = "meta" source_node_id = 1 target_text = "world" + target_metadata = "data" target_node_id = 10 - source_text_node = TextNode(source_text) + source_text_node = TextNode(source_text, source_metadata) source_knowledge_graph_node = KnowledgeGraphNode(source_node_id, source_text_node) - target_text_node = TextNode(target_text) + target_text_node = TextNode(target_text, target_metadata) target_knowledge_graph_node = KnowledgeGraphNode(target_node_id, target_text_node) knowledge_graph_edge = KnowledgeGraphEdge( source_knowledge_graph_node, diff --git a/tests/graph/test_knowledge_graph.py b/tests/graph/test_knowledge_graph.py new file mode 100644 index 00000000..0f010682 --- /dev/null +++ b/tests/graph/test_knowledge_graph.py @@ -0,0 +1,75 @@ +from prometheus.graph.graph_types import ( + ASTNode, + FileNode, + KnowledgeGraphEdgeType, + TextNode, +) +from prometheus.graph.knowledge_graph import KnowledgeGraph +from tests.test_utils import test_project_paths + + +def test_build_graph(): + knowledge_graph = KnowledgeGraph(test_project_paths.TEST_PROJECT_PATH, 1000) + + assert knowledge_graph._next_node_id == 97 + # 8 FileNode + # 85 ASTnode + # 4 TextNode + assert len(knowledge_graph._knowledge_graph_nodes) == 97 + assert len(knowledge_graph._knowledge_graph_edges) == 99 + + file_nodes = [ + kg_node + for kg_node in knowledge_graph._knowledge_graph_nodes + if isinstance(kg_node.node, FileNode) + ] + assert len(file_nodes) == 8 + + ast_nodes = [ + kg_node + for kg_node in knowledge_graph._knowledge_graph_nodes + if isinstance(kg_node.node, ASTNode) + ] + assert len(ast_nodes) == 85 + + text_nodes = [ + kg_node + for kg_node in knowledge_graph._knowledge_graph_nodes + if isinstance(kg_node.node, TextNode) + ] + assert len(text_nodes) == 4 + + parent_of_edges = [ + kg_edge + for kg_edge in knowledge_graph._knowledge_graph_edges + if kg_edge.type == KnowledgeGraphEdgeType.parent_of + ] + assert len(parent_of_edges) == 82 + + has_file_edges = [ + kg_edge + for kg_edge in knowledge_graph._knowledge_graph_edges + if kg_edge.type == KnowledgeGraphEdgeType.has_file + ] + assert len(has_file_edges) == 7 + + has_ast_edges = [ + kg_edge + for kg_edge in knowledge_graph._knowledge_graph_edges + if kg_edge.type == KnowledgeGraphEdgeType.has_ast + ] + assert len(has_ast_edges) == 3 + + has_text_edges = [ + kg_edge + for kg_edge in knowledge_graph._knowledge_graph_edges + if kg_edge.type == KnowledgeGraphEdgeType.has_text + ] + assert len(has_text_edges) == 4 + + next_chunk_edges = [ + kg_edge + for kg_edge in knowledge_graph._knowledge_graph_edges + if kg_edge.type == KnowledgeGraphEdgeType.next_chunk + ] + assert len(next_chunk_edges) == 3 diff --git a/tests/test_project/bar/test.java b/tests/test_project/bar/test.java new file mode 100644 index 00000000..5102f049 --- /dev/null +++ b/tests/test_project/bar/test.java @@ -0,0 +1,5 @@ +public class test { + public static void main(String[] args) { + System.out.println("Hello world!"); + } +} \ No newline at end of file diff --git a/tests/test_project/bar/test.py b/tests/test_project/bar/test.py new file mode 100644 index 00000000..f1a18139 --- /dev/null +++ b/tests/test_project/bar/test.py @@ -0,0 +1 @@ +print("Hello world!") diff --git a/tests/test_project/foo/test.dummy b/tests/test_project/foo/test.dummy new file mode 100644 index 00000000..ebbf1603 --- /dev/null +++ b/tests/test_project/foo/test.dummy @@ -0,0 +1,11 @@ +@program Start + +use library console_output + +start { + declare text message = "Hello world!"; + print_to_console(message); + halt; +} + +end @program \ No newline at end of file diff --git a/tests/test_project/foo/test.md b/tests/test_project/foo/test.md new file mode 100644 index 00000000..2e3bc6cd --- /dev/null +++ b/tests/test_project/foo/test.md @@ -0,0 +1,15 @@ +# A + +Text under header A. + +## B + +Text under header B. + +## C + +Text under header C. + +### D + +Text under header D. \ No newline at end of file diff --git a/tests/test_project/test.c b/tests/test_project/test.c new file mode 100644 index 00000000..822faf74 --- /dev/null +++ b/tests/test_project/test.c @@ -0,0 +1,6 @@ +#include + +int main() { + printf("Hello world!\n"); + return 0; +} \ No newline at end of file diff --git a/tests/test_utils/__init__.py b/tests/test_utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_utils/test_project_paths.py b/tests/test_utils/test_project_paths.py new file mode 100644 index 00000000..4f5e090f --- /dev/null +++ b/tests/test_utils/test_project_paths.py @@ -0,0 +1,11 @@ +from pathlib import Path + + +TEST_PROJECT_PATH = Path(__file__).parent.parent / "test_project" +C_FILE = TEST_PROJECT_PATH / "test.c" +BAR_DIR = TEST_PROJECT_PATH / "bar" +JAVA_FILE = BAR_DIR / "test.java" +PYTHON_FILE = BAR_DIR / "test.py" +FOO_DIR = TEST_PROJECT_PATH / "foo" +MD_FILE = FOO_DIR / "test.md" +DUMMY_FILE = FOO_DIR / "test.dummy"