# Scopium : Expanding the scope of your codebase

### Features of Scopium:
- Automatic chunking and storing of the entire codebase based on relationship between files(imports, directory levels, symbol types)  
- Efficient retrieving system performed with a hybrid approach in mind

### Dataset - The codebase needed by the user
- When the root directory of the codebase is given as the argument, it converts it to a networkx graph capturing all the mentioned relationship between codes. 
- This is then loaded to an arangoDB

### Step0 - Installs and imports

In [150]:
!pip install nx-arangodb
!pip install nx-cugraph-cu12 --extra-index-url https://pypi.nvidia.com # Requires CUDA-capable GPU
!pip install --upgrade langchain langchain-community langchain-openai langgraph langchain_mistralai
!pip install networkx==3.4

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting nx-cugraph-cu12
  Downloading https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-25.2.0-py3-none-any.whl (160 kB)
INFO: pip is looking at multiple versions of nx-cugraph-cu12 to determine which version is compatible with other requirements. This could take a while.
  Downloading https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.12.0-py3-none-any.whl (152 kB)
  Downloading https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.10.0-py3-none-any.whl (149 kB)
  Downloading https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.8.0-py3-none-any.whl (140 kB)
  Downloading https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.6.1-py3-none-any.whl (129 kB)
  Downloading https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.6.0-py3-none-any.whl (129 kB)
  Downloading https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.4.0-py3-none-any.whl (125 kB)
  Downloading https://pypi.nv

#### Imports:

In [151]:
import networkx as nx
import matplotlib.pyplot as plt
import random
import os
import ast
from typing import Dict, Set, List, Tuple, Optional,Any
import json
from arango import ArangoClient
import nx_arangodb as nxadb
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_openai import ChatOpenAI
from langchain_community.graphs import ArangoGraph
from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain
from langchain_core.tools import tool
from langchain_mistralai import ChatMistralAI
import glob
import re

#### Building the graph - 
- This code takes the root directory of the codebase as the input. 
- It then builds the graph based on the mentioned features and storing it in appropriate nodes and edges.

In [None]:
# Define global data structures
def initialize_data_structures():
    data = {
        'root_dir': '',
        'graph': nx.DiGraph(),
        'file_contents': {},  # file -> content
        'import_relations': {},  # file -> [(module, line_no)]
        'module_symbols': {},  # file -> {symbol -> {type, line_no, context}}
        'symbol_references': {},  # symbol -> [(file, line_no, context)]
        'file_index': {},  # Maps files to indices
        'current_index': 0,
        'directories': set(),
        'symbol_index': {},  # symbol -> [{file, type, line_no, context}]
        'supported_languages': ["python", "cpp", "java", "go"],
        'language_extensions': {
            "python": [".py"],
            "cpp": [".c", ".cpp", ".h", ".hpp", ".cc", ".cxx", ".hxx"],
            "java": [".java"],
            "go": [".go"]
        }
    }
    return data

#### Core utility functions for code processing
- Functions for indexing, chunking, context extraction, and language detection

In [None]:
def get_next_index(data):
    """Get next available index for file indexing."""
    data['current_index'] += 1
    return data['current_index']

def chunk_code(code, lines_per_chunk=20):
    """
    Chunk the given code into snippets.
    Returns a list of dictionaries with 'code_snippet', 'start_line', and 'end_line'.
    """
    lines = code.splitlines()
    chunks = []
    for i in range(0, len(lines), lines_per_chunk):
        chunk_lines = lines[i:i + lines_per_chunk]
        chunk = {
            'code_snippet': '\n'.join(chunk_lines),
            'start_line': i + 1,
            'end_line': i + len(chunk_lines)
        }
        chunks.append(chunk)
    return chunks

def get_context_around_line(data, file_path, line_no, context_lines=3):
    """Extract context around a specific line in a file."""
    if file_path not in data['file_contents']:
        return ""
    
    lines = data['file_contents'][file_path].splitlines()
    start = max(0, line_no - context_lines - 1)
    end = min(len(lines), line_no + context_lines)
    
    context = "\n".join(lines[start:end])
    return context

def detect_language(data, file_path):
    """Detect the programming language of a file based on its extension."""
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()
    
    for language, extensions in data['language_extensions'].items():
        if ext in extensions:
            return language
            
    return "unknown"

#### Python AST analysis functions
- Extract source code from Python AST nodes
- Analyze Python files for imports and symbol definitions

In [None]:
def extract_python_node_source(source, node):
    """Extract the source code for a Python AST node."""
    try:
        lines = source.splitlines()
        if hasattr(node, 'lineno') and hasattr(node, 'end_lineno'):
            start = node.lineno - 1
            end = getattr(node, 'end_lineno', start + 1)
            return '\n'.join(lines[start:end])
        return ""
    except Exception:
        return ""

def analyze_python_file(data, file_path, content):
    """Analyze a Python file for imports and symbols."""
    try:
        tree = ast.parse(content)
        imports = []
        symbols = {}

        for node in ast.walk(tree):
            # Track imports
            if isinstance(node, (ast.Import, ast.ImportFrom)):
                if isinstance(node, ast.Import):
                    for name in node.names:
                        imports.append((name.name, node.lineno))
                else:  # ImportFrom
                    module = node.module if node.module else ''
                    for name in node.names:
                        imports.append((f"{module}.{name.name}" if module else name.name, node.lineno))

            # Track defined symbols with line numbers and context
            elif isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.Assign)):
                if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                    symbol_name = node.name
                    symbol_type = 'class' if isinstance(node, ast.ClassDef) else 'function'
                    line_no = node.lineno
                    context = extract_python_node_source(content, node)
                    
                    symbols[symbol_name] = {
                        'type': symbol_type,
                        'line_no': line_no,
                        'context': context,
                        'docstring': ast.get_docstring(node)
                    }
                elif isinstance(node, ast.Assign):
                    # Handle variable assignments
                    for target in node.targets:
                        if isinstance(target, ast.Name):
                            symbol_name = target.id
                            line_no = node.lineno
                            context = extract_python_node_source(content, node)
                            
                            symbols[symbol_name] = {
                                'type': 'variable',
                                'line_no': line_no,
                                'context': context
                            }

        data['import_relations'][file_path] = imports
        data['module_symbols'][file_path] = symbols

    except Exception as e:
        print(f"Error analyzing Python file {file_path}: {e}")

#### Language-specific code analyzers
- Extracts imports, symbols, and structure from C++, Java, and Go files
- Uses regex patterns to identify language constructs like classes, functions, and namespaces

In [None]:
def analyze_cpp_file(data, file_path, content):
    """Analyze a C/C++ file for includes and symbols."""
    imports = []
    symbols = {}
    
    # Process content line by line
    lines = content.splitlines()
    
    # Regular expressions for C/C++ code analysis
    include_pattern = re.compile(r'#include\s+[<"]([^>"]+)[>"]')
    class_pattern = re.compile(r'(?:class|struct)\s+(\w+)')
    function_pattern = re.compile(r'(\w+)\s*\([^)]*\)\s*(?:const|override|final|noexcept)?\s*(?:{|;)')
    namespace_pattern = re.compile(r'namespace\s+(\w+)')
    
    for line_no, line in enumerate(lines, 1):
        # Find include statements
        include_match = include_pattern.search(line)
        if include_match:
            imports.append((include_match.group(1), line_no))
        
        # Find class/struct definitions
        class_match = class_pattern.search(line)
        if class_match:
            class_name = class_match.group(1)
            context = get_context_around_line(data, file_path, line_no)
            symbols[class_name] = {
                'type': 'class',
                'line_no': line_no,
                'context': context
            }
        
        # Find function definitions (simplified)
        function_match = function_pattern.search(line)
        if function_match and not line.strip().startswith('#') and not line.strip().startswith('//'):
            function_name = function_match.group(1)
            # Skip some common keywords that might be mistaken for functions
            if function_name not in ['if', 'while', 'for', 'switch', 'return']:
                context = get_context_around_line(data, file_path, line_no)
                symbols[function_name] = {
                    'type': 'function',
                    'line_no': line_no,
                    'context': context
                }
        
        # Find namespace definitions
        namespace_match = namespace_pattern.search(line)
        if namespace_match:
            namespace_name = namespace_match.group(1)
            context = get_context_around_line(data, file_path, line_no)
            symbols[namespace_name] = {
                'type': 'namespace',
                'line_no': line_no,
                'context': context
            }
    
    data['import_relations'][file_path] = imports
    data['module_symbols'][file_path] = symbols

def analyze_java_file(data, file_path, content):
    """Analyze a Java file for imports and symbols."""
    imports = []
    symbols = {}
    
    # Process content line by line
    lines = content.splitlines()
    
    # Regular expressions for Java code analysis
    package_pattern = re.compile(r'package\s+([\w.]+)')
    import_pattern = re.compile(r'import\s+([\w.]+(?:\.\*)?)')
    class_pattern = re.compile(r'(?:public|private|protected)?\s*(?:abstract|final)?\s*class\s+(\w+)')
    interface_pattern = re.compile(r'(?:public|private|protected)?\s*interface\s+(\w+)')
    method_pattern = re.compile(r'(?:public|private|protected)?\s*(?:static|final|abstract)?\s*(?:[\w<>[\],\s]+)\s+(\w+)\s*\([^)]*\)')
    
    for line_no, line in enumerate(lines, 1):
        # Find package declaration
        package_match = package_pattern.search(line)
        if package_match:
            package_name = package_match.group(1)
            imports.append((package_name, line_no))
        
        # Find import statements
        import_match = import_pattern.search(line)
        if import_match:
            import_name = import_match.group(1)
            imports.append((import_name, line_no))
        
        # Find class definitions
        class_match = class_pattern.search(line)
        if class_match:
            class_name = class_match.group(1)
            context = get_context_around_line(data, file_path, line_no)
            symbols[class_name] = {
                'type': 'class',
                'line_no': line_no,
                'context': context
            }
        
        # Find interface definitions
        interface_match = interface_pattern.search(line)
        if interface_match:
            interface_name = interface_match.group(1)
            context = get_context_around_line(data, file_path, line_no)
            symbols[interface_name] = {
                'type': 'interface',
                'line_no': line_no,
                'context': context
            }
        
        # Find method definitions
        method_match = method_pattern.search(line)
        if method_match:
            method_name = method_match.group(1)
            # Skip some common keywords that might be mistaken for methods
            if method_name not in ['if', 'while', 'for', 'switch', 'return']:
                context = get_context_around_line(data, file_path, line_no)
                symbols[method_name] = {
                    'type': 'method',
                    'line_no': line_no,
                    'context': context
                }
    
    data['import_relations'][file_path] = imports
    data['module_symbols'][file_path] = symbols

def analyze_go_file(data, file_path, content):
    """Analyze a Go file for imports and symbols."""
    imports = []
    symbols = {}
    
    # Process content line by line
    lines = content.splitlines()
    
    # Regular expressions for Go code analysis
    package_pattern = re.compile(r'package\s+(\w+)')
    import_single_pattern = re.compile(r'import\s+"([^"]+)"')
    import_multi_start_pattern = re.compile(r'import\s+\(')
    import_multi_line_pattern = re.compile(r'\s*"([^"]+)"')
    func_pattern = re.compile(r'func\s+(?:\([^)]+\)\s+)?(\w+)')
    struct_pattern = re.compile(r'type\s+(\w+)\s+struct')
    interface_pattern = re.compile(r'type\s+(\w+)\s+interface')
    
    in_import_block = False
    
    for line_no, line in enumerate(lines, 1):
        # Find package declaration
        package_match = package_pattern.search(line)
        if package_match:
            package_name = package_match.group(1)
            imports.append((f"package {package_name}", line_no))
        
        # Handle single-line imports
        import_match = import_single_pattern.search(line)
        if import_match:
            import_name = import_match.group(1)
            imports.append((import_name, line_no))
        
        # Handle multi-line imports
        if import_multi_start_pattern.search(line):
            in_import_block = True
            continue
        
        if in_import_block:
            if line.strip() == ')':
                in_import_block = False
                continue
                
            import_line_match = import_multi_line_pattern.search(line)
            if import_line_match:
                import_name = import_line_match.group(1)
                imports.append((import_name, line_no))
        
        # Find function definitions
        func_match = func_pattern.search(line)
        if func_match:
            func_name = func_match.group(1)
            context = get_context_around_line(data, file_path, line_no)
            symbols[func_name] = {
                'type': 'function',
                'line_no': line_no,
                'context': context
            }
        
        # Find struct definitions
        struct_match = struct_pattern.search(line)
        if struct_match:
            struct_name = struct_match.group(1)
            context = get_context_around_line(data, file_path, line_no)
            symbols[struct_name] = {
                'type': 'struct',
                'line_no': line_no,
                'context': context
            }
        
        # Find interface definitions
        interface_match = interface_pattern.search(line)
        if interface_match:
            interface_name = interface_match.group(1)
            context = get_context_around_line(data, file_path, line_no)
            symbols[interface_name] = {
                'type': 'interface',
                'line_no': line_no,
                'context': context
            }
    
    data['import_relations'][file_path] = imports
    data['module_symbols'][file_path] = symbols

#### Analysize the file
- Calls the respective language function based on the codebase

In [None]:
def analyze_file(data, file_path, content, language):
    """Analyze a file for imports and symbols with line numbers and context."""
    if language == "python":
        analyze_python_file(data, file_path, content)
    elif language == "cpp":
        analyze_cpp_file(data, file_path, content)
    elif language == "java":
        analyze_java_file(data, file_path, content)
    elif language == "go":
        analyze_go_file(data, file_path, content)

#### Symbol reference analyzers
- Identifies variable and function references across Python, Go, C++, and Java files
- Uses AST parsing for Python and regex pattern matching for other languages
- Tracks references with file location and surrounding context for better understanding

In [None]:
def find_references_in_python_file(data, file_path, content):
    """Find references to symbols in a Python file."""
    try:
        tree = ast.parse(content)
        
        for node in ast.walk(tree):
            # Find variable references
            if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
                symbol_name = node.id
                line_no = node.lineno
                
                # Track reference with context
                if symbol_name not in data['symbol_references']:
                    data['symbol_references'][symbol_name] = []
                
                context = get_context_around_line(data, file_path, line_no)
                data['symbol_references'][symbol_name].append((file_path, line_no, context))
            
            # Find attribute references (e.g., obj.method())
            elif isinstance(node, ast.Attribute) and isinstance(node.ctx, ast.Load):
                attr_name = node.attr
                line_no = node.lineno
                
                if attr_name not in data['symbol_references']:
                    data['symbol_references'][attr_name] = []
                
                context = get_context_around_line(data, file_path, line_no)
                data['symbol_references'][attr_name].append((file_path, line_no, context))
    
    except Exception as e:
        print(f"Error finding references in Python file {file_path}: {e}")

def find_references_in_go_file(data, file_path, content):
    """Find references to symbols in a Go file with optimized performance."""
    # Get all symbol names from all files to check for references
    all_symbols = set()
    for symbols_dict in data['module_symbols'].values():
        all_symbols.update(symbols_dict.keys())
    
    # Skip if no symbols to check or file is empty
    if not all_symbols or not content:
        return
    
    # Pre-compile all regex patterns
    symbol_patterns = {}
    for symbol_name in all_symbols:
        # Only create patterns for symbols with reasonable length (avoid single-character symbols)
        if len(symbol_name) > 2:
            symbol_patterns[symbol_name] = re.compile(r'\b' + re.escape(symbol_name) + r'\b')
    
    # Process content line by line
    lines = content.splitlines()
    
    # Skip definition lines for this file
    definition_lines = {}
    if file_path in data['module_symbols']:
        for symbol, details in data['module_symbols'][file_path].items():
            definition_lines[details['line_no']] = symbol
    
    for line_no, line in enumerate(lines, 1):
        # Skip comment lines and import/package declarations
        if (line.strip().startswith("//") or 
            line.strip().startswith("/*") or 
            line.strip().startswith("import ") or 
            line.strip().startswith("package ")):
            continue
        
        # Skip if this line is a symbol definition
        if line_no in definition_lines:
            continue
        
        # Check for symbol references
        for symbol_name, pattern in symbol_patterns.items():
            if pattern.search(line):
                # Skip if this is a definition line for this symbol
                if (file_path in data['module_symbols'] and 
                    symbol_name in data['module_symbols'][file_path] and 
                    data['module_symbols'][file_path][symbol_name]['line_no'] == line_no):
                    continue
                
                if symbol_name not in data['symbol_references']:
                    data['symbol_references'][symbol_name] = []
                
                context = get_context_around_line(data, file_path, line_no)
                data['symbol_references'][symbol_name].append((file_path, line_no, context))


def find_references_in_cpp_file(data, file_path, content):
    """Find references to symbols in a C/C++ file with optimized performance."""
    # Get all symbol names from all files to check for references
    all_symbols = set()
    for symbols_dict in data['module_symbols'].values():
        all_symbols.update(symbols_dict.keys())
    
    # Skip if no symbols to check or file is empty
    if not all_symbols or not content:
        return
    
    # Pre-compile all regex patterns for symbols with meaningful length
    symbol_patterns = {}
    for symbol_name in all_symbols:
        # Skip very short symbols that would cause many false positives
        if len(symbol_name) > 2:
            symbol_patterns[symbol_name] = re.compile(r'\b' + re.escape(symbol_name) + r'\b')
    
    # Process content line by line
    lines = content.splitlines()
    
    # Skip definition lines for this file
    definition_lines = {}
    if file_path in data['module_symbols']:
        for symbol, details in data['module_symbols'][file_path].items():
            definition_lines[details['line_no']] = symbol
    
    for line_no, line in enumerate(lines, 1):
        # Skip comment lines and preprocessor directives
        if (line.strip().startswith("//") or 
            line.strip().startswith("/*") or 
            line.strip().startswith("#")):
            continue
        
        # Skip if this line is a symbol definition
        if line_no in definition_lines:
            continue
        
        # Check for symbol references
        for symbol_name, pattern in symbol_patterns.items():
            if pattern.search(line):
                # Skip if this is a definition line for this symbol
                if (file_path in data['module_symbols'] and 
                    symbol_name in data['module_symbols'][file_path] and 
                    data['module_symbols'][file_path][symbol_name]['line_no'] == line_no):
                    continue
                
                if symbol_name not in data['symbol_references']:
                    data['symbol_references'][symbol_name] = []
                
                context = get_context_around_line(data, file_path, line_no)
                data['symbol_references'][symbol_name].append((file_path, line_no, context))


def find_references_in_java_file(data, file_path, content):
    """Find references to symbols in a Java file with optimized performance."""
    # Get all symbol names from all files to check for references
    all_symbols = set()
    for symbols_dict in data['module_symbols'].values():
        all_symbols.update(symbols_dict.keys())
    
    # Skip if no symbols to check or file is empty
    if not all_symbols or not content:
        return
    
    # Pre-compile all regex patterns for symbols with meaningful length
    symbol_patterns = {}
    for symbol_name in all_symbols:
        # Skip very short symbols that would cause many false positives
        if len(symbol_name) > 2:
            symbol_patterns[symbol_name] = re.compile(r'\b' + re.escape(symbol_name) + r'\b')
    
    # Process content line by line
    lines = content.splitlines()
    
    # Skip definition lines for this file
    definition_lines = {}
    if file_path in data['module_symbols']:
        for symbol, details in data['module_symbols'][file_path].items():
            definition_lines[details['line_no']] = symbol
    
    for line_no, line in enumerate(lines, 1):
        # Skip comment lines, imports, and package declarations
        if (line.strip().startswith("//") or 
            line.strip().startswith("/*") or 
            line.strip().startswith("import ") or 
            line.strip().startswith("package ")):
            continue
        
        # Skip if this line is a symbol definition
        if line_no in definition_lines:
            continue
        
        # Check for symbol references
        for symbol_name, pattern in symbol_patterns.items():
            if pattern.search(line):
                # Skip if this is a definition line for this symbol
                if (file_path in data['module_symbols'] and 
                    symbol_name in data['module_symbols'][file_path] and 
                    data['module_symbols'][file_path][symbol_name]['line_no'] == line_no):
                    continue
                
                if symbol_name not in data['symbol_references']:
                    data['symbol_references'][symbol_name] = []
                
                context = get_context_around_line(data, file_path, line_no)
                data['symbol_references'][symbol_name].append((file_path, line_no, context))

def find_references_in_file(data, file_path, content, language):
    """Find references to symbols in a file based on its language."""
    if language == "python":
        find_references_in_python_file(data, file_path, content)
    elif language == "cpp":
        find_references_in_cpp_file(data, file_path, content)
    elif language == "java":
        find_references_in_java_file(data, file_path, content)
    elif language == "go":
        find_references_in_go_file(data, file_path, content)

#### Graph building and indexing
- Constructs a comprehensive symbol index of all definitions and references
- Parses all files in the codebase to build directory and file relationships
- Creates a graph structure with nodes for files/directories and edges for relationships

In [None]:
def build_symbol_index(data):
    """Build a comprehensive index of all symbols and where they're defined/used."""
    # Initialize the symbol index
    data['symbol_index'] = {}
    
    # First, add all symbol definitions
    for file_path, symbols in data['module_symbols'].items():
        for symbol_name, details in symbols.items():
            if symbol_name not in data['symbol_index']:
                data['symbol_index'][symbol_name] = []
            
            data['symbol_index'][symbol_name].append({
                'file': file_path,
                'type': 'definition',
                'symbol_type': details['type'],
                'line_no': details['line_no'],
                'context': details.get('context', ''),
                'docstring': details.get('docstring', '')
            })
    
    # Then, add all references
    for symbol_name, references in data['symbol_references'].items():
        if symbol_name not in data['symbol_index']:
            data['symbol_index'][symbol_name] = []
        
        for file_path, line_no, context in references:
            # Avoid duplicating references if they're already in definitions
            if not any(ref['file'] == file_path and ref['line_no'] == line_no and ref['type'] == 'definition' 
                      for ref in data['symbol_index'].get(symbol_name, [])):
                data['symbol_index'][symbol_name].append({
                    'file': file_path,
                    'type': 'reference',
                    'line_no': line_no,
                    'context': context
                })

def parse_files(data):
    """Parse all files in the directory and build relationships."""
    # First pass: Index all files and create directory nodes
    for root, dirs, files in os.walk(data['root_dir']):
        # Add directory node
        rel_dir = os.path.relpath(root, data['root_dir'])
        if rel_dir != '.':
            data['directories'].add(rel_dir)
            data['graph'].add_node(rel_dir, type='directory')
            
            # Add edge from parent directory to this directory (if not root)
            parent_dir = os.path.dirname(rel_dir)
            if parent_dir and parent_dir != '.':
                data['graph'].add_edge(parent_dir, rel_dir, edge_type='contains_directory')

        # Index files of supported languages
        for file in files:
            file_path = os.path.join(root, file)
            rel_path = os.path.relpath(file_path, data['root_dir'])
            file_language = detect_language(data, file_path)
            
            if file_language in data['supported_languages']:
                data['file_index'][rel_path] = get_next_index(data)
                
                # Add node for this file
                data['graph'].add_node(rel_path, type='file', file_index=data['file_index'][rel_path], language=file_language)
                
                # Connect file to its directory
                if rel_dir != '.':
                    data['graph'].add_edge(rel_dir, rel_path, edge_type='contains_file')
                
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                        data['file_contents'][rel_path] = content
                        analyze_file(data, rel_path, content, file_language)
                except Exception as e:
                    print(f"Error parsing {file_path}: {e}")
    
    # Second pass: Find symbol references across files
    for file_path, content in data['file_contents'].items():
        file_language = detect_language(data, file_path)
        find_references_in_file(data, file_path, content, file_language)
    
    # Build the symbol index after all analyses
    build_symbol_index(data)

#### Graph construction
- Builds NetworkX graph with enhanced node and edge information
- Creates nodes for directories, files, code snippets, and symbols
- Establishes relationships between files (imports, references) with line numbers

In [None]:
def build_graph(data):
    """Build the NetworkX graph with enhanced node and edge information."""
    # We've already added basic file and directory nodes during parsing
    # Now add more detailed connections and data
    
    # Add nodes for all directories (if not already added)
    for directory in data['directories']:
        if not data['graph'].has_node(directory):
            data['graph'].add_node(directory, type='directory')
        
        # Ensure parent directories exist and are connected
        parts = directory.split(os.sep)
        for i in range(1, len(parts)):
            parent_path = os.sep.join(parts[:i])
            if parent_path and not data['graph'].has_node(parent_path):
                data['graph'].add_node(parent_path, type='directory')
                data['directories'].add(parent_path)
            
            # Connect parent to child directory
            if parent_path:
                child_path = os.sep.join(parts[:i+1])
                data['graph'].add_edge(parent_path, child_path, edge_type='contains_directory')
    
    # Add nodes for all files with indices and code snippet nodes
    for file_path, file_idx in data['file_index'].items():
        language = detect_language(data, file_path)
        
        # Update file node if it exists, create it otherwise
        if data['graph'].has_node(file_path):
            data['graph'].nodes[file_path].update({
                'file_index': file_idx,
                'directory': os.path.dirname(file_path),
                'language': language
            })
        else:
            data['graph'].add_node(file_path, 
                               type='file',
                               file_index=file_idx,
                               directory=os.path.dirname(file_path),
                               language=language)
        
        # Connect file to its directory
        directory = os.path.dirname(file_path)
        if directory:
            # Make sure the directory node exists
            if not data['graph'].has_node(directory):
                data['graph'].add_node(directory, type='directory')
                data['directories'].add(directory)
            
            # Add edge from directory to file if it doesn't exist
            if not data['graph'].has_edge(directory, file_path):
                data['graph'].add_edge(directory, file_path, edge_type='contains_file')
        
        # Create snippet nodes for the entire file
        if file_path in data['file_contents']:
            chunks = chunk_code(data['file_contents'][file_path])
            for idx, chunk_info in enumerate(chunks):
                snippet_node = f"{file_path}::snippet::{idx}"
                data['graph'].add_node(snippet_node,
                                   type='snippet',
                                   code_snippet=chunk_info['code_snippet'],
                                   start_line=chunk_info['start_line'],
                                   end_line=chunk_info['end_line'],
                                   language=language)
                # Connect file node to snippet node
                data['graph'].add_edge(file_path, snippet_node, 
                                   edge_type='contains_snippet',
                                   start_line=chunk_info['start_line'],
                                   end_line=chunk_info['end_line'])

        # Add nodes for symbols in this file
        for symbol, details in data['module_symbols'].get(file_path, {}).items():
            symbol_node = f"{file_path}::{symbol}"
            data['graph'].add_node(symbol_node, 
                               type='symbol',
                               symbol_type=details['type'],
                               line_number=details['line_no'],
                               context=details.get('context', ''),
                               docstring=details.get('docstring', ''))
            data['graph'].add_edge(file_path, symbol_node, 
                               edge_type='defines',
                               line_number=details['line_no'])

    # Add edges for imports with line numbers
    for file_path, imports in data['import_relations'].items():
        for imp, line_no in imports:
            # Look for matching files or symbols
            for target_file, symbols in data['module_symbols'].items():
                if imp in symbols:
                    data['graph'].add_edge(file_path, 
                                       f"{target_file}::{imp}",
                                       edge_type='import',
                                       line_number=line_no)
                # For Python, handle module imports
                elif detect_language(data, file_path) == "python" and target_file.replace('.py', '').endswith(imp):
                    data['graph'].add_edge(file_path, 
                                       target_file,
                                       edge_type='import',
                                       line_number=line_no)
                # For Java, handle package imports
                elif detect_language(data, file_path) == "java" and imp.startswith(os.path.splitext(os.path.basename(target_file))[0]):
                    data['graph'].add_edge(file_path, 
                                       target_file,
                                       edge_type='import',
                                       line_number=line_no)
    
    # Add edges for symbol references
    for symbol, references in data['symbol_references'].items():
        for file_path, line_no, context in references:
            # Find symbol nodes that match this reference
            for target_file, symbols in data['module_symbols'].items():
                if symbol in symbols:
                    # Create reference edge
                    data['graph'].add_edge(file_path, 
                                       f"{target_file}::{symbol}",
                                       edge_type='references',
                                       line_number=line_no,
                                       context=context)
    
    return data['graph']

#### Graph validation and statistics
- Validates graph for consistency between data structures and graph nodes/edges
- Generates comprehensive statistics on files, directories, symbols, and connections
- Identifies and reports issues like missing nodes or incomplete relationships

In [None]:
def validate_graph_and_data(data, G):
    """Validate the graph and data structures for consistency and coverage."""
    report = {
        'stats': {
            'files': len(data['file_index']),
            'directories': len(data['directories']),
            'symbols': len(data['symbol_index']),
            'nodes': len(G.nodes()),
            'edges': len(G.edges())
        },
        'issues': []
    }
    
    # Check that all files in file_index have corresponding nodes
    for file_path in data['file_index']:
        if not G.has_node(file_path):
            report['issues'].append(f"File {file_path} in index but missing from graph")
    
    # Check that all directories have nodes
    for directory in data['directories']:
        if not G.has_node(directory):
            report['issues'].append(f"Directory {directory} in data but missing from graph")
    
    # Check symbol connections
    for symbol, entries in data['symbol_index'].items():
        definition_files = [entry['file'] for entry in entries if entry['type'] == 'definition']
        for def_file in definition_files:
            symbol_node = f"{def_file}::{symbol}"
            if not G.has_node(symbol_node):
                report['issues'].append(f"Symbol {symbol} defined in {def_file} but node missing from graph")
    
    # Count symbols by type
    symbol_types = {}
    for entries in data['symbol_index'].values():
        for entry in entries:
            if entry['type'] == 'definition' and 'symbol_type' in entry:
                symbol_type = entry['symbol_type']
                if symbol_type not in symbol_types:
                    symbol_types[symbol_type] = 0
                symbol_types[symbol_type] += 1
    
    report['stats']['symbol_types'] = symbol_types
    
    # Count edge types
    edge_types = {}
    for _, _, attrs in G.edges(data=True):
        edge_type = attrs.get('edge_type', 'unknown')
        if edge_type not in edge_types:
            edge_types[edge_type] = 0
        edge_types[edge_type] += 1
    
    report['stats']['edge_types'] = edge_types
    
    return report

#### Main function and program execution
- Initializes data structures and processes codebase starting from root directory
- Parses files, builds graph representation, and validates resulting structures
- Outputs statistics about parsed files, graph nodes/edges, and validation results

In [76]:
def main_graph_builder(root_directory):
    # Initialize all data structures
    data = initialize_data_structures()
    data['root_dir'] = root_directory
    
    print(f"Starting to parse files in {root_directory}...")
    
    # Parse all the files in the directory
    parse_files(data)
    print(f"Parsed {len(data['file_index'])} files")
    
    # Build the graph representation
    G = build_graph(data)
    print(f"Graph has {len(G.nodes())} nodes and {len(G.edges())} edges")
    
    # Validate the graph and data
    report = validate_graph_and_data(data, G)
    print(json.dumps(report, indent=2))
    
    return data, G

In [None]:
root_dir = "flask"

data, G = main_graph_builder(root_dir)

### Make Database


#### Note
- Rename the .env.temp with your creditials to write into the database

In [110]:
from dotenv import load_dotenv

load_dotenv()

ARANGO_HOST = os.getenv("ARANGO_HOST")
ARANGO_USERNAME = os.getenv("ARANGO_USERNAME")
ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD")
ARANGO_VERIFY = os.getenv("ARANGO_VERIFY") == "True"
GRAPH_NAME = os.getenv("GRAPH_NAME")
WRITE_BATCH_SIZE = int(os.getenv("WRITE_BATCH_SIZE"))

In [None]:
# Connect to ArangoDB using the credentials from the environment variables
client = ArangoClient(hosts=ARANGO_HOST)
db = client.db(username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=ARANGO_VERIFY)
print("Database connection:", db)

In [None]:
# Create the nxadb Graph object with initial graph data
G_adb = nxadb.Graph(
    name=GRAPH_NAME,
    db=db,
    incoming_graph_data=G,
    write_batch_size=WRITE_BATCH_SIZE,
    overwrite_graph=True
)

print("Graph object with data:", G_adb)

### Run from loaded database

In [111]:
# Connect to ArangoDB using the credentials from the environment variables
client = ArangoClient(hosts=ARANGO_HOST)
db = client.db(username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=ARANGO_VERIFY)
print("Database connection:", db)

Database connection: <StandardDatabase _system>


In [113]:
G_adb_loaded = nxadb.Graph(
    name=GRAPH_NAME,
    db=db,
)
print("Graph object type:", type(G_adb_loaded))

[10:46:50 +0530] [INFO]: Graph 'FlaskRepv1' exists.
[10:46:51 +0530] [INFO]: Default node type set to 'FlaskRepv1_node'


Graph object type: <class 'nx_arangodb.classes.graph.Graph'>


# LLM chatbot for querying

In [115]:
import os
import traceback
from typing import Dict, Optional, List, Any
from arango import ArangoClient
from mistralai.client import MistralClient

#### Global variables and database connections
- Stores references to database connections, clients, and models
- Maintains collections for nodes, edges, and various indexing structures
- Includes mappings between files, snippets, symbols, and their relationships

In [116]:
db = None
client = None
mistral_client = None
model = None
graph_name = None
node_collection = None
edge_collection = None
files = {}
snippets = {}
symbols = {}
db_schema = {}
node_types = {}
symbol_name_index = {}
file_to_snippets = {}
file_to_symbols = {}
snippet_to_symbols = {}
conversation_history = []
type_field = None
path_field = None
edge_type_field = None

#### Graph structure discovery
- Dynamically identifies ArangoDB graph structure and collections
- Retrieves edge definitions and node collections from the database
- Falls back to default naming patterns if explicit structure not found

In [118]:
def discover_graph_structure():
    """Dynamically discover the graph structure in ArangoDB with improved directory detection"""
    global node_collection, edge_collection
    
    try:
        # Get graph object
        graph = db.graph(graph_name)
        graph_info = graph.properties()
        
        # Get the edge collection name from graph properties
        edge_definitions = graph_info.get('edgeDefinitions', [])
        
        # If no edge definitions exist, set defaults and retry
        if not edge_definitions:
            print(f"No edge definitions found, using default naming pattern")
            node_collection = f"{graph_name}_node"
            edge_collection = f"{graph_name}_node_to_{graph_name}_node"
            print(f"Using default collections: Nodes={node_collection}, Edges={edge_collection}")
            # Validate the schema to understand the field names
            validate_schema()
            return
        
        # Get the edge collection
        edge_def = edge_definitions[0]
        edge_collection = edge_def.get('collection')
        
        # Get node collection
        from_collections = edge_def.get('from', [])
        if not from_collections:
            # No 'from' collections, use defaults
            node_collection = f"{graph_name}_nodes"
            print(f"No 'from' collections found, using default node collection: {node_collection}")
        else:
            node_collection = from_collections[0]
        
        print(f"Using collections: Nodes={node_collection}, Edges={edge_collection}")
        
        # Validate the schema to understand the field names
        validate_schema()
    except Exception as e:
        print(f"Error discovering graph structure: {str(e)}")
        traceback.print_exc()
        raise


#### Schema Validation Function
- Identifies key schema fields (type, path, edge_type) by sampling nodes and edges
- Searches through candidate field names to find the actual fields used in the database
- Handles errors and provides validation feedback through console output

In [119]:
def validate_schema():
    """Validate the schema and identify the key field names used in this database"""
    global type_field, path_field, edge_type_field
    
    try:
        # Sample nodes to understand the schema
        aql = f"""
        FOR v IN {node_collection}
        LIMIT 10
        RETURN v
        """
        cursor = db.aql.execute(aql)
        sample_nodes = [doc for doc in cursor]
        
        if not sample_nodes:
            raise ValueError(f"No nodes found in collection {node_collection}")
        
        # Identify the type field
        type_field_candidates = ['type', 'ast_type', 'node_type']
        type_field = None
        
        for field in type_field_candidates:
            for node in sample_nodes:
                if field in node:
                    type_field = field
                    print(f"Found type field: {field}")
                    break
            if type_field:
                break
                
        if not type_field:
            print("Warning: Could not identify a type field in nodes")
        
        # Identify path field
        path_field_candidates = ['path', 'file_path', 'rel_path']
        path_field = None
        
        for field in path_field_candidates:
            for node in sample_nodes:
                if field in node:
                    path_field = field
                    print(f"Found path field: {field}")
                    break
            if path_field:
                break
        
        # Sample edges to understand relationship types
        aql = f"""
        FOR e IN {edge_collection}
        LIMIT 10
        RETURN e
        """
        cursor = db.aql.execute(aql)
        sample_edges = [doc for doc in cursor]
        
        # Identify edge type field
        edge_type_field_candidates = ['edge_type', 'relation', 'relationship', 'type']
        edge_type_field = None
        
        for field in edge_type_field_candidates:
            for edge in sample_edges:
                if field in edge:
                    edge_type_field = field
                    print(f"Found edge type field: {field}")
                    break
            if edge_type_field:
                break
        
        print(f"Schema validation complete: type_field={type_field}, path_field={path_field}, edge_type_field={edge_type_field}")
    
    except Exception as e:
        print(f"Error validating schema: {str(e)}")
        traceback.print_exc()

#### Node Type Validation Function
- Verifies directory nodes exist in the graph by checking for 'directory' type nodes
- Fallbacks to alternative type field names if standard fields aren't found
- Validates edge relationships between directories, checking for 'contains_directory' edges
- Includes graceful error handling to allow process to continue even if validation issues occur

In [120]:
def validate_node_types():
    """Validate that all necessary node types are accessible in the graph"""
    try:
        # Check for directory nodes specifically
        aql = f"""
        FOR v IN {node_collection}
            FILTER v.type == 'directory'
            LIMIT 1
            RETURN v
        """
        cursor = db.aql.execute(aql)
        directories = [doc for doc in cursor]
        
        if not directories:
            print("Warning: No directory nodes found in the collection.")
            # Try alternative fields
            alternative_fields = ['ast_type', 'node_type']
            for field in alternative_fields:
                aql = f"""
                FOR v IN {node_collection}
                    FILTER v.{field} == 'directory' OR v.{field} == 'Directory'
                    LIMIT 1
                    RETURN v
                """
                cursor = db.aql.execute(aql)
                alternative_dirs = [doc for doc in cursor]
                if alternative_dirs:
                    print(f"Found directory nodes using alternate field: {field}")
                    break
        else:
            print(f"Found directory nodes successfully")
            
        # Also check for edges that connect directories
        aql = f"""
        FOR e IN {edge_collection}
            FILTER e.edge_type == 'contains_directory'
            LIMIT 1
            RETURN e
        """
        cursor = db.aql.execute(aql)
        dir_edges = [doc for doc in cursor]
        
        if not dir_edges:
            print("Warning: No 'contains_directory' edges found in the edge collection.")
            # Try alternative edge types
            alt_edge_types = ['contains', 'has_directory', 'parent']
            for edge_type in alt_edge_types:
                aql = f"""
                FOR e IN {edge_collection}
                    FILTER e.edge_type == '{edge_type}' OR e.relation == '{edge_type}' OR e.relationship == '{edge_type}'
                    FOR v1 IN {node_collection}
                        FILTER v1._id == e._from
                        FOR v2 IN {node_collection}
                            FILTER v2._id == e._to
                            FILTER (v1.type == 'directory' OR v2.type == 'directory')
                            LIMIT 1
                            RETURN e
                """
                cursor = db.aql.execute(aql)
                alt_dir_edges = [doc for doc in cursor]
                if alt_dir_edges:
                    print(f"Found directory edges using alternate edge type: {edge_type}")
                    break
        else:
            print(f"Found directory edge relationships successfully")
            
    except Exception as e:
        print(f"Error validating node types: {str(e)}")
        # Not raising the exception here to allow the process to continue

#### Database Schema Retrieval Function
- Collects comprehensive schema information including collections, graphs, and edge definitions
- Samples edges to identify relationship types and structures within the graph
- Organizes schema data hierarchically with graphs, collections, edge definitions, and types
- Returns structured dictionary with placeholders for node types and relationships to be filled by other functions

In [121]:
def get_db_schema() -> Dict:
    """Get detailed schema information with better type understanding"""
    try:
        # Basic schema information
        collections = db.collections()
        collection_names = [c['name'] for c in collections if not c['name'].startswith('_')]
        
        # Get graphs
        graphs = db.graphs()
        graph_names = [g['name'] for g in graphs]
        graph_details = []
        
        for graph_name in graph_names:
            graph = db.graph(graph_name)
            graph_info = graph.properties()
            
            # Get edge definitions for better understanding of relationships
            edge_definitions = graph_info.get('edgeDefinitions', [])
            enhanced_edge_defs = []
            
            for edge_def in edge_definitions:
                collection = edge_def.get('collection', '')
                from_collections = edge_def.get('from', [])
                to_collections = edge_def.get('to', [])
                
                # Sample some edges to understand relationship types
                edge_samples = []
                if collection:
                    try:
                        cursor = db.aql.execute(
                            f"FOR e IN {collection} LIMIT 5 RETURN e"
                        )
                        edge_samples = [edge for edge in cursor]
                    except Exception as e:
                        print(f"Error sampling edges from {collection}: {str(e)}")
                
                # Extract edge types if they exist
                edge_types = set()
                for edge in edge_samples:
                    if 'edge_type' in edge:
                        edge_types.add(edge['edge_type'])
                
                enhanced_edge_defs.append({
                    'collection': collection,
                    'from_collections': from_collections,
                    'to_collections': to_collections,
                    'edge_types': list(edge_types),
                    'sample_count': len(edge_samples),
                })
            
            graph_details.append({
                'name': graph_info.get('name'),
                'edge_definitions': enhanced_edge_defs,
                'orphan_collections': graph_info.get('orphanCollections', [])
            })
        
        return {
            "Graph Schema": graph_details,
            "Collection Schema": [c for c in collection_names],
            "Node Types": {},  # Will be filled by analyze_node_types
            "Type Relationships": []  # Will be filled later
        }
    except Exception as e:
        print(f"Error getting enhanced schema: {str(e)}")
        traceback.print_exc()
        return {"error": str(e)}

#### Node Type Analysis Function
- Analyzes node collection to identify distinct node types and their distribution
- Uses detected schema type field or falls back to inferring types from properties
- Samples each node type to understand structure and stores in the global schema dictionary
- Includes special handling for important node types like 'directory' and 'file' if not found directly

In [122]:
def analyze_node_types():
    """Analyze and cache the node types in the database using the detected schema fields"""
    node_types_dict = {}
    try:
        # Use the detected type field
        if not type_field:
            print("No type field detected, trying to infer node types from other properties")
            # Fallback logic to infer types
            return infer_node_types()
        
        # Query distinct node types
        aql = f"""
        FOR v IN {node_collection}
            FILTER HAS(v, "{type_field}")
            COLLECT type = v.{type_field} WITH COUNT INTO count
            RETURN {{
                "type": type,
                "count": count
            }}
        """
        cursor = db.aql.execute(aql)
        type_counts = [doc for doc in cursor]
        
        # For each node type, get a sample and analyze structure
        for type_info in type_counts:
            node_type = type_info.get('type')
            count = type_info.get('count', 0)
            
            if not node_type:
                continue
            
            # Get a sample for this node type
            aql = f"""
            FOR v IN {node_collection}
                FILTER v.{type_field} == '{node_type}'
                LIMIT 1
                RETURN v
            """
            cursor = db.aql.execute(aql)
            samples = [doc for doc in cursor]
            
            if not samples:
                continue
            
            sample = samples[0]
            
            # Normalize the node type name
            normalized_type = node_type
            
            # Add to node types dictionary
            node_types_dict[normalized_type] = {
                'count': count,
                'field': type_field,
                'sample_structure': list(sample.keys()),
                'sample': sample
            }
            
            print(f"Type: {node_type}, Count: {count}")
        
        # Update the db_schema with node types
        global db_schema
        db_schema["Node Types"] = node_types_dict
        
        # Special handling for directories and files if not found
        for important_type in ['directory', 'file']:
            if important_type not in node_types_dict:
                detect_special_type(important_type, node_types_dict)
                
        return node_types_dict
    except Exception as e:
        print(f"Error analyzing node types: {str(e)}")
        traceback.print_exc()
        return {}

#### Node Type Inference Function
- Provides a fallback approach when explicit type fields aren't found in the database
- Infers node types by searching for characteristic properties (filename, children, function_name, etc.)
- Implements property-based clustering as a second fallback to group similar nodes
- Adds inferred type information to the global schema with confidence indicators and inference basis

In [None]:
def infer_node_types():
    """Fallback method to infer node types when type field is not detected"""
    node_types_dict = {}
    try:
        # Try to infer types based on common patterns in node properties
        print("Attempting to infer node types from property patterns...")
        
        # Check for nodes with filename property (likely files)
        aql = f"""
        FOR v IN {node_collection}
            FILTER HAS(v, "filename") OR HAS(v, "file_name")
            LIMIT 100
            RETURN v
        """
        cursor = db.aql.execute(aql)
        file_nodes = [doc for doc in cursor]
        
        if file_nodes:
            sample = file_nodes[0]
            node_types_dict["file"] = {
                'count': len(file_nodes),
                'inferred': True,
                'inference_basis': 'Has filename property',
                'sample_structure': list(sample.keys()),
                'sample': sample
            }
            print(f"Inferred type: file, Count: {len(file_nodes)}")
        
        # Check for nodes with children or is_directory properties (likely directories)
        aql = f"""
        FOR v IN {node_collection}
            FILTER HAS(v, "children") OR HAS(v, "is_directory") OR HAS(v, "dir_name")
            LIMIT 100
            RETURN v
        """
        cursor = db.aql.execute(aql)
        dir_nodes = [doc for doc in cursor]
        
        if dir_nodes:
            sample = dir_nodes[0]
            node_types_dict["directory"] = {
                'count': len(dir_nodes),
                'inferred': True,
                'inference_basis': 'Has directory-related properties',
                'sample_structure': list(sample.keys()),
                'sample': sample
            }
            print(f"Inferred type: directory, Count: {len(dir_nodes)}")
        
        # Check for nodes with function-related properties
        aql = f"""
        FOR v IN {node_collection}
            FILTER HAS(v, "function_name") OR HAS(v, "params") OR HAS(v, "return_type")
            LIMIT 100
            RETURN v
        """
        cursor = db.aql.execute(aql)
        function_nodes = [doc for doc in cursor]
        
        if function_nodes:
            sample = function_nodes[0]
            node_types_dict["function"] = {
                'count': len(function_nodes),
                'inferred': True,
                'inference_basis': 'Has function-related properties',
                'sample_structure': list(sample.keys()),
                'sample': sample
            }
            print(f"Inferred type: function, Count: {len(function_nodes)}")
        
        # Check for nodes with class-related properties
        aql = f"""
        FOR v IN {node_collection}
            FILTER HAS(v, "class_name") OR HAS(v, "methods") OR HAS(v, "extends")
            LIMIT 100
            RETURN v
        """
        cursor = db.aql.execute(aql)
        class_nodes = [doc for doc in cursor]
        
        if class_nodes:
            sample = class_nodes[0]
            node_types_dict["class"] = {
                'count': len(class_nodes),
                'inferred': True,
                'inference_basis': 'Has class-related properties',
                'sample_structure': list(sample.keys()),
                'sample': sample
            }
            print(f"Inferred type: class, Count: {len(class_nodes)}")
        
        # Check for variable declarations
        aql = f"""
        FOR v IN {node_collection}
            FILTER HAS(v, "var_name") OR HAS(v, "variable_name") OR HAS(v, "variable_type")
            LIMIT 100
            RETURN v
        """
        cursor = db.aql.execute(aql)
        var_nodes = [doc for doc in cursor]
        
        if var_nodes:
            sample = var_nodes[0]
            node_types_dict["variable"] = {
                'count': len(var_nodes),
                'inferred': True,
                'inference_basis': 'Has variable-related properties',
                'sample_structure': list(sample.keys()),
                'sample': sample
            }
            print(f"Inferred type: variable, Count: {len(var_nodes)}")
        
        # Fall back to clustering by property sets if no types inferred
        if not node_types_dict:
            print("No clear types inferred, attempting property-based clustering...")
            return cluster_by_properties()
        
        # Update the global schema with inferred node types
        global db_schema
        db_schema["Node Types"] = node_types_dict
        
        return node_types_dict
    
    except Exception as e:
        print(f"Error inferring node types: {str(e)}")
        traceback.print_exc()
        return {}

def cluster_by_properties():
    """Group nodes by similar property sets to infer types"""
    property_clusters = {}
    try:
        # Sample nodes to analyze property patterns
        aql = f"""
        FOR v IN {node_collection}
        LIMIT 200
        RETURN v
        """
        cursor = db.aql.execute(aql)
        nodes = [doc for doc in cursor]
        
        # Create property signatures for each node
        for i, node in enumerate(nodes):
            # Sort keys to create consistent signatures
            prop_keys = sorted(list(node.keys()))
            # Filter out system properties
            prop_keys = [k for k in prop_keys if not k.startswith('_')]
            
            # Create a signature string
            signature = ",".join(prop_keys)
            
            if signature not in property_clusters:
                property_clusters[signature] = {
                    'nodes': [],
                    'properties': prop_keys
                }
            
            property_clusters[signature]['nodes'].append(node)
        
        # Convert clusters to inferred types
        inferred_types = {}
        for i, (signature, cluster) in enumerate(property_clusters.items()):
            if len(cluster['nodes']) < 5:  # Skip tiny clusters
                continue
                
            # Try to find a meaningful name based on properties
            type_name = "unknown_type_" + str(i)
            for name_prop in ['name', 'type', 'kind', 'node_kind']:
                if name_prop in cluster['properties']:
                    most_common = {}
                    for node in cluster['nodes']:
                        val = str(node.get(name_prop, ''))
                        if val:
                            most_common[val] = most_common.get(val, 0) + 1
                    
                    if most_common:
                        # Get most common value
                        type_name = max(most_common.items(), key=lambda x: x[1])[0]
                        break
            
            inferred_types[type_name] = {
                'count': len(cluster['nodes']),
                'inferred': True,
                'inference_basis': 'Property signature clustering',
                'property_signature': signature,
                'sample_structure': cluster['properties'],
                'sample': cluster['nodes'][0] if cluster['nodes'] else {}
            }
            
            print(f"Inferred type via clustering: {type_name}, Count: {len(cluster['nodes'])}")
        
        # Update the db_schema
        global db_schema
        db_schema["Node Types"] = inferred_types
        
        return inferred_types
    
    except Exception as e:
        print(f"Error clustering by properties: {str(e)}")
        traceback.print_exc()
        return {}

#### Special Type Detection Function
- Implements specialized detection strategies for important node types (directories, files)
- Uses heuristic indicators like path patterns and property names to identify unlabeled nodes
- Leverages regex patterns on path fields to distinguish files (with extensions) from directories
- Updates the node types dictionary with inferred types when successful detection occurs

In [124]:
def detect_special_type(type_name, node_types_dict):
    """Try to detect special types like directories and files if they weren't found by regular means"""
    try:
        # Different detection strategies based on type
        if type_name == 'directory':
            # Look for nodes with directory-like properties
            indicators = ['path', 'directory', 'dir_name', 'folder']
            filter_conditions = []
            
            for indicator in indicators:
                filter_conditions.append(f'HAS(v, "{indicator}")')
                
            if path_field:
                # Add condition that path doesn't end with file extension
                filter_conditions.append(f'NOT REGEX_TEST(v.{path_field}, "\\.[a-zA-Z0-9]+$")')
            
            filter_str = " OR ".join(filter_conditions)
            
            aql = f"""
            FOR v IN {node_collection}
                FILTER {filter_str}
                LIMIT 100
                RETURN v
            """
            
        elif type_name == 'file':
            # Look for nodes with file-like properties
            indicators = ['file', 'file_name', 'filename']
            filter_conditions = []
            
            for indicator in indicators:
                filter_conditions.append(f'HAS(v, "{indicator}")')
                
            if path_field:
                # Add condition that path ends with file extension
                filter_conditions.append(f'REGEX_TEST(v.{path_field}, "\\.[a-zA-Z0-9]+$")')
            
            filter_str = " OR ".join(filter_conditions)
            
            aql = f"""
            FOR v IN {node_collection}
                FILTER {filter_str}
                LIMIT 100
                RETURN v
            """
        
        cursor = db.aql.execute(aql)
        detected_nodes = [doc for doc in cursor]
        
        if detected_nodes:
            print(f"Detected {len(detected_nodes)} potential {type_name} nodes")
            
            # Use the first node as a sample
            sample = detected_nodes[0]
            
            node_types_dict[type_name] = {
                'count': len(detected_nodes),
                'field': 'inferred',
                'sample_structure': list(sample.keys()),
                'sample': sample
            }
            
            print(f"Added inferred {type_name} type to node types")
        else:
            print(f"Could not detect any {type_name} nodes")
    
    except Exception as e:
        print(f"Error detecting {type_name} nodes: {str(e)}")

#### Type Relationship Analysis Function
- Examines connections between different node types in the graph database
- Searches for edges between each pair of node types to build a relationship map
- Identifies specific edge types that connect different node categories
- Updates the global schema dictionary with discovered type relationships

In [125]:
def analyze_type_relationships(node_types_dict):
    """Analyze relationships between different node types"""
    type_relationships = []
    try:
        node_type_keys = list(node_types_dict.keys())
        
        # For each node type pair, check if there are edges between them
        for from_type in node_type_keys:
            for to_type in node_type_keys:
                aql = f"""
                FOR v1 IN {node_collection}
                    FILTER v1.type == '{from_type}'
                    LIMIT 1
                    FOR v2 IN {node_collection}
                        FILTER v2.type == '{to_type}'
                        LIMIT 1
                        FOR e IN {edge_collection}
                            FILTER e._from == v1._id AND e._to == v2._id
                            RETURN DISTINCT {{
                                "from_type": '{from_type}',
                                "to_type": '{to_type}',
                                "edge_type": e.edge_type
                            }}
                """
                cursor = db.aql.execute(aql)
                relationships = [doc for doc in cursor]
                
                for rel in relationships:
                    type_relationships.append(rel)
        
        # Update db_schema with type relationships
        global db_schema
        db_schema["Type Relationships"] = type_relationships
        
    except Exception as e:
        print(f"Error analyzing type relationships: {str(e)}")
        traceback.print_exc()

#### Directory Structure Builder Function
- Constructs a hierarchical representation of the project's directory structure
- First attempts to identify directory nodes from the graph database
- Falls back to extracting directory paths from file nodes if needed
- Builds a nested tree structure with directories containing subdirectories and files
- Handles edge cases like missing directories by creating synthetic paths

In [126]:
def build_directory_structure() -> Dict:
    """
    Build a hierarchical representation of the directory structure
    Returns:
        Dictionary representing the directory tree
    """
    directory_tree = {}
    
    try:
        # First, identify all directory nodes
        directory_field = 'type'
        if 'directory' in node_types:
            directory_field = node_types['directory'].get('field', 'type')
            
        # Get all directory nodes
        aql = f"""
        FOR v IN {node_collection}
            FILTER v.{directory_field} == 'directory'
            RETURN {{
                "key": v._key,
                "path": v.path,
                "name": v.name
            }}
        """
        cursor = db.aql.execute(aql)
        directories = [doc for doc in cursor]
        
        # If no explicit directory nodes found, try to extract directories from file paths
        if not directories:
            # Extract directories from file paths
            all_directories = set()
            for file_info in files.values():
                file_path = file_info.get("file_path", "")
                if file_path:
                    # Extract all parent directories
                    parts = file_path.split('/')
                    for i in range(1, len(parts)):
                        dir_path = '/'.join(parts[:i])
                        if dir_path:
                            all_directories.add(dir_path)
            
            # Create synthetic directory nodes
            directories = [{"path": dir_path, "name": dir_path.split('/')[-1]} for dir_path in all_directories]
            
        # Build directory tree
        for directory in directories:
            path = directory.get("path", "")
            if not path:
                continue
                
            # Add to tree
            current = directory_tree
            parts = path.split('/')
            for i, part in enumerate(parts):
                if not part:
                    continue
                    
                if part not in current:
                    current[part] = {"files": [], "dirs": {}}
                    
                if i == len(parts) - 1:
                    # This is the target directory, add its key
                    current[part]["key"] = directory.get("key")
                else:
                    current = current[part]["dirs"]
                    
        # Add files to their respective directories
        for file_key, file_info in files.items():
            file_path = file_info.get("file_path", "")
            if not file_path:
                continue
                
            # Determine directory path and file name
            parts = file_path.split('/')
            file_name = parts[-1]
            dir_path = '/'.join(parts[:-1])
            
            # Find directory in tree
            current = directory_tree
            if dir_path:
                found = True
                for part in dir_path.split('/'):
                    if not part:
                        continue
                    if part in current:
                        current = current[part]["dirs"]
                    else:
                        # Directory not found in tree, create it
                        found = False
                        break
                        
                if not found:
                    # Create missing directory path
                    current = directory_tree
                    for part in dir_path.split('/'):
                        if not part:
                            continue
                        if part not in current:
                            current[part] = {"files": [], "dirs": {}}
                        current = current[part]["dirs"]
            
            # Find parent directory and add file
            parent = directory_tree
            for part in parts[:-1]:
                if not part:
                    continue
                if part not in parent:
                    parent[part] = {"files": [], "dirs": {}}
                parent = parent[part]["dirs"]
            
            # Add file to parent directory if parent exists and is a valid dictionary
            if parts[-2] in parent and isinstance(parent[parts[-2]], dict):
                parent[parts[-2]]["files"].append({
                    "key": file_key,
                    "name": file_name,
                    "path": file_path,
                    "language": file_info.get("language", "")
                })
    
    except Exception as e:
        print(f"Error building directory structure: {str(e)}")
        traceback.print_exc()
        
    return directory_tree


#### File and Code Knowledge Base Initialization
- Caches files, code snippets, and symbols from database with automatic field detection
- Establishes relationships between files, snippets, and symbols for fast traversal
- Extracts key metadata (file paths, languages, symbol definitions, documentation)

In [127]:
def initialize_cache():
    """Initialize cache of files, code snippets, and symbols using detected schema fields"""
    global files, snippets, symbols, symbol_name_index
    
    try:
        # Initialize file cache
        if 'file' in node_types:
            # Determine best field for file info
            field_info = node_types['file']
            
            path_field = None
            name_field = None
            
            # Try to find the best fields for path and name
            sample = field_info.get('sample', {})
            for field in sample:
                lower_field = field.lower()
                if 'path' in lower_field and not path_field:
                    path_field = field
                elif ('name' in lower_field or 'file' in lower_field) and 'path' not in lower_field and not name_field:
                    name_field = field
            
            # Use detected fields or defaults
            path_field = path_field or 'path'
            name_field = name_field or 'file_name'
            type_field = field_info.get('field') or 'type'
            
            aql = f"""
            FOR v IN {node_collection}
                FILTER v.{type_field} == 'file'
                RETURN v
            """
            cursor = db.aql.execute(aql)
            
            # Process each file
            for doc in cursor:
                file_key = doc.get('_key')
                file_path = doc.get(path_field, "")
                file_name = doc.get(name_field, "")
                
                if not file_path and not file_name:
                    continue
                
                if not file_path and file_name:
                    # Try to construct a path
                    for key in doc:
                        if 'dir' in key.lower() or 'folder' in key.lower():
                            directory = doc.get(key, "")
                            file_path = f"{directory}/{file_name}" if directory else file_name
                            break
                
                language = ""
                # Try to detect language from extension
                if file_path:
                    ext = file_path.split('.')[-1].lower() if '.' in file_path else ""
                    if ext == 'py':
                        language = 'python'
                    elif ext in ['js', 'ts']:
                        language = 'javascript'
                    elif ext in ['java']:
                        language = 'java'
                    elif ext in ['c', 'cpp', 'h', 'hpp']:
                        language = 'c/c++'
                
                files[file_key] = {
                    "key": file_key,
                    "file_name": file_name,
                    "file_path": file_path,
                    "language": language
                }
            
            print(f"Cached {len(files)} files")
            
        # Initialize snippet cache
        if 'snippet' in node_types:
            # Determine best fields for snippet info
            field_info = node_types['snippet']
            
            content_field = None
            name_field = None
            
            # Try to find the best fields for content and name
            sample = field_info.get('sample', {})
            for field in sample:
                lower_field = field.lower()
                if ('content' in lower_field or 'code' in lower_field) and not content_field:
                    content_field = field
                elif ('name' in lower_field or 'title' in lower_field) and not name_field:
                    name_field = field
            
            # Use detected fields or defaults
            content_field = content_field or 'content'
            name_field = name_field or 'snippet_name'
            type_field = field_info.get('field') or 'type'
            
            aql = f"""
            FOR v IN {node_collection}
                FILTER v.{type_field} == 'snippet'
                RETURN v
            """
            cursor = db.aql.execute(aql)
            
            # Process each snippet
            for doc in cursor:
                snippet_key = doc.get('_key')
                content = doc.get(content_field, "")
                snippet_name = doc.get(name_field, "")
                
                if not content:
                    continue
                
                # Try to determine file relationship
                file_key = None
                for key in doc:
                    if 'file' in key.lower() and key != name_field:
                        file_key = doc.get(key)
                        break
                
                # Try to determine language
                language = ""
                for key in doc:
                    if 'lang' in key.lower():
                        language = doc.get(key, "")
                        break
                
                if not language and file_key in files:
                    language = files[file_key].get('language', "")
                
                snippets[snippet_key] = {
                    "key": snippet_key,
                    "snippet_name": snippet_name,
                    "content": content,
                    "file_key": file_key,
                    "language": language
                }
            
            print(f"Cached {len(snippets)} code snippets")
        
            # Initialize symbol cache
            if 'symbol' in node_types:  # This is checking for an exact match with 'symbol'
                # Determine best fields for symbol info
                field_info = node_types['symbol']
                
                name_field = None
                type_name_field = None
                
                # Try to find the best fields for symbol name and symbol type
                sample = field_info.get('sample', {})
                for field in sample:
                    lower_field = field.lower()
                    if 'name' in lower_field and not name_field:
                        name_field = field
                    elif ('type' in lower_field and 'name' in lower_field) and not type_name_field:
                        type_name_field = field
                
                # Add fallback detection for symbol name field
                if not name_field and 'context' in sample:
                    name_field = 'context'
                    print(f"Using 'context' as fallback for symbol name field")
                
                # Use detected fields or defaults
                name_field = name_field or 'symbol_name'
                type_name_field = type_name_field or 'symbol_type'
                type_field = field_info.get('field') or 'type'
                
                print(f"Using name_field: {name_field}, type_field: {type_field}")
                
                aql = f"""
                FOR v IN {node_collection}
                    FILTER v.{type_field} == 'symbol'
                    RETURN v
                """
                print(f"Symbol query: {aql}")
                cursor = db.aql.execute(aql)
                sample_symbols = [doc for doc in cursor]
                print(f"Sample symbol count: {len(sample_symbols)}")
                
                if sample_symbols:
                    print(f"Sample symbol fields: {list(sample_symbols[0].keys())}")
                    print(f"Sample symbol name value: {sample_symbols[0].get(name_field, 'NOT FOUND')}")
                    print(f"Sample symbol type value: {sample_symbols[0].get(type_name_field, 'NOT FOUND')}")

                # Re-execute the query
                cursor = db.aql.execute(aql)
                
                # Process counter
                processed_count = 0
                
                # Process each symbol
                for doc in cursor:
                    symbol_key = doc.get('_key')
                    symbol_name = doc.get(name_field, "")
                    symbol_type = doc.get(type_name_field, "")
                    
                    if not symbol_name:
                        # Try context as a fallback
                        symbol_name = doc.get('context', "")
                        if not symbol_name:
                            continue
                    
                    # Try to determine file relationship
                    file_key = None
                    for key in doc:
                        if 'file' in key.lower() and key != name_field:
                            file_key = doc.get(key)
                            break
                    
                    # Try to determine snippet relationship
                    snippet_key = None
                    for key in doc:
                        if 'snippet' in key.lower():
                            snippet_key = doc.get(key)
                            break
                    
                    # Try to get definition and documentation
                    definition = ""
                    documentation = doc.get('docstring', "")  # Try the known docstring field first
                    
                    for key in doc:
                        lower_key = key.lower()
                        if 'def' in lower_key or 'decl' in lower_key:
                            definition = doc.get(key, "")
                        elif ('doc' in lower_key or 'comment' in lower_key) and not documentation:
                            documentation = doc.get(key, "")
                    
                    symbols[symbol_key] = {
                        "key": symbol_key,
                        "symbol_name": symbol_name,
                        "symbol_type": symbol_type,
                        "file_key": file_key,
                        "snippet_key": snippet_key,
                        "definition": definition,
                        "documentation": documentation
                    }
                    
                    # Index by name for quick lookups
                    if symbol_name:
                        if symbol_name not in symbol_name_index:
                            symbol_name_index[symbol_name] = []
                        symbol_name_index[symbol_name].append(symbol_key)
                    
                    processed_count += 1
                    if processed_count % 200 == 0:
                        print(f"Processed {processed_count} symbols so far")
                
                print(f"Cached {len(symbols)} symbols")
            
        # Build relationship indexes for faster traversal
        build_relationship_indexes()
    
    except Exception as e:
        print(f"Error initializing cache: {str(e)}")
        traceback.print_exc()


#### Relationship Index Building for Knowledge Base
- Creates mapping indexes between files, snippets, and symbols for efficient lookup
- Builds file-to-snippets, file-to-symbols, and snippet-to-symbols relationships
- Enables quick traversal and relationship querying across codebase elements

In [128]:
def build_relationship_indexes():
    """Build indexes for quick relationship lookup between files, snippets and symbols"""
    global file_to_snippets, file_to_symbols, snippet_to_symbols
    
    try:
        # Build file -> snippets index
        for snippet_key, snippet in snippets.items():
            file_key = snippet.get('file_key')
            if file_key:
                if file_key not in file_to_snippets:
                    file_to_snippets[file_key] = []
                file_to_snippets[file_key].append(snippet_key)
        
        # Build file -> symbols index
        for symbol_key, symbol in symbols.items():
            file_key = symbol.get('file_key')
            if file_key:
                if file_key not in file_to_symbols:
                    file_to_symbols[file_key] = []
                file_to_symbols[file_key].append(symbol_key)
        
        # Build snippet -> symbols index
        for symbol_key, symbol in symbols.items():
            snippet_key = symbol.get('snippet_key')
            if snippet_key:
                if snippet_key not in snippet_to_symbols:
                    snippet_to_symbols[snippet_key] = []
                snippet_to_symbols[snippet_key].append(symbol_key)
        
        print("Built relationship indexes for files, snippets, and symbols")
    
    except Exception as e:
        print(f"Error building relationship indexes: {str(e)}")
        traceback.print_exc()


#### File Retrieval by Key Helper Function
- Fetches file information by key from cache or database if not already loaded
- Returns comprehensive file metadata including path, name, directory, and language
- Handles error cases gracefully with detailed logging for debugging

In [129]:
def get_file_by_key(file_key: str) -> Dict:
    """
    Helper method to retrieve file node by key
    
    Args:
        file_key: The key of the file node
        
    Returns:
        Dict containing file information
    """
    if file_key in files:
        return files[file_key]
    
    try:
        aql = f"""
        FOR file IN {node_collection}
            FILTER file._key == '{file_key}' AND file.type == 'file'
            RETURN {{
                "key": file._key,
                "directory": file.directory,
                "file_name": file.file_name,
                "file_path": file.path || (file.directory + '/' + file.file_name),
                "language": file.language
            }}
        """
        cursor = db.aql.execute(aql)
        found_files = [doc for doc in cursor]
        
        if found_files:
            files[file_key] = found_files[0]
            return found_files[0]
        
        return {}
    except Exception as e:
        print(f"Error retrieving file by key: {str(e)}")
        traceback.print_exc()
        return {}

#### Symbol Occurrence Finder Across Codebase
- Searches for symbol occurrences in both symbol nodes and code snippets
- Links found symbols with their file contexts including path and language information
- Handles dynamic field detection for different database schemas and snippet formats
- Provides comprehensive results with docstrings, line numbers and contextual information

In [130]:
def find_symbol_occurrences(symbol_name: str) -> List[Dict]:
    """
    Find all occurrences of a symbol using both the symbol nodes and code snippets
    
    Args:
        symbol_name: The name of the symbol to find
        
    Returns:
        List of dictionaries containing symbol occurrences
    """
    results = []
    
    try:
        # Look for symbol nodes
        if 'symbol' in node_types:
            aql = f"""
            FOR symbol IN {node_collection}
                FILTER symbol.type == 'symbol' AND symbol.name == '{symbol_name}'
                LET file = (
                    FOR edge IN {edge_collection}
                        FILTER edge._to == symbol._id
                        FOR file IN {node_collection}
                            FILTER file._id == edge._from AND file.type == 'file'
                            RETURN {{
                                "key": file._key,
                                "directory": file.directory,
                                "file_name": file.file_name,
                                "file_path": file.path || (file.directory + '/' + file.file_name),
                                "language": file.language
                            }}
                )
                RETURN {{
                    "type": "symbol",
                    "name": symbol.name,
                    "symbol_type": symbol.symbol_type,
                    "line_number": symbol.line_number,
                    "context": symbol.context,
                    "docstring": symbol.docstring,
                    "file": LENGTH(file) > 0 ? file[0] : null
                }}
            """
            cursor = db.aql.execute(aql)
            symbol_results = [doc for doc in cursor]
            results.extend(symbol_results)
        
        # Look for symbol occurrences in code snippets
        if 'snippet' in node_types:
            # Determine the best attribute for code based on the sample
            code_field = 'code_snippet'
            snippet_sample = node_types.get('snippet', {}).get('sample', {})
            
            if 'code_snippet' in snippet_sample:
                code_field = 'code_snippet'
            elif 'code' in snippet_sample:
                code_field = 'code'
            elif 'snippet' in snippet_sample:
                code_field = 'snippet'
            
            aql = f"""
            FOR snippet IN {node_collection}
                FILTER snippet.type == 'snippet' AND snippet.{code_field} LIKE '%{symbol_name}%'
                LET file = (
                    FOR edge IN {edge_collection}
                        FILTER edge._to == snippet._id
                        FOR file IN {node_collection}
                            FILTER file._id == edge._from AND file.type == 'file'
                            RETURN {{
                                "key": file._key,
                                "directory": file.directory,
                                "file_name": file.file_name,
                                "file_path": file.path || (file.directory + '/' + file.file_name),
                                "language": file.language
                            }}
                )
                RETURN {{
                    "type": "snippet",
                    "code": snippet.{code_field},
                    "start_line": snippet.start_line,
                    "end_line": snippet.end_line,
                    "file": LENGTH(file) > 0 ? file[0] : null
                }}
            """
            cursor = db.aql.execute(aql)
            snippet_results = [doc for doc in cursor]
            results.extend(snippet_results)
    
    except Exception as e:
        print(f"Error finding symbol occurrences: {str(e)}")
        traceback.print_exc()
    
    return results

#### Intelligent Symbol Name Search with Type Filtering
- Searches for functions, classes, or other symbols by name with optional type filtering
- Performs multi-strategy search using both symbol nodes and pattern matching in code snippets
- Handles diverse language patterns (Python, JavaScript, Java, C/C++, Go) for accurate detection
- Returns comprehensive results with file context, docstrings, and associated code snippets

In [131]:
def find_by_name(name: str, symbol_type: Optional[str] = None) -> List[Dict]:
    """
    Find function/class snippets by name with improved matching across all files
    
    Args:
        name: The name of the function/class to find
        symbol_type: Optional filter for symbol type (e.g., 'function', 'class')
        
    Returns:
        List of dictionaries containing matching symbols and snippets
    """
    results = []
    
    try:
        # Look for symbol nodes first
        if 'symbol' in node_types:
            type_filter = f" AND symbol.symbol_type == '{symbol_type}'" if symbol_type else ""
            
            aql = f"""
            FOR symbol IN {node_collection}
                FILTER symbol.type == 'symbol' AND symbol.name == '{name}'{type_filter}
                LET file = (
                    FOR edge IN {edge_collection}
                        FILTER edge._to == symbol._id
                        FOR file IN {node_collection}
                            FILTER file._id == edge._from AND file.type == 'file'
                            RETURN {{
                                "key": file._key,
                                "directory": file.directory,
                                "file_name": file.file_name,
                                "file_path": file.path || (file.directory + '/' + file.file_name),
                                "language": file.language
                            }}
                )
                LET snippet = (
                    FOR edge IN {edge_collection}
                        FILTER edge._from == symbol._id
                        FOR snippet IN {node_collection}
                            FILTER snippet._id == edge._to AND snippet.type == 'snippet'
                            RETURN snippet
                )
                RETURN {{
                    "type": "symbol",
                    "name": symbol.name,
                    "symbol_type": symbol.symbol_type,
                    "line_number": symbol.line_number,
                    "context": symbol.context,
                    "docstring": symbol.docstring,
                    "file": LENGTH(file) > 0 ? file[0] : null,
                    "snippet": LENGTH(snippet) > 0 ? snippet[0] : null
                }}
            """
            cursor = db.aql.execute(aql)
            symbol_results = [doc for doc in cursor]
            results.extend(symbol_results)
        
        # If no symbols found or symbol cache is empty, try fuzzy matching in snippets
        if not results and 'snippet' in node_types:
            # Determine the best attribute for code based on the sample
            code_field = 'code_snippet'
            snippet_sample = node_types.get('snippet', {}).get('sample', {})
            
            if 'code_snippet' in snippet_sample:
                code_field = 'code_snippet'
            elif 'code' in snippet_sample:
                code_field = 'code'
            elif 'snippet' in snippet_sample:
                code_field = 'snippet'
            
            # Common patterns for function/class definitions in different languages
            patterns = []
            
            if not symbol_type or symbol_type == 'function':
                patterns.extend([
                    f"function {name}",  # JavaScript
                    f"def {name}",       # Python
                    f"{name} = function", # JavaScript
                    f"const {name} = ", # JavaScript arrow function
                    f"let {name} = ",   # JavaScript arrow function
                    f"var {name} = ",   # JavaScript arrow function
                    f"{name}\\(",       # C/C++/Java method
                    f"func {name}",     # Go
                ])
            
            if not symbol_type or symbol_type == 'class':
                patterns.extend([
                    f"class {name}",     # Python/JavaScript/Java
                    f"interface {name}", # TypeScript/Java
                    f"struct {name}",    # C/C++/Go
                    f"type {name} struct", # Go
                ])
            
            # Create LIKE conditions for each pattern
            like_conditions = [f"snippet.{code_field} LIKE '%{pattern}%'" for pattern in patterns]
            like_filter = " OR ".join(like_conditions)
            
            aql = f"""
            FOR snippet IN {node_collection}
                FILTER snippet.type == 'snippet' AND ({like_filter})
                LET file = (
                    FOR edge IN {edge_collection}
                        FILTER edge._to == snippet._id
                        FOR file IN {node_collection}
                            FILTER file._id == edge._from AND file.type == 'file'
                            RETURN {{
                                "key": file._key,
                                "directory": file.directory,
                                "file_name": file.file_name,
                                "file_path": file.path || (file.directory + '/' + file.file_name),
                                "language": file.language
                            }}
                )
                RETURN {{
                    "type": "snippet",
                    "code": snippet.{code_field},
                    "start_line": snippet.start_line,
                    "end_line": snippet.end_line,
                    "file": LENGTH(file) > 0 ? file[0] : null
                }}
            """
            cursor = db.aql.execute(aql)
            snippet_results = [doc for doc in cursor]
            results.extend(snippet_results)
    
    except Exception as e:
        print(f"Error finding by name: {str(e)}")
        traceback.print_exc()
    
    return results

#### Intelligent Symbol Analysis and Discovery
- Queries and analyzes implementations of specific functions/classes across a codebase
- Searches by name with optional type filtering, collecting all occurrences
- Organizes results by file with implementation details (docstrings, line numbers, code snippets)
- Returns structured JSON with comprehensive analysis including LLM-enhanced insights

In [132]:
def analyze_symbol(name: str, symbol_type: Optional[str] = None) -> Dict:
    """
    Query about a specific function/class and get an analysis in JSON format.
    Will return all implementations across different files.
    
    Args:
        name: The name of the function/class to analyze
        symbol_type: Optional filter for symbol type (e.g., 'function', 'class')
        
    Returns:
        Dictionary containing analysis of the symbol
    """
    # First, find all occurrences
    occurrences = find_by_name(name, symbol_type)
    
    if not occurrences:
        return {"error": f"No {symbol_type or 'symbol'} named '{name}' found in the codebase"}
    
    # Extract code snippets and organize by file
    implementations_by_file = {}
    for occurrence in occurrences:
        file_info = occurrence.get("file", {})
        file_path = file_info.get("file_path", "unknown_path")
        
        if file_path not in implementations_by_file:
            implementations_by_file[file_path] = {
                "file_info": file_info,
                "implementations": []
            }
        
        if occurrence.get("type") == "symbol":
            # For symbol occurrence, get its snippet
            snippet = occurrence.get("snippet", {})
            implementations_by_file[file_path]["implementations"].append({
                "type": occurrence.get("symbol_type", "unknown"),
                "name": occurrence.get("name", name),
                "line_number": occurrence.get("line_number"),
                "docstring": occurrence.get("docstring", ""),
                "context": occurrence.get("context", ""),
                "code": snippet.get("code_snippet", snippet.get("code", snippet.get("snippet", "")))
            })
        elif occurrence.get("type") == "snippet":
            # For snippet occurrence
            implementations_by_file[file_path]["implementations"].append({
                "type": symbol_type or "unknown",
                "name": name,
                "line_number": occurrence.get("start_line"),
                "code": occurrence.get("code", "")
            })
    
    # Use Mistral LLM to analyze the symbol
    symbol_analysis = analyze_with_llm(name, symbol_type, implementations_by_file)
    
    return {
        "name": name,
        "type": symbol_type or "unknown",
        "implementations_count": len(occurrences),
        "files_count": len(implementations_by_file),
        "implementations_by_file": implementations_by_file,
        "analysis": symbol_analysis
    }

#### AI-Powered Code Symbol Analysis
- Leverages Mistral LLM to provide intelligent analysis of code symbols (functions, classes)
- Extracts and processes code snippets and docstrings from multiple implementations
- Generates structured insights including purpose, parameters, dependencies, and complexity
- Returns comprehensive JSON analysis with usage patterns and improvement suggestions

In [133]:
def analyze_with_llm(name: str, symbol_type: Optional[str], implementations: Dict) -> Dict:
    """
    Use Mistral API to analyze a symbol based on its implementations
    
    Args:
        name: The name of the symbol to analyze
        symbol_type: The type of the symbol (function, class, etc.)
        implementations: Dictionary with implementations by file
        
    Returns:
        Dictionary with LLM analysis
    """
    try:
        # Extract all code snippets from implementations
        all_code = []
        for file_path, file_data in implementations.items():
            for implementation in file_data["implementations"]:
                code = implementation.get("code", "")
                docstring = implementation.get("docstring", "")
                if code:
                    all_code.append(f"File: {file_path}\n{code}")
                if docstring:
                    all_code.append(f"Docstring: {docstring}")
        
        # Join all code with separators
        code_text = "\n\n" + "-" * 40 + "\n\n".join(all_code)
        
        # Create a prompt for the LLM
        prompt = f"""
        Please analyze this {symbol_type or 'symbol'} named '{name}' from a codebase:
        
        {code_text}
        
        Provide a JSON response with the following fields:
        1. purpose: A clear description of what this {symbol_type or 'symbol'} does
        2. parameters: List of parameters with their types and purpose (if applicable)
        3. return_value: What this {symbol_type or 'symbol'} returns (if applicable)
        4. dependencies: Other functions/classes/modules it depends on
        5. usage_pattern: How this {symbol_type or 'symbol'} is typically used
        6. edge_cases: Potential edge cases or error handling
        7. complexity: Analysis of time/space complexity (if applicable)
        8. suggestions: Any improvements or best practices that could be applied
        
        Format your response as a valid JSON object without any extra text or markdown.
        """
        
        # Create message for the LLM
        messages = [
            ChatMessage(role="user", content=prompt)
        ]
        
        # Get completion from Mistral
        chat_response = mistral_client.chat(
            model=model,
            messages=messages
        )
        
        # Extract the content from the response
        content = chat_response.choices[0].message.content
        
        # Try to parse the response as JSON
        try:
            analysis = json.loads(content)
            return analysis
        except json.JSONDecodeError:
            # If JSON parsing fails, return the raw text
            return {"raw_analysis": content}
        
    except Exception as e:
        print(f"Error analyzing with LLM: {str(e)}")
        traceback.print_exc()
        return {"error": str(e)}


#### Intelligent Error Analysis System
- Analyzes codebase error messages by extracting keywords and finding related code snippets
- Searches for similar error patterns and error handling approaches across the codebase
- Leverages Mistral LLM to provide detailed analysis with causes and suggested solutions
- Returns structured JSON with error classification, affected components, and preventive measures

In [134]:
def analyze_error(error_message: str) -> Dict:
    """
    Analyze a specific error message in the codebase and suggest solutions
    
    Args:
        error_message: The error message to analyze
        
    Returns:
        Dictionary containing error analysis and potential solutions
    """
    try:
        # First, search for similar error patterns in the code
        # Split error message into keywords
        keywords = error_message.lower().split()
        keywords = [kw for kw in keywords if len(kw) > 3]  # Filter out short words
        
        # Create LIKE conditions for each keyword
        code_field = 'code_snippet'
        snippet_sample = node_types.get('snippet', {}).get('sample', {})
        
        if 'code_snippet' in snippet_sample:
            code_field = 'code_snippet'
        elif 'code' in snippet_sample:
            code_field = 'code'
        elif 'snippet' in snippet_sample:
            code_field = 'snippet'
            
        # Find code snippets that might contain error handling for similar errors
        related_snippets = []
        
        for keyword in keywords:
            aql = f"""
            FOR snippet IN {node_collection}
                FILTER snippet.type == 'snippet' 
                AND (
                    snippet.{code_field} LIKE '%error%' 
                    AND snippet.{code_field} LIKE '%{keyword}%'
                )
                LET file = (
                    FOR edge IN {edge_collection}
                        FILTER edge._to == snippet._id
                        FOR file IN {node_collection}
                            FILTER file._id == edge._from AND file.type == 'file'
                            RETURN {{
                                "key": file._key,
                                "file_path": file.path || (file.directory + '/' + file.file_name)
                            }}
                )
                RETURN {{
                    "code": snippet.{code_field},
                    "start_line": snippet.start_line,
                    "end_line": snippet.end_line,
                    "file": LENGTH(file) > 0 ? file[0] : null
                }}
            """
            cursor = db.aql.execute(aql)
            for doc in cursor:
                if doc not in related_snippets:
                    related_snippets.append(doc)
        
        # Format snippets for LLM
        snippets_text = ""
        for i, snippet in enumerate(related_snippets):
            file_info = snippet.get("file", {})
            file_path = file_info.get("file_path", "unknown")
            code = snippet.get("code", "")
            
            snippets_text += f"\nSnippet {i+1} from {file_path}:\n{code}\n"
        
        # Create a prompt for the LLM
        prompt = f"""
        Please analyze this error message from a codebase:
        
        ```
        {error_message}
        ```
        
        I found these potentially related code snippets from the codebase:
        {snippets_text if snippets_text else "No directly related snippets found."}
        
        Provide a JSON response with the following fields:
        1. error_type: Classification of this error
        2. likely_causes: List of potential causes for this error
        3. affected_components: Which parts of the code might be affected
        4. solution_suggestions: Specific recommendations to fix this error
        5. preventive_measures: How to prevent this type of error in the future
        
        Format your response as a valid JSON object without any extra text or markdown.
        """
        
        # Create message for the LLM
        messages = [
            ChatMessage(role="user", content=prompt)
        ]
        
        # Get completion from Mistral
        chat_response = mistral_client.chat(
            model=model,
            messages=messages
        )
        
        # Extract the content from the response
        content = chat_response.choices[0].message.content
        
        # Try to parse the response as JSON
        try:
            analysis = json.loads(content)
            return {
                "error_message": error_message,
                "related_snippets_count": len(related_snippets),
                "analysis": analysis
            }
        except json.JSONDecodeError:
            # If JSON parsing fails, return the raw text
            return {
                "error_message": error_message,
                "related_snippets_count": len(related_snippets),
                "raw_analysis": content
            }
        
    except Exception as e:
        print(f"Error analyzing error: {str(e)}")
        traceback.print_exc()
        return {"error": str(e)}

#### Code Repository Structure Analyzer
- Extracts comprehensive database structure information from code repositories
- Maps relationships between node types, files, snippets, and symbols across the codebase
- Organizes code elements by language, directory structure, and relationship types
- Returns detailed JSON representation of repository architecture for improved navigation

In [135]:
def get_database_structure() -> Dict:
    """
    Answer questions about the database structure
    Returns:
        Dictionary containing information about the database structure
    """
    try:
        # Most of this information was already gathered during initialization
        # Just format it in a more user-friendly way
        # Extract node types with counts
        node_types_info = {}
        for node_type, info in node_types.items():
            node_types_info[node_type] = {
                "count": info.get("count", 0),
                "properties": info.get("sample_structure", [])
            }
            
        # Extract relationship types
        relationship_types = {}
        for rel in db_schema.get("Type Relationships", []):
            from_type = rel.get("from_type", "")
            to_type = rel.get("to_type", "")
            edge_type = rel.get("edge_type", "")
            key = f"{from_type}_to_{to_type}"
            if key not in relationship_types:
                relationship_types[key] = {
                    "from_type": from_type,
                    "to_type": to_type,
                    "edge_types": []
                }
            if edge_type and edge_type not in relationship_types[key]["edge_types"]:
                relationship_types[key]["edge_types"].append(edge_type)
                
        # Count files by language
        languages = {}
        for file_info in files.values():
            language = file_info.get("language", "unknown")
            if language not in languages:
                languages[language] = 0
            languages[language] += 1
            
        # Build directory structure map for improved path navigation
        directory_structure = build_directory_structure()
            
        return {
            "graph_name": graph_name,
            "node_collection": node_collection,
            "edge_collection": edge_collection,
            "node_types": node_types_info,
            "relationship_types": list(relationship_types.values()),
            "file_count": len(files),
            "snippet_count": len(snippets),
            "symbol_count": len(symbols),
            "languages": languages,
            "directory_structure": directory_structure
        }
    except Exception as e:
        print(f"Error getting database structure: {str(e)}")
        traceback.print_exc()
        return {"error": str(e)}

#### Codebase Directory Analysis Tool
- Analyzes specific directory structures with flexible path matching capabilities
- Maps files, code snippets, and symbols contained within target directories
- Provides hierarchical directory content visualization for improved navigation
- Returns comprehensive JSON with file counts, symbols, and nested directory structures

In [136]:
def analyze_directory(path: str) -> Dict:
    """
    Analyze a specific directory in the codebase
    
    Args:
        path: Path to directory to analyze
    
    Returns:
        Dictionary with directory analysis results
    """
    try:
        print(f"Analyzing code structure at path: {path}")
        
        # Normalize path for consistent matching
        normalized_path = path.rstrip('/')
        
        # First try direct path matching for directory nodes
        print(f"Looking for files with path pattern: {normalized_path}")
        
        # Query files with matching path prefix
        matching_files = []
        for file_key, file_info in files.items():
            file_path = file_info.get("file_path", "")
            if file_path and (file_path.startswith(f"{normalized_path}/") or file_path == normalized_path):
                matching_files.append(file_info)
        
        # Sort files for consistent output
        matching_files.sort(key=lambda x: x.get("file_path", ""))
        
        # Print sample paths for debugging
        print("Sample file paths in database:")
        for i, file_info in enumerate(list(files.values())[:6]):
            print(f"File {i+1}: {file_info.get('file_path', '')}")
        
        # If no files found with direct path matching, try more flexible matching
        if not matching_files:
            # Try to find files that might contain the path (handle relative paths)
            for file_key, file_info in files.items():
                file_path = file_info.get("file_path", "")
                path_parts = normalized_path.split('/')
                
                # Check if all path parts appear in order in the file path
                if file_path:
                    file_parts = file_path.split('/')
                    for i in range(len(file_parts) - len(path_parts) + 1):
                        if file_parts[i:i+len(path_parts)] == path_parts:
                            matching_files.append(file_info)
                            break
            
            # Sort again after flexible matching
            matching_files.sort(key=lambda x: x.get("file_path", ""))
        
        # Get directory structure
        directory_structure = get_directory_contents(normalized_path)
        
        # Get snippets for matching files
        file_keys = [file_info.get("key") for file_info in matching_files]
        matching_snippets = []
        for snippet_key, snippet_info in snippets.items():
            if snippet_info.get("file_key") in file_keys:
                matching_snippets.append(snippet_info)
        
        # Get symbols for matching files
        matching_symbols = []
        for symbol_key, symbol_info in symbols.items():
            if symbol_info.get("file_key") in file_keys:
                matching_symbols.append(symbol_info)
        
        return {
            "path": normalized_path,
            "files": matching_files,
            "file_count": len(matching_files),
            "directory_structure": directory_structure,
            "snippets_count": len(matching_snippets),
            "symbols_count": len(matching_symbols)
        }
        
    except Exception as e:
        print(f"Error analyzing directory: {str(e)}")
        traceback.print_exc()
        return {"error": str(e)}

#### Directory Contents Explorer
- Retrieves and organizes file and subdirectory information from specified directory paths
- Maps file metadata including language, name

In [137]:
def get_directory_contents(path: str) -> Dict:
    """
    Get contents of a specific directory
    
    Args:
        path: Path to directory
    
    Returns:
        Dictionary with directory contents
    """
    contents = {"files": [], "subdirectories": []}
    
    # Normalize path
    normalized_path = path.rstrip('/')
    
    # Get files directly in this directory
    for file_key, file_info in files.items():
        file_path = file_info.get("file_path", "")
        if not file_path:
            continue
            
        file_dir = '/'.join(file_path.split('/')[:-1])
        
        if file_dir == normalized_path:
            contents["files"].append({
                "key": file_key,
                "name": file_info.get("file_name", ""),
                "path": file_path,
                "language": file_info.get("language", "")
            })
    
    # Get subdirectories
    seen_subdirs = set()
    for file_key, file_info in files.items():
        file_path = file_info.get("file_path", "")
        if not file_path or not file_path.startswith(f"{normalized_path}/"):
            continue
            
        # Get next directory level
        remaining_path = file_path[len(normalized_path)+1:]
        if '/' in remaining_path:
            subdir = remaining_path.split('/')[0]
            subdir_path = f"{normalized_path}/{subdir}"
            
            if subdir_path not in seen_subdirs:
                seen_subdirs.add(subdir_path)
                contents["subdirectories"].append({
                    "name": subdir,
                    "path": subdir_path
                })
    
    return contents

#### Codebase Search Engine
- Finds code snippets containing specific search terms across the entire codebase
- Dynamically adapts to different database schemas by detecting appropriate code field names
- Retrieves complete context including file paths, line numbers, and language information
- Returns comprehensive results with full code snippets and their associated file metadata

In [138]:
def search_code(term: str) -> List[Dict]:
    """
    Search for code containing a specific term
    
    Args:
        term: The term to search for
        
    Returns:
        List of dictionaries containing matching code snippets
    """
    results = []
    
    try:
        # Determine the best attribute for code based on the sample
        code_field = 'code_snippet'
        snippet_sample = node_types.get('snippet', {}).get('sample', {})
        
        if 'code_snippet' in snippet_sample:
            code_field = 'code_snippet'
        elif 'code' in snippet_sample:
            code_field = 'code'
        elif 'snippet' in snippet_sample:
            code_field = 'snippet'
        
        aql = f"""
        FOR snippet IN {node_collection}
            FILTER snippet.type == 'snippet' AND snippet.{code_field} LIKE '%{term}%'
            LET file = (
                FOR edge IN {edge_collection}
                    FILTER edge._to == snippet._id
                    FOR file IN {node_collection}
                        FILTER file._id == edge._from AND file.type == 'file'
                        RETURN {{
                            "key": file._key,
                            "directory": file.directory,
                            "file_name": file.file_name,
                            "file_path": file.path || (file.directory + '/' + file.file_name),
                            "language": file.language
                        }}
            )
            RETURN {{
                "key": snippet._key,
                "code": snippet.{code_field},
                "start_line": snippet.start_line,
                "end_line": snippet.end_line,
                "file": LENGTH(file) > 0 ? file[0] : null
            }}
        """
        cursor = db.aql.execute(aql)
        for doc in cursor:
            results.append(doc)
            
    except Exception as e:
        print(f"Error searching code: {str(e)}")
        traceback.print_exc()
    
    return results

#### Code Structure Analysis Tool
- Analyzes and visualizes the structure of a codebase for a specific file or directory
- Groups files by directory, counts symbols by type, and analyzes language distribution
- Leverages Mistral LLM to provide high-level insights about architecture patterns and key components
- Returns comprehensive analysis including file counts, directory structure, and improvement recommendations

In [139]:
def analyze_code_structure(path: Optional[str] = None) -> Dict:
    """
    Analyze and visualize the structure of the code, either for a specific file or directory
    
    Args:
        path: Optional path to focus the analysis on
        
    Returns:
        Dictionary containing code structure analysis
    """
    print(f"Analyzing code structure at path: {path}")
    # Print the query you're using to find files
    print(f"Looking for files with path pattern: {path}")
    # Print a few sample files from your cache for comparison
    print("Sample file paths in database:")
    for i, (key, file_info) in enumerate(files.items()):
        print(f"File {i+1}: {file_info.get('file_path', 'unknown')}")
        if i >= 5:
            break
            
    try:
        # If path is provided, filter by that path
        path_filter = ""
        if path:
            path_filter = f" AND (file.path LIKE '{path}/%' OR file.path == '{path}')"
        
        # First, gather file structure
        aql = f"""
        FOR file IN {node_collection}
            FILTER file.type == 'file'{path_filter}
            RETURN {{
                "key": file._key,
                "file_path": file.path || (file.directory + '/' + file.file_name),
                "language": file.language
            }}
        """
        cursor = db.aql.execute(aql)
        files_list = [doc for doc in cursor]
        
        # Group files by directory
        directory_structure = {}
        for file in files_list:
            file_path = file.get("file_path", "")
            if not file_path:
                continue
            
            # Split path and use all but the last part as directory
            path_parts = file_path.split('/')
            if len(path_parts) > 1:
                directory = '/'.join(path_parts[:-1])
                filename = path_parts[-1]
            else:
                directory = "."
                filename = file_path
            
            if directory not in directory_structure:
                directory_structure[directory] = []
            
            directory_structure[directory].append({
                "file_name": filename,
                "file_path": file_path,
                "key": file.get("key"),
                "language": file.get("language", "unknown")
            })
        
        # Count symbols by type and file
        symbol_counts = {}
        if 'symbol' in node_types:
            path_join = ""
            if path:
                path_join = f" AND (file.path LIKE '{path}/%' OR file.path == '{path}')"
            
            aql = f"""
            FOR symbol IN {node_collection}
                FILTER symbol.type == 'symbol'
                LET file = (
                    FOR edge IN {edge_collection}
                        FILTER edge._to == symbol._id
                        FOR file IN {node_collection}
                            FILTER file._id == edge._from AND file.type == 'file'{path_join}
                            RETURN file
                )
                FILTER LENGTH(file) > 0
                COLLECT file_path = file[0].path || (file[0].directory + '/' + file[0].file_name),
                        symbol_type = symbol.symbol_type WITH COUNT INTO count
                RETURN {{
                    "file_path": file_path,
                    "symbol_type": symbol_type,
                    "count": count
                }}
            """
            cursor = db.aql.execute(aql)
            for doc in cursor:
                file_path = doc.get("file_path", "")
                symbol_type = doc.get("symbol_type", "unknown")
                count = doc.get("count", 0)
                
                if file_path not in symbol_counts:
                    symbol_counts[file_path] = {}
                
                symbol_counts[file_path][symbol_type] = count
        
        # Prepare analysis data for LLM
        file_count = len(files_list)
        directory_count = len(directory_structure)
        
        # Prepare information for visualization
        directory_tree = []
        for directory, file_list in directory_structure.items():
            directory_tree.append({
                "directory": directory,
                "files": file_list,
                "file_count": len(file_list)
            })
        
        # Sort directories by file count (descending)
        directory_tree.sort(key=lambda x: x["file_count"], reverse=True)
        
        # Analyze distribution of languages
        language_counts = {}
        for file in files_list:
            language = file.get("language", "unknown")
            if language not in language_counts:
                language_counts[language] = 0
            language_counts[language] += 1
        
        # Create an analysis with Mistral
        if files_list:
            structure_info = {
                "file_count": file_count,
                "directory_count": directory_count,
                "top_directories": [d["directory"] for d in directory_tree[:5]],
                "language_distribution": language_counts,
                "symbol_type_distribution": symbol_counts
            }
            
            # Create a prompt for the LLM to analyze the structure
            prompt = f"""
            Please analyze this codebase structure:
            
            {json.dumps(structure_info, indent=2)}
            
            Provide a JSON response with the following fields:
            1. overview: High-level description of the codebase structure
            2. architecture_patterns: Any architectural patterns you can identify
            3. key_components: The most important directories/modules
            4. language_insights: Analysis of the programming language usage
            5. recommendations: Suggestions for organization or structure improvements
            
            Format your response as a valid JSON object without any extra text or markdown.
            """
            
            # Create message for the LLM
            messages = [
                ChatMessage(role="user", content=prompt)
            ]
            
            # Get completion from Mistral
            chat_response = mistral_client.chat(
                model=model,
                messages=messages
            )
            
            # Extract the content from the response
            content = chat_response.choices[0].message.content
            
            # Try to parse the response as JSON
            try:
                analysis = json.loads(content)
            except json.JSONDecodeError:
                # If JSON parsing fails, return the raw text
                analysis = {"raw_analysis": content}
        else:
            analysis = {"message": "No files found matching the specified path"}
        
        return {
            "path": path or "entire codebase",
            "file_count": file_count,
            "directory_count": directory_count,
            "directory_structure": directory_tree,
            "language_distribution": language_counts,
            "symbol_distribution": symbol_counts,
            "analysis": analysis
        }
        
    except Exception as e:
        print(f"Error analyzing code structure: {str(e)}")
        traceback.print_exc()
        return {"error": str(e)}

#### Natural Language Query Processor for Codebase
- Processes natural language queries about the codebase using LLM-powered understanding
- Routes queries to appropriate specialized functions (symbol search, error analysis, code structure)
- Provides fallback handling when direct matches aren't found in the codebase
- Returns comprehensive results with conversational explanations of technical findings

In [140]:
def process_query(query: str) -> Dict:
    """
    Process natural language queries about the codebase
    
    Args:
        query: Natural language query about the codebase
        
    Returns:
        Dictionary containing the response to the query
    """
    global conversation_history, mistral_client, model
    
    try:
        # Save the query to conversation history
        conversation_history.append({"role": "user", "content": query})
        
        # Get database structure for context
        db_structure = get_database_structure()
        
        # Create context for the LLM
        context = {
            "db_structure": db_structure,
            "conversation_history": conversation_history[-5:] if len(conversation_history) > 1 else []
        }
        
        # Create a prompt for the LLM to analyze the query and decide what action to take
        prompt = f"""
        You are a codebase assistant that helps users find information in their codebase.
        
        Database Structure:
        {json.dumps(db_structure, indent=2)}
        
        Available functions:
        1. find_symbol_occurrences(symbol_name): Find all occurrences of a symbol
        2. find_by_name(name, symbol_type): Find function/class snippets by name
        3. analyze_symbol(name, symbol_type): Get detailed analysis of a function/class
        4. analyze_error(error_message): Analyze an error message and suggest solutions
        5. search_code(term): Search for code containing specific terms
        6. analyze_code_structure(path): Analyze the structure of the code
        7. analyze_directory(path): Analyze a specific directory in the codebase
        
        Conversation History:
        {json.dumps(context["conversation_history"], indent=2)}
        
        User Query: {query}
        
        First, determine what the user is asking and which function would be most appropriate to answer their query.
        
        Return a JSON response with:
        1. understanding: Brief explanation of what you think the user is asking
        2. function_to_call: The most appropriate function to call based on the query
        3. parameters: Parameters to pass to the function
        
        Format your response as a valid JSON object without any extra text or markdown.
        """
        
        # Create message for the LLM
        messages = [
            ChatMessage(role="user", content=prompt)
        ]
        
        # Get completion from Mistral
        chat_response = mistral_client.chat(
            model=model,
            messages=messages
        )
        
        # Extract the content from the response
        content = chat_response.choices[0].message.content
        
        # Try to parse the response as JSON
        try:
            # Clean up the content to remove markdown code blocks if present
            cleaned_content = content
            if content.strip().startswith("```") and content.strip().endswith("```"):
                # Extract the content between the backticks
                cleaned_content = "\n".join(content.strip().split("\n")[1:-1])
            query_analysis = json.loads(cleaned_content)
        except json.JSONDecodeError:
            return {"error": "Failed to parse LLM response as JSON", "raw_response": content}
        
        # Get the function to call and parameters
        function_name = query_analysis.get("function_to_call", "")
        parameters = query_analysis.get("parameters", {})
        
        # Call the appropriate function based on the analysis
        result = None
        if function_name == "find_symbol_occurrences":
            symbol_name = parameters.get("symbol_name", "")
            if symbol_name:
                result = find_symbol_occurrences(symbol_name)
        elif function_name == "find_by_name":
            name = parameters.get("name", "")
            symbol_type = parameters.get("symbol_type")
            if name:
                result = find_by_name(name, symbol_type)
        elif function_name == "analyze_symbol":
            name = parameters.get("name", "")
            symbol_type = parameters.get("symbol_type")
            if name:
                result = analyze_symbol(name, symbol_type)
        elif function_name == "analyze_error":
            error_message = parameters.get("error_message", "")
            if error_message:
                result = analyze_error(error_message)
        elif function_name == "search_code":
            term = parameters.get("term", "")
            if term:
                result = search_code(term)
        elif function_name == "analyze_code_structure":
            path = parameters.get("path")
            result = analyze_code_structure(path)
        elif function_name == "analyze_directory":
            path = parameters.get("path", "")
            if path:
                result = analyze_directory(path)
        else:
            result = {"error": f"Unknown function: {function_name}"}
        
        # If result is None or empty, try to handle the query directly
        if result is None or (isinstance(result, list) and len(result) == 0):
            # Create a fallback prompt for the LLM
            fallback_prompt = f"""
            You are a codebase assistant that helps users find information in their codebase.
            
            Database Structure:
            {json.dumps(db_structure, indent=2)}
            
            Unfortunately, I couldn't find specific information to answer the user's query:
            
            User Query: {query}
            
            Please provide a helpful response based on the general codebase structure.
            Your response should:
            1. Acknowledge what information is missing
            2. Suggest alternative approaches based on the available database structure
            3. Ask for any clarification if needed
            
            Format your response as a conversation, not as JSON.
            """
            
            # Create message for the LLM
            fallback_messages = [
                ChatMessage(role="user", content=fallback_prompt)
            ]
            
            # Get completion from Mistral
            fallback_response = mistral_client.chat(
                model=model,
                messages=fallback_messages
            )
            
            # Extract the content from the response
            fallback_content = fallback_response.choices[0].message.content
            
            # Add the fallback response to conversation history
            conversation_history.append({"role": "assistant", "content": fallback_content})
            
            return {
                "query": query,
                "understanding": query_analysis.get("understanding", ""),
                "response_type": "fallback",
                "response": fallback_content
            }
        
        # Generate a user-friendly explanation of the result
        explanation_prompt = f"""
        You are a codebase assistant that helps users find information in their codebase.
        
        User Query: {query}
        
        Understanding: {query_analysis.get("understanding", "")}
        
        Result: {json.dumps(result, indent=2)}
        
        Please explain these results to the user in a clear, conversational way.
        If results include code snippets, explain what the code does.
        If there are multiple results, summarize the key findings.
        Include specific details from the results to make your explanation concrete.
        
        Format your response as a conversation, not as JSON.
        """
        
        # Create message for the LLM
        explanation_messages = [
            ChatMessage(role="user", content=explanation_prompt)
        ]
        
        # Get completion from Mistral
        explanation_response = mistral_client.chat(
            model=model,
            messages=explanation_messages
        )
        
        # Extract the content from the response
        explanation = explanation_response.choices[0].message.content
        
        # Add the explanation to conversation history
        conversation_history.append({"role": "assistant", "content": explanation})
        
        return {
            "query": query,
            "understanding": query_analysis.get("understanding", ""),
            "function_called": function_name,
            "parameters": parameters,
            "raw_result": result,
            "explanation": explanation
        }
        
    except Exception as e:
        print(f"Error processing query: {str(e)}")
        traceback.print_exc()
        return {"error": str(e)}



#### Codebase Conversational Interface
- Serves as the main entry point for users to interact with the codebase through natural language
- Processes queries through the query processor and handles different response types
- Provides graceful error handling and user-friendly error messages
- Returns conversational responses that explain technical codebase information in accessible language

In [141]:
def chat_with_codebase(query: str) -> str:
    """
    Main conversational function that processes user queries about the codebase
    
    Args:
        query: User's natural language query
        
    Returns:
        String containing the response to the user
    """
    try:
        # Process the query
        result = process_query(query)
        
        # If an error occurred, return an error message
        if "error" in result:
            error_message = result.get("error", "An unknown error occurred")
            if "raw_response" in result:
                return f"I encountered an error: {error_message}\n\nRaw response from LLM: {result['raw_response']}"
            return f"I encountered an error: {error_message}"
        
        # If the result contains an explanation, return it
        if "explanation" in result:
            return result["explanation"]
        
        # If the result contains a response, return it
        if "response" in result:
            return result["response"]
        
        # This is a fallback if neither explanation nor response are available
        return "I processed your query but couldn't generate a proper explanation. Please try rephrasing your question."
        
    except Exception as e:
        print(f"Error in chat_with_codebase: {str(e)}")
        traceback.print_exc()
        return f"I'm sorry, I encountered an error while processing your query: {str(e)}"


#### Conversation State Management
- Resets the conversation history to provide a clean slate for new interactions
- Clears accumulated context to prevent interactions from being influenced by previous queries
- Helps manage memory usage by removing stored conversation data
- Provides users with a way to start fresh conversations about different parts of the codebase

In [142]:
def reset_conversation():
    """Reset the conversation history"""
    global conversation_history
    conversation_history = []

#### Codebase Query System Initializer
- Sets up connections to ArangoDB and Mistral API for codebase analysis
- Dynamically discovers graph structure and initializes data caches for efficient querying
- Configures conversation state and relationship mappings between files, snippets, and symbols
- Provides flexible configuration through environment variables and explicit parameters

In [None]:
def initialize_codebase_query(
    db_name: str = "_system",
    username: str = "root",
    password: str = None,
    host: str = None,
    mistral_api_key: Optional[str] = None,
    model_name: str = "mistral-large-latest",
    graph: str = None
):
    """
    Initialize the codebase query system that dynamically discovers the graph structure.
    
    Args:
        db_name: ArangoDB database name
        username: ArangoDB username
        password: ArangoDB password
        host: ArangoDB host URL
        mistral_api_key: Mistral API key (if None, will try to get from environment)
        model_name: Mistral model to use
        graph: Graph name (if None, will try to discover the first available graph)
    """
    global db, client, mistral_client, model, graph_name, files, snippets, symbols
    global db_schema, node_types, symbol_name_index, file_to_snippets, file_to_symbols, snippet_to_symbols
    global conversation_history, node_collection, edge_collection
    
    # Connect to ArangoDB
    if not host:
        host = os.environ.get("ARANGO_HOST", "http://localhost:8529")
    client = ArangoClient(hosts=host)
    
    if not password:
        password = os.environ.get("ARANGO_PASSWORD")
        if not password:
            raise ValueError("ArangoDB password not provided and not found in environment")
    
    db = client.db(db_name, username=username, password=password)
    
    # Connect to Mistral API
    if mistral_api_key is None:
        mistral_api_key = os.environ.get("MISTRAL_API_KEY")
    if mistral_api_key is None:
        raise ValueError("Mistral API key not provided and not found in environment")
    
    # Initialize Mistral client
    mistral_client = MistralClient(api_key=mistral_api_key)
    model = model_name
    
    # Dynamically discover graph structure
    graph_name = graph
    
    # Initialize data structures
    files = {}
    snippets = {}
    symbols = {}
    symbol_name_index = {}
    file_to_snippets = {}
    file_to_symbols = {}
    snippet_to_symbols = {}
    conversation_history = []
    
    # Discover graph structure
    discover_graph_structure()
    
    # Get schema information
    db_schema = get_db_schema()
    
    # Analyze node types
    node_types = analyze_node_types()
    
    # Initialize cache
    initialize_cache()
    
    print("Codebase query system initialized successfully")

#### Codebase Query System Usage Example
- Demonstrates practical implementation in a Jupyter notebook with environment variable configuration
- Shows complete workflow from initialization to querying and conversation management
- Includes environment loading, system initialization with credentials, and graph specification
- Provides a concrete example of querying for error handling functions and conversation reset

In [152]:
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

graph = "FlaskRepv1"

# Initialize the system
initialize_codebase_query(
    db_name=os.getenv("ARANGO_DB_NAME"),
    username=os.getenv("ARANGO_USERNAME"),
    password=os.getenv("ARANGO_PASSWORD"),
    host=os.getenv("ARANGO_HOST"),
    mistral_api_key=os.getenv("MISTRAL_API_KEY"),
    model_name=os.getenv("MISTRAL_MODEL_NAME"),
    graph=graph
)

# Ask a question about the codebase
query = "Explain me the has_level_handler function in my codebase"
response = chat_with_codebase(query)
print(response)

# Reset the conversation if needed
reset_conversation()

No edge definitions found, using default naming pattern
Using default collections: Nodes=FlaskRepv1_node, Edges=FlaskRepv1_node_to_FlaskRepv1_node
Found type field: type
Found edge type field: edge_type
Schema validation complete: type_field=type, path_field=None, edge_type_field=edge_type
Type: directory, Count: 69
Type: file, Count: 83
Type: snippet, Count: 930
Type: symbol, Count: 1893
Cached 83 files
Cached 930 code snippets
Using 'context' as fallback for symbol name field
Using name_field: context, type_field: type
Symbol query: 
                FOR v IN FlaskRepv1_node
                    FILTER v.type == 'symbol'
                    RETURN v
                
Sample symbol count: 1893
Sample symbol fields: ['_key', '_id', '_rev', 'type', 'symbol_type', 'line_number', 'context', 'docstring']
Sample symbol name value: def test_index(client, auth):
    response = client.get("/")
    assert b"Log In" in response.data
    assert b"Register" in response.data

    auth.login()
    resp

Traceback (most recent call last):
  File "/var/folders/_2/xx5z8xdj6j98wh4vt2b59jz80000gp/T/ipykernel_49116/3271334447.py", line 63, in process_query
    chat_response = mistral_client.chat(
  File "/Users/Viku/GitHub/scopium/.venv/lib/python3.10/site-packages/mistralai/client.py", line 202, in chat
    request = self._make_chat_request(
  File "/Users/Viku/GitHub/scopium/.venv/lib/python3.10/site-packages/mistralai/client_base.py", line 178, in _make_chat_request
    request_data["model"] = self._get_model(model)
  File "/Users/Viku/GitHub/scopium/.venv/lib/python3.10/site-packages/mistralai/client_base.py", line 50, in _get_model
    if self._default_model is None:
AttributeError: 'MistralClient' object has no attribute '_default_model'


### Appendix 1
- Class for visualising the graph made using a .html file

In [None]:
# import os
# import ast
# import networkx as nx
# from typing import Dict, Set, List, Tuple, Optional, Union
# import json
# from arango import ArangoClient
# import re
# import glob

# class CodebaseVisualizer:
#     def __init__(self, root_dir: str, supported_languages=None):
#         self.root_dir = root_dir
#         self.graph = nx.DiGraph()
#         self.file_contents: Dict[str, str] = {}
#         self.import_relations: Dict[str, List[Tuple[str, int]]] = {}  # file -> [(module, line_no)]
#         self.module_symbols: Dict[str, Dict[str, Dict[str, any]]] = {}  # file -> {symbol -> {type, line_no, context}}
#         self.symbol_references: Dict[str, List[Tuple[str, int, str]]] = {}  # symbol -> [(file, line_no, context)]
#         self.file_index: Dict[str, int] = {}  # Maps files to indices
#         self.current_index = 0
#         self.directories: Set[str] = set()
#         # Add a new index for all symbols to quickly locate them
#         self.symbol_index: Dict[str, List[Dict]] = {}  # symbol -> [{file, type, line_no, context}]
        
#         # Define supported languages
#         self.supported_languages = supported_languages or ["python", "cpp", "java", "go"]
        
#         # Language file extensions mapping
#         self.language_extensions = {
#             "python": [".py"],
#             "cpp": [".c", ".cpp", ".h", ".hpp", ".cc", ".cxx", ".hxx"],
#             "java": [".java"],
#             "go": [".go"]
#         }

#     def _get_next_index(self) -> int:
#         """Get next available index for file indexing."""
#         self.current_index += 1
#         return self.current_index

#     def _chunk_code(self, code: str, lines_per_chunk: int = 20) -> List[Dict]:
#         """
#         Chunk the given code into snippets.
#         Returns a list of dictionaries with 'code_snippet', 'start_line', and 'end_line'.
#         """
#         lines = code.splitlines()
#         chunks = []
#         for i in range(0, len(lines), lines_per_chunk):
#             chunk_lines = lines[i:i + lines_per_chunk]
#             chunk = {
#                 'code_snippet': '\n'.join(chunk_lines),
#                 'start_line': i + 1,
#                 'end_line': i + len(chunk_lines)
#             }
#             chunks.append(chunk)
#         return chunks

#     def _get_context_around_line(self, file_path: str, line_no: int, context_lines: int = 3) -> str:
#         """Extract context around a specific line in a file."""
#         if file_path not in self.file_contents:
#             return ""
        
#         lines = self.file_contents[file_path].splitlines()
#         start = max(0, line_no - context_lines - 1)
#         end = min(len(lines), line_no + context_lines)
        
#         context = "\n".join(lines[start:end])
#         return context

#     def _detect_language(self, file_path: str) -> str:
#         """Detect the programming language of a file based on its extension."""
#         _, ext = os.path.splitext(file_path)
#         ext = ext.lower()
        
#         for language, extensions in self.language_extensions.items():
#             if ext in extensions:
#                 return language
                
#         return "unknown"

#     def parse_files(self) -> None:
#         """Parse all files in the directory and build relationships."""
#         # First pass: Index all files and create directory nodes
#         for root, dirs, files in os.walk(self.root_dir):
#             # Add directory node
#             rel_dir = os.path.relpath(root, self.root_dir)
#             if rel_dir != '.':
#                 self.directories.add(rel_dir)
#                 self.graph.add_node(rel_dir, type='directory')
                
#                 # Add edge from parent directory to this directory (if not root)
#                 parent_dir = os.path.dirname(rel_dir)
#                 if parent_dir and parent_dir != '.':
#                     self.graph.add_edge(parent_dir, rel_dir, edge_type='contains_directory')

#             # Index files of supported languages
#             for file in files:
#                 file_path = os.path.join(root, file)
#                 rel_path = os.path.relpath(file_path, self.root_dir)
#                 file_language = self._detect_language(file_path)
                
#                 if file_language in self.supported_languages:
#                     self.file_index[rel_path] = self._get_next_index()
                    
#                     # Add node for this file
#                     self.graph.add_node(rel_path, type='file', file_index=self.file_index[rel_path], language=file_language)
                    
#                     # Connect file to its directory
#                     if rel_dir != '.':
#                         self.graph.add_edge(rel_dir, rel_path, edge_type='contains_file')
                    
#                     try:
#                         with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
#                             content = f.read()
#                             self.file_contents[rel_path] = content
#                             self._analyze_file(rel_path, content, file_language)
#                     except Exception as e:
#                         print(f"Error parsing {file_path}: {e}")
        
#         # Second pass: Find symbol references across files
#         for file_path, content in self.file_contents.items():
#             file_language = self._detect_language(file_path)
#             self._find_references_in_file(file_path, content, file_language)
        
#         # Build the symbol index after all analyses
#         self._build_symbol_index()

#     def _analyze_file(self, file_path: str, content: str, language: str) -> None:
#         """Analyze a file for imports and symbols with line numbers and context."""
#         if language == "python":
#             self._analyze_python_file(file_path, content)
#         elif language == "cpp":
#             self._analyze_cpp_file(file_path, content)
#         elif language == "java":
#             self._analyze_java_file(file_path, content)
#         elif language == "go":
#             self._analyze_go_file(file_path, content)

#     def _analyze_python_file(self, file_path: str, content: str) -> None:
#         """Analyze a Python file for imports and symbols."""
#         try:
#             tree = ast.parse(content)
#             imports = []
#             symbols = {}

#             for node in ast.walk(tree):
#                 # Track imports
#                 if isinstance(node, (ast.Import, ast.ImportFrom)):
#                     if isinstance(node, ast.Import):
#                         for name in node.names:
#                             imports.append((name.name, node.lineno))
#                     else:  # ImportFrom
#                         module = node.module if node.module else ''
#                         for name in node.names:
#                             imports.append((f"{module}.{name.name}" if module else name.name, node.lineno))

#                 # Track defined symbols with line numbers and context
#                 elif isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.Assign)):
#                     if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
#                         symbol_name = node.name
#                         symbol_type = 'class' if isinstance(node, ast.ClassDef) else 'function'
#                         line_no = node.lineno
#                         context = self._extract_python_node_source(content, node)
                        
#                         symbols[symbol_name] = {
#                             'type': symbol_type,
#                             'line_no': line_no,
#                             'context': context,
#                             'docstring': ast.get_docstring(node)
#                         }
#                     elif isinstance(node, ast.Assign):
#                         # Handle variable assignments
#                         for target in node.targets:
#                             if isinstance(target, ast.Name):
#                                 symbol_name = target.id
#                                 line_no = node.lineno
#                                 context = self._extract_python_node_source(content, node)
                                
#                                 symbols[symbol_name] = {
#                                     'type': 'variable',
#                                     'line_no': line_no,
#                                     'context': context
#                                 }

#             self.import_relations[file_path] = imports
#             self.module_symbols[file_path] = symbols

#         except Exception as e:
#             print(f"Error analyzing Python file {file_path}: {e}")

#     def _extract_python_node_source(self, source: str, node) -> str:
#         """Extract the source code for a Python AST node."""
#         try:
#             lines = source.splitlines()
#             if hasattr(node, 'lineno') and hasattr(node, 'end_lineno'):
#                 start = node.lineno - 1
#                 end = getattr(node, 'end_lineno', start + 1)
#                 return '\n'.join(lines[start:end])
#             return ""
#         except Exception:
#             return ""

#     def _analyze_cpp_file(self, file_path: str, content: str) -> None:
#         """Analyze a C/C++ file for includes and symbols."""
#         imports = []
#         symbols = {}
        
#         # Process content line by line
#         lines = content.splitlines()
        
#         # Regular expressions for C/C++ code analysis
#         include_pattern = re.compile(r'#include\s+[<"]([^>"]+)[>"]')
#         class_pattern = re.compile(r'(?:class|struct)\s+(\w+)')
#         function_pattern = re.compile(r'(\w+)\s*\([^)]*\)\s*(?:const|override|final|noexcept)?\s*(?:{|;)')
#         namespace_pattern = re.compile(r'namespace\s+(\w+)')
        
#         for line_no, line in enumerate(lines, 1):
#             # Find include statements
#             include_match = include_pattern.search(line)
#             if include_match:
#                 imports.append((include_match.group(1), line_no))
            
#             # Find class/struct definitions
#             class_match = class_pattern.search(line)
#             if class_match:
#                 class_name = class_match.group(1)
#                 context = self._get_context_around_line(file_path, line_no)
#                 symbols[class_name] = {
#                     'type': 'class',
#                     'line_no': line_no,
#                     'context': context
#                 }
            
#             # Find function definitions (simplified)
#             function_match = function_pattern.search(line)
#             if function_match and not line.strip().startswith('#') and not line.strip().startswith('//'):
#                 function_name = function_match.group(1)
#                 # Skip some common keywords that might be mistaken for functions
#                 if function_name not in ['if', 'while', 'for', 'switch', 'return']:
#                     context = self._get_context_around_line(file_path, line_no)
#                     symbols[function_name] = {
#                         'type': 'function',
#                         'line_no': line_no,
#                         'context': context
#                     }
            
#             # Find namespace definitions
#             namespace_match = namespace_pattern.search(line)
#             if namespace_match:
#                 namespace_name = namespace_match.group(1)
#                 context = self._get_context_around_line(file_path, line_no)
#                 symbols[namespace_name] = {
#                     'type': 'namespace',
#                     'line_no': line_no,
#                     'context': context
#                 }
        
#         self.import_relations[file_path] = imports
#         self.module_symbols[file_path] = symbols

#     def _analyze_java_file(self, file_path: str, content: str) -> None:
#         """Analyze a Java file for imports and symbols."""
#         imports = []
#         symbols = {}
        
#         # Process content line by line
#         lines = content.splitlines()
        
#         # Regular expressions for Java code analysis
#         package_pattern = re.compile(r'package\s+([\w.]+)')
#         import_pattern = re.compile(r'import\s+([\w.]+(?:\.\*)?)')
#         class_pattern = re.compile(r'(?:public|private|protected)?\s*(?:abstract|final)?\s*class\s+(\w+)')
#         interface_pattern = re.compile(r'(?:public|private|protected)?\s*interface\s+(\w+)')
#         method_pattern = re.compile(r'(?:public|private|protected)?\s*(?:static|final|abstract)?\s*(?:[\w<>[\],\s]+)\s+(\w+)\s*\([^)]*\)')
        
#         for line_no, line in enumerate(lines, 1):
#             # Find package declaration
#             package_match = package_pattern.search(line)
#             if package_match:
#                 package_name = package_match.group(1)
#                 imports.append((package_name, line_no))
            
#             # Find import statements
#             import_match = import_pattern.search(line)
#             if import_match:
#                 import_name = import_match.group(1)
#                 imports.append((import_name, line_no))
            
#             # Find class definitions
#             class_match = class_pattern.search(line)
#             if class_match:
#                 class_name = class_match.group(1)
#                 context = self._get_context_around_line(file_path, line_no)
#                 symbols[class_name] = {
#                     'type': 'class',
#                     'line_no': line_no,
#                     'context': context
#                 }
            
#             # Find interface definitions
#             interface_match = interface_pattern.search(line)
#             if interface_match:
#                 interface_name = interface_match.group(1)
#                 context = self._get_context_around_line(file_path, line_no)
#                 symbols[interface_name] = {
#                     'type': 'interface',
#                     'line_no': line_no,
#                     'context': context
#                 }
            
#             # Find method definitions
#             method_match = method_pattern.search(line)
#             if method_match:
#                 method_name = method_match.group(1)
#                 # Skip some common keywords that might be mistaken for methods
#                 if method_name not in ['if', 'while', 'for', 'switch', 'return']:
#                     context = self._get_context_around_line(file_path, line_no)
#                     symbols[method_name] = {
#                         'type': 'method',
#                         'line_no': line_no,
#                         'context': context
#                     }
        
#         self.import_relations[file_path] = imports
#         self.module_symbols[file_path] = symbols

#     def _analyze_go_file(self, file_path: str, content: str) -> None:
#         """Analyze a Go file for imports and symbols."""
#         imports = []
#         symbols = {}
        
#         # Process content line by line
#         lines = content.splitlines()
        
#         # Regular expressions for Go code analysis
#         package_pattern = re.compile(r'package\s+(\w+)')
#         import_single_pattern = re.compile(r'import\s+"([^"]+)"')
#         import_multi_start_pattern = re.compile(r'import\s+\(')
#         import_multi_line_pattern = re.compile(r'\s*"([^"]+)"')
#         func_pattern = re.compile(r'func\s+(?:\([^)]+\)\s+)?(\w+)')
#         struct_pattern = re.compile(r'type\s+(\w+)\s+struct')
#         interface_pattern = re.compile(r'type\s+(\w+)\s+interface')
        
#         in_import_block = False
        
#         for line_no, line in enumerate(lines, 1):
#             # Find package declaration
#             package_match = package_pattern.search(line)
#             if package_match:
#                 package_name = package_match.group(1)
#                 imports.append((f"package {package_name}", line_no))
            
#             # Handle single-line imports
#             import_match = import_single_pattern.search(line)
#             if import_match:
#                 import_name = import_match.group(1)
#                 imports.append((import_name, line_no))
            
#             # Handle multi-line imports
#             if import_multi_start_pattern.search(line):
#                 in_import_block = True
#                 continue
            
#             if in_import_block:
#                 if line.strip() == ')':
#                     in_import_block = False
#                     continue
                    
#                 import_line_match = import_multi_line_pattern.search(line)
#                 if import_line_match:
#                     import_name = import_line_match.group(1)
#                     imports.append((import_name, line_no))
            
#             # Find function definitions
#             func_match = func_pattern.search(line)
#             if func_match:
#                 func_name = func_match.group(1)
#                 context = self._get_context_around_line(file_path, line_no)
#                 symbols[func_name] = {
#                     'type': 'function',
#                     'line_no': line_no,
#                     'context': context
#                 }
            
#             # Find struct definitions
#             struct_match = struct_pattern.search(line)
#             if struct_match:
#                 struct_name = struct_match.group(1)
#                 context = self._get_context_around_line(file_path, line_no)
#                 symbols[struct_name] = {
#                     'type': 'struct',
#                     'line_no': line_no,
#                     'context': context
#                 }
            
#             # Find interface definitions
#             interface_match = interface_pattern.search(line)
#             if interface_match:
#                 interface_name = interface_match.group(1)
#                 context = self._get_context_around_line(file_path, line_no)
#                 symbols[interface_name] = {
#                     'type': 'interface',
#                     'line_no': line_no,
#                     'context': context
#                 }
        
#         self.import_relations[file_path] = imports
#         self.module_symbols[file_path] = symbols

#     def _find_references_in_file(self, file_path: str, content: str, language: str) -> None:
#         """Find references to symbols in a file based on its language."""
#         if language == "python":
#             self._find_references_in_python_file(file_path, content)
#         elif language == "cpp":
#             self._find_references_in_cpp_file(file_path, content)
#         elif language == "java":
#             self._find_references_in_java_file(file_path, content)
#         elif language == "go":
#             self._find_references_in_go_file(file_path, content)

#     def _find_references_in_python_file(self, file_path: str, content: str) -> None:
#         """Find references to symbols in a Python file."""
#         try:
#             tree = ast.parse(content)
            
#             for node in ast.walk(tree):
#                 # Find variable references
#                 if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
#                     symbol_name = node.id
#                     line_no = node.lineno
                    
#                     # Track reference with context
#                     if symbol_name not in self.symbol_references:
#                         self.symbol_references[symbol_name] = []
                    
#                     context = self._get_context_around_line(file_path, line_no)
#                     self.symbol_references[symbol_name].append((file_path, line_no, context))
                
#                 # Find attribute references (e.g., obj.method())
#                 elif isinstance(node, ast.Attribute) and isinstance(node.ctx, ast.Load):
#                     attr_name = node.attr
#                     line_no = node.lineno
                    
#                     if attr_name not in self.symbol_references:
#                         self.symbol_references[attr_name] = []
                    
#                     context = self._get_context_around_line(file_path, line_no)
#                     self.symbol_references[attr_name].append((file_path, line_no, context))
        
#         except Exception as e:
#             print(f"Error finding references in Python file {file_path}: {e}")

#     def _find_references_in_cpp_file(self, file_path: str, content: str) -> None:
#         """Find references to symbols in a C/C++ file."""
#         # Get all symbol names from all files to check for references
#         all_symbols = set()
#         for symbols_dict in self.module_symbols.values():
#             all_symbols.update(symbols_dict.keys())
        
#         # Process content line by line
#         lines = content.splitlines()
        
#         for line_no, line in enumerate(lines, 1):
#             # Look for references to any known symbol
#             for symbol_name in all_symbols:
#                 # Simple pattern matching (would be more robust with proper C++ parsing)
#                 pattern = r'\b' + re.escape(symbol_name) + r'\b'
#                 if re.search(pattern, line):
#                     # Check if this is not the definition line
#                     if (file_path in self.module_symbols and 
#                         symbol_name in self.module_symbols[file_path] and 
#                         self.module_symbols[file_path][symbol_name]['line_no'] == line_no):
#                         continue
                    
#                     if symbol_name not in self.symbol_references:
#                         self.symbol_references[symbol_name] = []
                    
#                     context = self._get_context_around_line(file_path, line_no)
#                     self.symbol_references[symbol_name].append((file_path, line_no, context))

#     def _find_references_in_java_file(self, file_path: str, content: str) -> None:
#         """Find references to symbols in a Java file."""
#         # Get all symbol names from all files to check for references
#         all_symbols = set()
#         for symbols_dict in self.module_symbols.values():
#             all_symbols.update(symbols_dict.keys())
        
#         # Process content line by line
#         lines = content.splitlines()
        
#         for line_no, line in enumerate(lines, 1):
#             # Skip comment lines and import/package declarations
#             if (line.strip().startswith("//") or 
#                 line.strip().startswith("/*") or 
#                 line.strip().startswith("import ") or 
#                 line.strip().startswith("package ")):
#                 continue
            
#             # Look for references to any known symbol
#             for symbol_name in all_symbols:
#                 # Simple pattern matching with word boundaries
#                 pattern = r'\b' + re.escape(symbol_name) + r'\b'
#                 if re.search(pattern, line):
#                     # Check if this is not the definition line
#                     if (file_path in self.module_symbols and 
#                         symbol_name in self.module_symbols[file_path] and 
#                         self.module_symbols[file_path][symbol_name]['line_no'] == line_no):
#                         continue
                    
#                     if symbol_name not in self.symbol_references:
#                         self.symbol_references[symbol_name] = []
                    
#                     context = self._get_context_around_line(file_path, line_no)
#                     self.symbol_references[symbol_name].append((file_path, line_no, context))

#     def _find_references_in_go_file(self, file_path: str, content: str) -> None:
#         """Find references to symbols in a Go file."""
#         # Get all symbol names from all files to check for references
#         all_symbols = set()
#         for symbols_dict in self.module_symbols.values():
#             all_symbols.update(symbols_dict.keys())
        
#         # Process content line by line
#         lines = content.splitlines()
        
#         for line_no, line in enumerate(lines, 1):
#             # Skip comment lines and import/package declarations
#             if (line.strip().startswith("//") or 
#                 line.strip().startswith("/*") or 
#                 line.strip().startswith("import ") or 
#                 line.strip().startswith("package ")):
#                 continue
            
#             # Look for references to any known symbol
#             for symbol_name in all_symbols:
#                 # Simple pattern matching with word boundaries
#                 pattern = r'\b' + re.escape(symbol_name) + r'\b'
#                 if re.search(pattern, line):
#                     # Check if this is not the definition line
#                     if (file_path in self.module_symbols and 
#                         symbol_name in self.module_symbols[file_path] and 
#                         self.module_symbols[file_path][symbol_name]['line_no'] == line_no):
#                         continue
                    
#                     if symbol_name not in self.symbol_references:
#                         self.symbol_references[symbol_name] = []
                    
#                     context = self._get_context_around_line(file_path, line_no)
#                     self.symbol_references[symbol_name].append((file_path, line_no, context))

#     def _build_symbol_index(self) -> None:
#         """Build a comprehensive index of all symbols and where they're defined/used."""
#         # Initialize the symbol index
#         self.symbol_index = {}
        
#         # First, add all symbol definitions
#         for file_path, symbols in self.module_symbols.items():
#             for symbol_name, details in symbols.items():
#                 if symbol_name not in self.symbol_index:
#                     self.symbol_index[symbol_name] = []
                
#                 self.symbol_index[symbol_name].append({
#                     'file': file_path,
#                     'type': 'definition',
#                     'symbol_type': details['type'],
#                     'line_no': details['line_no'],
#                     'context': details.get('context', ''),
#                     'docstring': details.get('docstring', '')
#                 })
        
#         # Then, add all references
#         for symbol_name, references in self.symbol_references.items():
#             if symbol_name not in self.symbol_index:
#                 self.symbol_index[symbol_name] = []
            
#             for file_path, line_no, context in references:
#                 # Avoid duplicating references if they're already in definitions
#                 if not any(ref['file'] == file_path and ref['line_no'] == line_no and ref['type'] == 'definition' 
#                           for ref in self.symbol_index.get(symbol_name, [])):
#                     self.symbol_index[symbol_name].append({
#                         'file': file_path,
#                         'type': 'reference',
#                         'line_no': line_no,
#                         'context': context
#                     })

#     def build_graph(self) -> nx.DiGraph:
#         """Build the NetworkX graph with enhanced node and edge information."""
#         # We've already added basic file and directory nodes during parsing
#         # Now add more detailed connections and data
        
#         # Add nodes for all directories (if not already added)
#         for directory in self.directories:
#             if not self.graph.has_node(directory):
#                 self.graph.add_node(directory, type='directory')
            
#             # Ensure parent directories exist and are connected
#             parts = directory.split(os.sep)
#             for i in range(1, len(parts)):
#                 parent_path = os.sep.join(parts[:i])
#                 if parent_path and not self.graph.has_node(parent_path):
#                     self.graph.add_node(parent_path, type='directory')
#                     self.directories.add(parent_path)
                
#                 # Connect parent to child directory
#                 if parent_path:
#                     child_path = os.sep.join(parts[:i+1])
#                     self.graph.add_edge(parent_path, child_path, edge_type='contains_directory')
        
#         # Add nodes for all files with indices and code snippet nodes
#         for file_path, file_idx in self.file_index.items():
#             language = self._detect_language(file_path)
            
#             # Update file node if it exists, create it otherwise
#             if self.graph.has_node(file_path):
#                 self.graph.nodes[file_path].update({
#                     'file_index': file_idx,
#                     'directory': os.path.dirname(file_path),
#                     'language': language
#                 })
#             else:
#                 self.graph.add_node(file_path, 
#                                    type='file',
#                                    file_index=file_idx,
#                                    directory=os.path.dirname(file_path),
#                                    language=language)
            
#             # Connect file to its directory
#             directory = os.path.dirname(file_path)
#             if directory:
#                 # Make sure the directory node exists
#                 if not self.graph.has_node(directory):
#                     self.graph.add_node(directory, type='directory')
#                     self.directories.add(directory)
                
#                 # Add edge from directory to file if it doesn't exist
#                 if not self.graph.has_edge(directory, file_path):
#                     self.graph.add_edge(directory, file_path, edge_type='contains_file')
            
#             # Create snippet nodes for the entire file
#             if file_path in self.file_contents:
#                 chunks = self._chunk_code(self.file_contents[file_path])
#                 for idx, chunk_info in enumerate(chunks):
#                     snippet_node = f"{file_path}::snippet::{idx}"
#                     self.graph.add_node(snippet_node,
#                                        type='snippet',
#                                        code_snippet=chunk_info['code_snippet'],
#                                        start_line=chunk_info['start_line'],
#                                        end_line=chunk_info['end_line'],
#                                        language=language)
#                     # Connect file node to snippet node
#                     self.graph.add_edge(file_path, snippet_node, 
#                                        edge_type='contains_snippet',
#                                        start_line=chunk_info['start_line'],
#                                        end_line=chunk_info['end_line'])

#             # Add nodes for symbols in this file
#             for symbol, details in self.module_symbols.get(file_path, {}).items():
#                 symbol_node = f"{file_path}::{symbol}"
#                 self.graph.add_node(symbol_node, 
#                                    type='symbol',
#                                    symbol_type=details['type'],
#                                    line_number=details['line_no'],
#                                    context=details.get('context', ''),
#                                    docstring=details.get('docstring', ''))
#                 self.graph.add_edge(file_path, symbol_node, 
#                                    edge_type='defines',
#                                    line_number=details['line_no'])

#         # Add edges for imports with line numbers
#         for file_path, imports in self.import_relations.items():
#             for imp, line_no in imports:
#                 # Look for matching files or symbols
#                 for target_file, symbols in self.module_symbols.items():
#                     if imp in symbols:
#                         self.graph.add_edge(file_path, 
#                                            f"{target_file}::{imp}",
#                                            edge_type='import',
#                                            line_number=line_no)
#                     # For Python, handle module imports
#                     elif self._detect_language(file_path) == "python" and target_file.replace('.py', '').endswith(imp):
#                         self.graph.add_edge(file_path, 
#                                            target_file,
#                                            edge_type='import',
#                                            line_number=line_no)
#                     # For Java, handle package imports
#                     elif self._detect_language(file_path) == "java" and imp.startswith(os.path.splitext(os.path.basename(target_file))[0]):
#                         self.graph.add_edge(file_path, 
#                                            target_file,
#                                            edge_type='import',
#                                            line_number=line_no)
        
#         # Add edges for symbol references
#         for symbol, references in self.symbol_references.items():
#             for file_path, line_no, context in references:
#                 # Find symbol nodes that match this reference
#                 for target_file, symbols in self.module_symbols.items():
#                     if symbol in symbols:
#                         # Create reference edge
#                         self.graph.add_edge(file_path, 
#                                            f"{target_file}::{symbol}",
#                                            edge_type='references',
#                                            line_number=line_no,
#                                            context=context)
        
#         return self.graph

#     def export_to_arango(self, url: str, username: str, password: str, db_name: str = "codebase", 
#                          graph_name: str = "Custom_Flask", node_collection: str = "nodes", 
#                          edge_collection: str = "edges", overwrite: bool = False) -> None:
#         """
#         Export the NetworkX graph to ArangoDB.
        
#         Args:
#             url: ArangoDB server URL
#             username: ArangoDB username
#             password: ArangoDB password
#             db_name: Database name
#             graph_name: Graph name
#             node_collection: Node collection name
#             edge_collection: Edge collection name
#             overwrite: Whether to overwrite existing database
#         """
#         # Initialize ArangoDB client
#         client = ArangoClient(hosts=url)
#         sys_db = client.db('_system', username=username, password=password)
        
#         # Create or use existing database
#         if sys_db.has_database(db_name):
#             if overwrite:
#                 sys_db.delete_database(db_name)
#                 sys_db.create_database(db_name)
#                 print(f"Database '{db_name}' recreated.")
#             else:
#                 print(f"Using existing database '{db_name}'.")
#         else:
#             sys_db.create_database(db_name)
#             print(f"Database '{db_name}' created.")
        
#         # Connect to the database
#         db = client.db(db_name, username=username, password=password)
        
#         # Create or use existing collections
#         if db.has_collection(node_collection):
#             nodes = db.collection(node_collection)
#             nodes.truncate()
#         else:
#             nodes = db.create_collection(node_collection)
        
#         if db.has_collection(edge_collection):
#             edges = db.collection(edge_collection)
#             edges.truncate()
#         else:
#             edges = db.create_edge_collection(edge_collection)
        
#         # Create or use existing graph
#         if db.has_graph(graph_name):
#             graph = db.graph(graph_name)
#         else:
#             graph = db.create_graph(graph_name)
#             # Define edge definition
#             graph.create_edge_definition(
#                 edge_collection=edge_collection,
#                 from_vertex_collections=[node_collection],
#                 to_vertex_collections=[node_collection]
#             )
        
#         # Prepare nodes for ArangoDB (ensuring unique IDs)
#         node_mapping = {}  # Maps node names to ArangoDB keys
        
#         # Add nodes to ArangoDB
#         print("Adding nodes to ArangoDB...")
#         for node_name, node_attrs in self.graph.nodes(data=True):
#             # Create a sanitized key for ArangoDB
#             key = re.sub(r'[^a-zA-Z0-9_\-]', '_', node_name)
#             node_mapping[node_name] = key
            
#             # Include all attributes and the original node name
#             node_data = {
#                 '_key': key,
#                 'original_name': node_name
#             }
#             node_data.update(node_attrs)
            
#             # Handle special data types for ArangoDB
#             for attr, value in node_data.items():
#                 if isinstance(value, (set, tuple)):
#                     node_data[attr] = list(value)
            
#             # Insert the node
#             nodes.insert(node_data)
        
#         # Add edges to ArangoDB
#         print("Adding edges to ArangoDB...")
#         for src, dst, edge_attrs in self.graph.edges(data=True):
#             # Create edge with proper from/to
#             edge_data = {
#                 '_from': f"{node_collection}/{node_mapping[src]}",
#                 '_to': f"{node_collection}/{node_mapping[dst]}"
#             }
#             edge_data.update(edge_attrs)
            
#             # Handle special data types for ArangoDB
#             for attr, value in edge_data.items():
#                 if isinstance(value, (set, tuple)):
#                     edge_data[attr] = list(value)
            
#             # Insert the edge
#             edges.insert(edge_data)
        
#         print(f"Exported graph to ArangoDB: {len(self.graph.nodes())} nodes and {len(self.graph.edges())} edges.")

#     def query_database(self, url: str, username: str, password: str, db_name: str = "codebase", 
#                       query: str = None) -> List[Dict]:
#         """
#         Execute a query against the ArangoDB database.
        
#         Args:
#             url: ArangoDB server URL
#             username: ArangoDB username
#             password: ArangoDB password
#             db_name: Database name
#             query: AQL query string
            
#         Returns:
#             Query results as a list of dictionaries
#         """
#         client = ArangoClient(hosts=url)
#         db = client.db(db_name, username=username, password=password)
        
#         if query is None:
#             # Default query to get basic statistics
#             query = """
#             RETURN {
#                 "node_count": LENGTH(FOR v IN nodes RETURN v),
#                 "edge_count": LENGTH(FOR e IN edges RETURN e),
#                 "file_count": LENGTH(FOR v IN nodes FILTER v.type == 'file' RETURN v),
#                 "directory_count": LENGTH(FOR v IN nodes FILTER v.type == 'directory' RETURN v),
#                 "symbol_count": LENGTH(FOR v IN nodes FILTER v.type == 'symbol' RETURN v)
#             }
#             """
        
#         cursor = db.aql.execute(query)
#         return [doc for doc in cursor]

#     def export_to_json(self, output_path: str) -> None:
#         """
#         Export the graph data to a JSON file for backup or analysis outside ArangoDB.
        
#         Args:
#             output_path: Path to write the JSON file
#         """
#         data = {
#             "nodes": [],
#             "edges": []
#         }
        
#         # Export nodes
#         for node_name, attrs in self.graph.nodes(data=True):
#             node_data = {"id": node_name}
#             node_data.update(attrs)
#             data["nodes"].append(node_data)
        
#         # Export edges
#         for src, dst, attrs in self.graph.edges(data=True):
#             edge_data = {
#                 "source": src,
#                 "target": dst
#             }
#             edge_data.update(attrs)
#             data["edges"].append(edge_data)
        
#         # Write to file
#         with open(output_path, 'w', encoding='utf-8') as f:
#             json.dump(data, f, indent=2, ensure_ascii=False)
        
#         print(f"Exported graph to JSON file: {output_path}")

#     def analyze_codebase(self) -> Dict[str, any]:
#         """
#         Perform basic analysis on the codebase and return statistics.
        
#         Returns:
#             Dictionary with analysis results
#         """
#         stats = {
#             "total_files": len(self.file_index),
#             "total_directories": len(self.directories),
#             "total_symbols": sum(len(symbols) for symbols in self.module_symbols.values()),
#             "languages": {},
#             "file_sizes": {
#                 "min": float('inf'),
#                 "max": 0,
#                 "avg": 0
#             },
#             "symbol_types": {}
#         }
        
#         # Count files by language
#         for file_path in self.file_index:
#             lang = self._detect_language(file_path)
#             stats["languages"][lang] = stats["languages"].get(lang, 0) + 1
            
#             # Track file sizes
#             file_size = len(self.file_contents.get(file_path, ""))
#             stats["file_sizes"]["min"] = min(stats["file_sizes"]["min"], file_size)
#             stats["file_sizes"]["max"] = max(stats["file_sizes"]["max"], file_size)
        
#         # Calculate average file size
#         if stats["total_files"] > 0:
#             total_size = sum(len(content) for content in self.file_contents.values())
#             stats["file_sizes"]["avg"] = total_size / stats["total_files"]
#         else:
#             stats["file_sizes"]["min"] = 0
        
#         # Count symbols by type
#         for symbols in self.module_symbols.values():
#             for symbol, details in symbols.items():
#                 symbol_type = details.get("type", "unknown")
#                 stats["symbol_types"][symbol_type] = stats["symbol_types"].get(symbol_type, 0) + 1
        
#         return stats

#     def run_workflow(self, code_path: str, arango_url: str, username: str, password: str, 
#                     db_name: str = "codebase") -> Dict:
#         """
#         Run the complete workflow: parse files, build graph, export to ArangoDB, and analyze.
        
#         Args:
#             code_path: Path to the codebase
#             arango_url: ArangoDB server URL
#             username: ArangoDB username
#             password: ArangoDB password
#             db_name: Database name
            
#         Returns:
#             Analysis results
#         """
#         print(f"Processing codebase at: {code_path}")
        
#         # Parse files
#         self.parse_files()
#         print(f"Parsed {len(self.file_index)} files and {len(self.directories)} directories")
        
#         # Build graph
#         self.build_graph()
#         print(f"Built graph with {len(self.graph.nodes())} nodes and {len(self.graph.edges())} edges")
        
#         # Export to ArangoDB
#         self.export_to_arango(
#             url=arango_url,
#             username=username,
#             password=password,
#             db_name=db_name,
#             overwrite=True
#         )
        
#         # Analyze codebase
#         analysis = self.analyze_codebase()
#         print(f"Analysis complete: {analysis}")
        
#         return analysis
    
#     def validate_graph_and_data(self) -> dict:
#         """
#         Validate the parsed data and graph construction.
#         Returns a detailed report on what was found and potential issues.
#         """
#         report = {
#             "files": {
#                 "count": len(self.file_index),
#                 "samples": list(self.file_index.keys())[:5],  # First 5 files
#                 "extensions": {}
#             },
#             "directories": {
#                 "count": len(self.directories),
#                 "samples": list(self.directories)[:5]  # First 5 directories
#             },
#             "symbols": {
#                 "count": sum(len(symbols) for symbols in self.module_symbols.values()),
#                 "by_type": {},
#                 "samples": []
#             },
#             "graph": {
#                 "nodes": self.graph.number_of_nodes(),
#                 "edges": self.graph.number_of_edges(),
#                 "node_types": {},
#                 "edge_types": {}
#             },
#             "possible_issues": []
#         }
        
#         # Check file extensions
#         for file_path in self.file_index:
#             _, ext = os.path.splitext(file_path)
#             ext = ext.lower()
#             report["files"]["extensions"][ext] = report["files"]["extensions"].get(ext, 0) + 1
        
#         # Check for supported extensions
#         supported_exts = []
#         for lang, exts in self.language_extensions.items():
#             supported_exts.extend(exts)
        
#         if set(report["files"]["extensions"].keys()).isdisjoint(supported_exts):
#             report["possible_issues"].append("No files with supported extensions found.")
        
#         # Check symbol types
#         for file_path, symbols in self.module_symbols.items():
#             for symbol_name, details in symbols.items():
#                 symbol_type = details['type']
#                 report["symbols"]["by_type"][symbol_type] = report["symbols"]["by_type"].get(symbol_type, 0) + 1
                
#                 if len(report["symbols"]["samples"]) < 5:
#                     report["symbols"]["samples"].append({
#                         "name": symbol_name,
#                         "file": file_path,
#                         "type": symbol_type,
#                         "line": details['line_no']
#                     })
        
#         # Check graph node and edge types
#         for _, data in self.graph.nodes(data=True):
#             node_type = data.get('type', 'unknown')
#             report["graph"]["node_types"][node_type] = report["graph"]["node_types"].get(node_type, 0) + 1
        
#         for _, _, data in self.graph.edges(data=True):
#             edge_type = data.get('edge_type', 'unknown')
#             report["graph"]["edge_types"][edge_type] = report["graph"]["edge_types"].get(edge_type, 0) + 1
        
#         # Check if nodes match files and directories
#         if report["graph"]["node_types"].get("file", 0) != report["files"]["count"]:
#             report["possible_issues"].append(
#                 f"Mismatch between file count ({report['files']['count']}) and file nodes in graph ({report['graph']['node_types'].get('file', 0)})"
#             )
        
#         if report["graph"]["node_types"].get("directory", 0) != report["directories"]["count"]:
#             report["possible_issues"].append(
#                 f"Mismatch between directory count ({report['directories']['count']}) and directory nodes in graph ({report['graph']['node_types'].get('directory', 0)})"
#             )
        
#         # Check if symbols have corresponding nodes
#         symbol_count = report["symbols"]["count"]
#         symbol_nodes = report["graph"]["node_types"].get("symbol", 0)
#         if symbol_count != symbol_nodes:
#             report["possible_issues"].append(
#                 f"Mismatch between symbol count ({symbol_count}) and symbol nodes in graph ({symbol_nodes})"
#             )
        
#         # Validate directory structure
#         if report["directories"]["count"] > 0 and report["files"]["count"] > 0:
#             # Check if files are connected to their directories
#             contains_file_edges = report["graph"]["edge_types"].get("contains_file", 0)
#             if contains_file_edges < report["files"]["count"]:
#                 report["possible_issues"].append(
#                     f"Some files may not be properly connected to their directories ({contains_file_edges} edges for {report['files']['count']} files)"
#                 )
        
#         return report

In [None]:
# from pyvis.network import Network

# def visualize_codebase_graph(codebase_path, output_html="codebase_graph.html", limit_nodes=None):
#     """
#     Visualize the codebase graph using pyvis.
    
#     Args:
#         codebase_path: Path to the codebase directory
#         output_html: Output HTML file for the visualization/
#         limit_nodes: Optional limit on the number of nodes to display (for large codebases)
#     """
#     # Initialize and build the graph
#     visualizer = CodebaseVisualizer(codebase_path)
#     visualizer.parse_files()
#     graph = visualizer.build_graph()
    
#     # Export graph to JSON (optional)
#     visualizer.export_graph_json('codebase_graph.json')
    
#     # Create a pyvis network
#     net = Network(height="900px", width="100%", bgcolor="#222222", font_color="white")
    
#     # Configure physics
#     net.barnes_hut(gravity=-5000, central_gravity=0.3, spring_length=200)
    
#     # Define node groups and colors
#     node_colors = {
#         'file': '#4287f5',
#         'directory': '#42f5a7',
#         'symbol': '#f542cb',
#         'snippet': '#f5a742'
#     }
    
#     # If we need to limit nodes for performance
#     if limit_nodes and len(graph.nodes()) > limit_nodes:
#         # Focus on file and directory nodes, and limit symbol nodes
#         important_nodes = [node for node, data in graph.nodes(data=True) 
#                           if data.get('type') in ['file', 'directory']]
        
#         # Add some symbol nodes to reach the limit
#         symbols = [node for node, data in graph.nodes(data=True) 
#                   if data.get('type') == 'symbol']
        
#         # Take a subset of symbols based on connectivity
#         symbol_importance = sorted(
#             [(node, graph.degree(node)) for node in symbols],
#             key=lambda x: x[1],
#             reverse=True
#         )
        
#         important_symbols = [node for node, _ in symbol_importance[:limit_nodes - len(important_nodes)]]
#         selected_nodes = important_nodes + important_symbols
        
#         # Create a subgraph
#         graph = graph.subgraph(selected_nodes)
    
#     # Add nodes with appropriate styles
#     for node, node_data in graph.nodes(data=True):
#         node_type = node_data.get('type', 'unknown')
#         label = os.path.basename(node) if '/' in node else node
        
#         # Truncate very long labels
#         if len(label) > 30:
#             label = label[:27] + "..."
        
#         # Create hover title with more details
#         title = f"<div style='max-width:300px;'>"
#         title += f"<b>{node}</b><br>"
#         title += f"Type: {node_type}<br>"
        
#         if node_type == 'file':
#             title += f"Directory: {node_data.get('directory', 'N/A')}<br>"
            
#         elif node_type == 'symbol':
#             title += f"Symbol type: {node_data.get('symbol_type', 'N/A')}<br>"
#             title += f"Line: {node_data.get('line_number', 'N/A')}<br>"
            
#             if node_data.get('docstring'):
#                 docstring = node_data['docstring']
#                 if len(docstring) > 200:
#                     docstring = docstring[:197] + "..."
#                 title += f"Docstring: {docstring}<br>"
                
#         elif node_type == 'snippet':
#             title += f"Lines: {node_data.get('start_line', 'N/A')}-{node_data.get('end_line', 'N/A')}<br>"
            
#             if node_data.get('code_snippet'):
#                 snippet = node_data['code_snippet'].replace('\n', '<br>')
#                 if len(snippet) > 300:
#                     snippet = snippet[:297] + "..."
#                 title += f"Code:<br><pre>{snippet}</pre>"
                
#         title += "</div>"
        
#         # Determine node size based on type and connections
#         size = 15  # Default size
#         if node_type == 'directory':
#             size = 25
#         elif node_type == 'file':
#             size = 20
#         elif node_type == 'symbol' and node_data.get('symbol_type') == 'class':
#             size = 18
        
#         # Add node with appropriate styling
#         net.add_node(
#             node, 
#             label=label, 
#             title=title,
#             color=node_colors.get(node_type, '#999999'),
#             size=size,
#             shape='dot' if node_type != 'directory' else 'diamond'
#         )
    
#     # Add edges with appropriate styles
#     for source, target, edge_data in graph.edges(data=True):
#         edge_type = edge_data.get('edge_type', 'unknown')
        
#         # Style edges differently based on type
#         if edge_type == 'import':
#             color = '#f5f542'
#             width = 2
#             dashes = False
#         elif edge_type == 'references':
#             color = '#f54242'
#             width = 1
#             dashes = [5, 5]
#         elif edge_type == 'defines':
#             color = '#42f55a'
#             width = 3
#             dashes = False
#         elif edge_type == 'contains_snippet':
#             color = '#42c8f5'
#             width = 1
#             dashes = [2, 2]
#         else:
#             color = '#999999'
#             width = 1
#             dashes = False
        
#         # Create hover title with edge details
#         title = f"<div><b>{edge_type}</b><br>"
        
#         if edge_data.get('line_number'):
#             title += f"Line: {edge_data['line_number']}<br>"
            
#         if edge_data.get('context'):
#             context = edge_data['context']
#             if len(context) > 200:
#                 context = context[:197] + "..."
#             title += f"Context: {context}"
            
#         title += "</div>"
        
#         # Add edge with styling
#         net.add_edge(
#             source, 
#             target, 
#             title=title,
#             color=color,
#             width=width,
#             dashes=dashes
#         )
    
#     # Enable physics, navigation and interaction options
#     net.toggle_physics(True)
#     net.show_buttons(filter_=['physics'])
    
#     # Save the visualization
#     net.save_graph(output_html)
#     print(f"Graph visualization saved to {output_html}")
    
#     return output_html

# def visualize_symbol_subgraph(visualizer, symbol_name, output_html="symbol_graph.html"):
#     """
#     Create a focused visualization of a symbol and its relationships.
    
#     Args:
#         visualizer: Initialized CodebaseVisualizer instance
#         symbol_name: Name of the symbol to visualize
#         output_html: Output HTML file for the visualization
#     """
#     # Get the full graph
#     full_graph = visualizer.graph
    
#     # Find all nodes related to this symbol
#     symbol_nodes = [node for node in full_graph.nodes() if f"::{symbol_name}" in node]
    
#     if not symbol_nodes:
#         print(f"Symbol '{symbol_name}' not found in the graph.")
#         return None
    
#     # Get nodes that are connected to symbol nodes (1-hop neighborhood)
#     related_nodes = set(symbol_nodes)
#     for node in symbol_nodes:
#         # Add predecessors (nodes that reference this symbol)
#         related_nodes.update(full_graph.predecessors(node))
#         # Add successors (nodes that this symbol references)
#         related_nodes.update(full_graph.successors(node))
    
#     # Create a subgraph
#     subgraph = full_graph.subgraph(related_nodes)
    
#     # Create a pyvis network
#     net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")
#     net.barnes_hut(gravity=-2000, central_gravity=0.3, spring_length=150)
    
#     # Node colors by type
#     node_colors = {
#         'file': '#4287f5',
#         'directory': '#42f5a7',
#         'symbol': '#f542cb',
#         'snippet': '#f5a742'
#     }
    
#     # Add nodes with appropriate styles
#     for node, node_data in subgraph.nodes(data=True):
#         node_type = node_data.get('type', 'unknown')
#         label = os.path.basename(node) if '/' in node else node
        
#         # Make the focus symbol nodes larger and highlighted
#         if node in symbol_nodes:
#             size = 30
#             color = '#ff0000'  # Bright red for focus
#         else:
#             size = 15
#             color = node_colors.get(node_type, '#999999')
        
#         # Add hover information
#         title = f"<div style='max-width:300px;'>"
#         title += f"<b>{node}</b><br>"
#         title += f"Type: {node_type}<br>"
        
#         if node_type == 'symbol':
#             title += f"Symbol type: {node_data.get('symbol_type', 'N/A')}<br>"
#             if node_data.get('docstring'):
#                 title += f"Docstring: {node_data.get('docstring', '')}<br>"
                
#         title += "</div>"
        
#         # Add node with styling
#         net.add_node(
#             node,
#             label=label,
#             title=title,
#             color=color,
#             size=size
#         )
    
#     # Add edges with styles
#     for source, target, edge_data in subgraph.edges(data=True):
#         edge_type = edge_data.get('edge_type', 'unknown')
        
#         # Style edges by type
#         if edge_type == 'import':
#             color = '#f5f542'  # Yellow
#         elif edge_type == 'references':
#             color = '#f54242'  # Red
#         elif edge_type == 'defines':
#             color = '#42f55a'  # Green
#         else:
#             color = '#999999'  # Gray
        
#         # Add edge with details in hover
#         title = f"{edge_type}"
#         if edge_data.get('line_number'):
#             title += f" (line {edge_data['line_number']})"
            
#         net.add_edge(source, target, title=title, color=color)
    
#     # Enable physics and navigation
#     net.toggle_physics(True)
#     net.show_buttons(filter_=['physics'])
    
#     # Save the visualization
#     net.save_graph(output_html)
#     print(f"Symbol graph visualization saved to {output_html}")
    
#     return output_html


In [None]:
# # Visualize the entire codebase graph (with node limit for performance)
# visualize_codebase_graph("directory_name", limit_nodes=200)

# # Visualize a specific symbol (like "symbol_name")
# visualizer = CodebaseVisualizer("directory_name")
# visualizer.parse_files()
# G = visualizer.build_graph()