# Scopium : Expanding the scope of your codebase

### Features of Scopium:
- Automatic chunking and storage of the entire codebase based on the relationships between files (imports, directory levels, symbol types)
- Efficient retrieval system designed around a hybrid approach

### Dataset - The codebase needed by the user
- When the root directory of the codebase is given as the argument, the codebase is converted into a NetworkX graph that captures all of the relationships mentioned above.
- This graph is then loaded into ArangoDB.

### Step0 - Installs and imports

In [None]:
!pip install nx-arangodb
!pip install nx-cugraph-cu12 --extra-index-url https://pypi.nvidia.com # Requires CUDA-capable GPU
!pip install --upgrade langchain langchain-community langchain-openai langgraph langchain_mistralai
!pip install networkx==3.4
!pip install tree-sitter
!git clone https://github.com/tree-sitter/tree-sitter-cpp

#### Imports:

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import random
import os
import ast
from typing import Dict, Set, List, Tuple, Optional,Any
import json
from arango import ArangoClient
import nx_arangodb as nxadb
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_openai import ChatOpenAI
from langchain_community.graphs import ArangoGraph
from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain
from langchain_core.tools import tool
from langchain_mistralai import ChatMistralAI
import glob
import re

#### Building the graph - 
- This code takes the root directory of the codebase as the input. 
- It then builds the graph from the features mentioned above, storing them as the appropriate nodes and edges.

In [None]:
# Define global data structures
def initialize_data_structures():
    """Create the empty state dictionary that every analysis step reads and mutates.

    Returns:
        dict: A fresh mapping holding the root directory, an empty directed
        graph, the per-file caches (contents, imports, symbols, references),
        the file index counter and the language/extension tables.
    """
    # Extension table first; the supported-language list is derived from it
    # so both stay in the same order.
    extensions_by_language = {
        "python": [".py"],
        "cpp": [".c", ".cpp", ".h", ".hpp", ".cc", ".cxx", ".hxx"],
        "java": [".java"],
        "go": [".go"],
    }

    return {
        'root_dir': '',
        'graph': nx.DiGraph(),
        'file_contents': {},       # file -> content
        'import_relations': {},    # file -> [(module, line_no)]
        'module_symbols': {},      # file -> {symbol -> {type, line_no, context}}
        'symbol_references': {},   # symbol -> [(file, line_no, context)]
        'file_index': {},          # file -> numeric index
        'current_index': 0,
        'directories': set(),
        'symbol_index': {},        # symbol -> [{file, type, line_no, context}]
        'supported_languages': list(extensions_by_language),
        'language_extensions': extensions_by_language,
    }

In [None]:
def get_next_index(data):
    """Advance the running file counter kept in ``data`` and return its new value."""
    next_index = data['current_index'] + 1
    data['current_index'] = next_index
    return next_index

def chunk_code(code, lines_per_chunk=20):
    """Split source text into consecutive fixed-size line windows.

    Args:
        code: The full text to split.
        lines_per_chunk: Maximum number of lines per window.

    Returns:
        A list of dicts, one per window, each with 'code_snippet' (the joined
        lines), 'start_line' and 'end_line' (both 1-based, inclusive).
    """
    all_lines = code.splitlines()
    total = len(all_lines)
    return [
        {
            'code_snippet': '\n'.join(all_lines[first:first + lines_per_chunk]),
            'start_line': first + 1,
            # The last window may be shorter than lines_per_chunk.
            'end_line': min(first + lines_per_chunk, total),
        }
        for first in range(0, total, lines_per_chunk)
    ]

def get_context_around_line(data, file_path, line_no, context_lines=3):
    """Return the lines surrounding a 1-based line number of a cached file.

    Args:
        data: State dict; only ``data['file_contents']`` is read.
        file_path: Key of the file in ``data['file_contents']``.
        line_no: 1-based line number at the centre of the window.
        context_lines: Number of lines to include on each side.

    Returns:
        The window joined with newlines, or "" when the file is not cached.
    """
    try:
        text = data['file_contents'][file_path]
    except KeyError:
        return ""

    file_lines = text.splitlines()
    # Clamp the window to the file boundaries.
    first = max(line_no - context_lines - 1, 0)
    last = min(line_no + context_lines, len(file_lines))
    return "\n".join(file_lines[first:last])

def detect_language(data, file_path):
    """Map a file path to a language name using its (case-insensitive) extension.

    Returns the first language in ``data['language_extensions']`` whose
    extension list contains the file's suffix, or "unknown" if none does.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    return next(
        (
            language
            for language, extensions in data['language_extensions'].items()
            if suffix in extensions
        ),
        "unknown",
    )

In [None]:
def extract_python_node_source(source, node):
    """Return the full source lines spanned by an AST node.

    Args:
        source: The text the node was parsed from.
        node: Any object; only nodes exposing both ``lineno`` and
            ``end_lineno`` yield text.

    Returns:
        The lines from ``lineno`` through ``end_lineno`` joined with newlines,
        or "" when the node has no position info or anything goes wrong.
    """
    try:
        has_span = hasattr(node, 'lineno') and hasattr(node, 'end_lineno')
        if not has_span:
            return ""
        source_lines = source.splitlines()
        first = node.lineno - 1  # lineno is 1-based
        return '\n'.join(source_lines[first:node.end_lineno])
    except Exception:
        # Best effort: a malformed node or source simply yields no context.
        return ""

def analyze_python_file(data, file_path, content):
    """Record the imports and defined symbols of a Python source file.

    The file is parsed with ``ast`` and every node in the tree is visited
    (nested definitions included). On success two entries are written:

    * ``data['import_relations'][file_path]``: list of ``(name, line_no)``;
      ``from m import n`` is stored as ``"m.n"`` (or just ``"n"`` when the
      statement has no module part).
    * ``data['module_symbols'][file_path]``: ``symbol -> details`` where
      details has 'type' ('class', 'function' or 'variable'), 'line_no' and
      'context', plus 'docstring' for classes and functions. A later
      definition of the same name overwrites an earlier one.

    If parsing or analysis fails, the error is printed and nothing is stored
    for this file.

    Args:
        data: Shared state dict to update.
        file_path: Key under which results are stored.
        content: Python source text.
    """
    try:
        tree = ast.parse(content)
        imports = []
        symbols = {}

        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                imports.extend((alias.name, node.lineno) for alias in node.names)

            elif isinstance(node, ast.ImportFrom):
                module = node.module or ''
                for alias in node.names:
                    qualified = f"{module}.{alias.name}" if module else alias.name
                    imports.append((qualified, node.lineno))

            # ``async def`` is a distinct node type and must be listed
            # explicitly, otherwise coroutines are silently dropped.
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                symbols[node.name] = {
                    'type': 'class' if isinstance(node, ast.ClassDef) else 'function',
                    'line_no': node.lineno,
                    'context': extract_python_node_source(content, node),
                    'docstring': ast.get_docstring(node),
                }

            elif isinstance(node, (ast.Assign, ast.AnnAssign)):
                # Plain assignments may have several targets (a = b = 1);
                # annotated assignments (x: int = 1) always have exactly one.
                targets = node.targets if isinstance(node, ast.Assign) else [node.target]
                for target in targets:
                    # Only simple names; attribute/subscript/tuple targets are skipped.
                    if isinstance(target, ast.Name):
                        symbols[target.id] = {
                            'type': 'variable',
                            'line_no': node.lineno,
                            'context': extract_python_node_source(content, node),
                        }

        data['import_relations'][file_path] = imports
        data['module_symbols'][file_path] = symbols

    except Exception as e:
        # Best effort: one unparsable file must not abort the whole scan.
        print(f"Error analyzing Python file {file_path}: {e}")

In [None]:
def analyze_cpp_file(data, file_path, content):
    """Record the includes and (heuristically detected) symbols of a C/C++ file.

    The file is scanned line by line with regular expressions; this is a
    lightweight approximation, not a parser. Results are stored in
    ``data['import_relations'][file_path]`` as ``(header, line_no)`` tuples and
    in ``data['module_symbols'][file_path]`` as
    ``symbol -> {'type', 'line_no', 'context'}`` with type 'class' (also used
    for structs), 'function' or 'namespace'. A later match for the same name
    overwrites an earlier one.

    Args:
        data: Shared state dict to update.
        file_path: Key under which results are stored; also used to fetch
            context lines from ``data['file_contents']``.
        content: Source text of the file.
    """
    imports = []
    symbols = {}

    # Whitespace is optional both after '#' and before the header name:
    # '#include<vector>' and '#  include "x.h"' are valid directives.
    include_pattern = re.compile(r'#\s*include\s*[<"]([^>"]+)[>"]')
    # The lookbehinds skip 'class'/'struct' that directly follows '<' or ','
    # so template parameters (template <class T, class U>) are not recorded
    # as class definitions.
    class_pattern = re.compile(r'(?<![<,])(?<![<,]\s)\b(?:class|struct)\s+(\w+)')
    function_pattern = re.compile(r'(\w+)\s*\([^)]*\)\s*(?:const|override|final|noexcept)?\s*(?:{|;)')
    namespace_pattern = re.compile(r'\bnamespace\s+(\w+)')

    # Control-flow keywords that look like calls to the function regex.
    not_functions = ('if', 'while', 'for', 'switch', 'return')

    for line_no, line in enumerate(content.splitlines(), 1):
        stripped = line.strip()

        include_match = include_pattern.search(line)
        if include_match:
            imports.append((include_match.group(1), line_no))

        class_match = class_pattern.search(line)
        if class_match:
            symbols[class_match.group(1)] = {
                'type': 'class',
                'line_no': line_no,
                'context': get_context_around_line(data, file_path, line_no),
            }

        # Function definitions/declarations (simplified); preprocessor and
        # '//' comment lines are ignored.
        function_match = function_pattern.search(line)
        if function_match and not stripped.startswith(('#', '//')):
            function_name = function_match.group(1)
            if function_name not in not_functions:
                symbols[function_name] = {
                    'type': 'function',
                    'line_no': line_no,
                    'context': get_context_around_line(data, file_path, line_no),
                }

        # 'using namespace std;' uses a namespace, it does not define one.
        namespace_match = namespace_pattern.search(line)
        if namespace_match and not stripped.startswith('using'):
            symbols[namespace_match.group(1)] = {
                'type': 'namespace',
                'line_no': line_no,
                'context': get_context_around_line(data, file_path, line_no),
            }

    data['import_relations'][file_path] = imports
    data['module_symbols'][file_path] = symbols

def analyze_java_file(data, file_path, content):
    """Record the imports and (heuristically detected) symbols of a Java file.

    The file is scanned line by line with regular expressions; this is a
    lightweight approximation, not a parser. Results are stored in
    ``data['import_relations'][file_path]`` as ``(name, line_no)`` tuples (the
    file's own ``package`` declaration is recorded there as well) and in
    ``data['module_symbols'][file_path]`` as
    ``symbol -> {'type', 'line_no', 'context'}`` with type 'class',
    'interface' or 'method'. A later match for the same name overwrites an
    earlier one.

    Args:
        data: Shared state dict to update.
        file_path: Key under which results are stored; also used to fetch
            context lines from ``data['file_contents']``.
        content: Source text of the file.
    """
    imports = []
    symbols = {}

    package_pattern = re.compile(r'package\s+([\w.]+)')
    # Dotted name matched component by component so a trailing '.*' wildcard
    # is kept intact ('java.util.*', not 'java.util.'), and the optional
    # 'static' modifier is skipped instead of being captured as the name.
    import_pattern = re.compile(r'import\s+(?:static\s+)?(\w+(?:\.\w+)*(?:\.\*)?)')
    class_pattern = re.compile(r'(?:public|private|protected)?\s*(?:abstract|final)?\s*class\s+(\w+)')
    interface_pattern = re.compile(r'(?:public|private|protected)?\s*interface\s+(\w+)')
    method_pattern = re.compile(r'(?:public|private|protected)?\s*(?:static|final|abstract)?\s*(?:[\w<>[\],\s]+)\s+(\w+)\s*\([^)]*\)')

    # Control-flow keywords that look like methods to the method regex.
    not_methods = ('if', 'while', 'for', 'switch', 'return')

    for line_no, line in enumerate(content.splitlines(), 1):
        package_match = package_pattern.search(line)
        if package_match:
            imports.append((package_match.group(1), line_no))

        import_match = import_pattern.search(line)
        if import_match:
            imports.append((import_match.group(1), line_no))

        class_match = class_pattern.search(line)
        if class_match:
            symbols[class_match.group(1)] = {
                'type': 'class',
                'line_no': line_no,
                'context': get_context_around_line(data, file_path, line_no),
            }

        interface_match = interface_pattern.search(line)
        if interface_match:
            symbols[interface_match.group(1)] = {
                'type': 'interface',
                'line_no': line_no,
                'context': get_context_around_line(data, file_path, line_no),
            }

        method_match = method_pattern.search(line)
        if method_match:
            method_name = method_match.group(1)
            if method_name not in not_methods:
                symbols[method_name] = {
                    'type': 'method',
                    'line_no': line_no,
                    'context': get_context_around_line(data, file_path, line_no),
                }

    data['import_relations'][file_path] = imports
    data['module_symbols'][file_path] = symbols

def analyze_go_file(data, file_path, content):
    """Record the imports and (heuristically detected) symbols of a Go file.

    The file is scanned line by line with regular expressions; this is a
    lightweight approximation, not a parser. Results are stored in
    ``data['import_relations'][file_path]`` as ``(name, line_no)`` tuples (the
    ``package`` clause is recorded there as ``"package <name>"``) and in
    ``data['module_symbols'][file_path]`` as
    ``symbol -> {'type', 'line_no', 'context'}`` with type 'function',
    'struct' or 'interface'. A later match for the same name overwrites an
    earlier one.

    Args:
        data: Shared state dict to update.
        file_path: Key under which results are stored; also used to fetch
            context lines from ``data['file_contents']``.
        content: Source text of the file.
    """
    imports = []
    symbols = {}

    # Declarations are anchored to the start of the line so that prose such
    # as '// this package does X' is not mistaken for a package clause.
    package_pattern = re.compile(r'^\s*package\s+(\w+)')
    # Optional alias before the path: import f "fmt", import _ "x", import . "y".
    import_single_pattern = re.compile(r'^\s*import\s+(?:[\w.]+\s+)?"([^"]+)"')
    import_multi_start_pattern = re.compile(r'^\s*import\s*\(')
    import_multi_line_pattern = re.compile(r'"([^"]+)"')
    func_pattern = re.compile(r'func\s+(?:\([^)]+\)\s+)?(\w+)')
    struct_pattern = re.compile(r'type\s+(\w+)\s+struct')
    interface_pattern = re.compile(r'type\s+(\w+)\s+interface')

    in_import_block = False

    for line_no, line in enumerate(content.splitlines(), 1):
        stripped = line.strip()

        package_match = package_pattern.match(line)
        if package_match:
            imports.append((f"package {package_match.group(1)}", line_no))

        import_match = import_single_pattern.match(line)
        if import_match:
            imports.append((import_match.group(1), line_no))

        # Parenthesised import block: 'import (' ... ')'.
        if import_multi_start_pattern.match(line):
            in_import_block = True
            continue

        if in_import_block:
            # Accept ')' followed by a trailing comment; otherwise the block
            # would never close and later string literals would be recorded
            # as imports.
            if stripped.startswith(')'):
                in_import_block = False
                continue

            # Commented-out entries inside the block are not imports.
            if not stripped.startswith('//'):
                import_line_match = import_multi_line_pattern.search(line)
                if import_line_match:
                    imports.append((import_line_match.group(1), line_no))

        func_match = func_pattern.search(line)
        if func_match:
            symbols[func_match.group(1)] = {
                'type': 'function',
                'line_no': line_no,
                'context': get_context_around_line(data, file_path, line_no),
            }

        struct_match = struct_pattern.search(line)
        if struct_match:
            symbols[struct_match.group(1)] = {
                'type': 'struct',
                'line_no': line_no,
                'context': get_context_around_line(data, file_path, line_no),
            }

        interface_match = interface_pattern.search(line)
        if interface_match:
            symbols[interface_match.group(1)] = {
                'type': 'interface',
                'line_no': line_no,
                'context': get_context_around_line(data, file_path, line_no),
            }

    data['import_relations'][file_path] = imports
    data['module_symbols'][file_path] = symbols

In [None]:
def analyze_file(data, file_path, content, language):
    """Run the language-specific import/symbol analyzer for one file.

    Files whose language is not one of "python", "cpp", "java" or "go" are
    ignored.
    """
    if language not in ("python", "cpp", "java", "go"):
        return

    analyzers = {
        "python": analyze_python_file,
        "cpp": analyze_cpp_file,
        "java": analyze_java_file,
        "go": analyze_go_file,
    }
    analyzers[language](data, file_path, content)

In [None]:
def find_references_in_python_file(data, file_path, content):
    """Collect every name and attribute that a Python file reads.

    Each ``ast.Name`` / ``ast.Attribute`` node used in a load context is
    appended to ``data['symbol_references'][name]`` as
    ``(file_path, line_no, context)``. Failures are printed, not raised.
    """
    try:
        for node in ast.walk(ast.parse(content)):
            # Plain names contribute their identifier, attribute accesses
            # (obj.method) contribute the attribute name.
            if isinstance(node, ast.Name):
                referenced = node.id
            elif isinstance(node, ast.Attribute):
                referenced = node.attr
            else:
                continue

            # Only reads count as references; stores and deletes do not.
            if not isinstance(node.ctx, ast.Load):
                continue

            bucket = data['symbol_references'].setdefault(referenced, [])
            snippet = get_context_around_line(data, file_path, node.lineno)
            bucket.append((file_path, node.lineno, snippet))

    except Exception as e:
        print(f"Error finding references in Python file {file_path}: {e}")

# One identifier-like token: a maximal run of word characters.
_IDENTIFIER_TOKEN = re.compile(r'\w+')


def _find_symbol_references(data, file_path, content, skipped_prefixes):
    """Scan a file line by line for whole-word uses of any known symbol.

    Shared implementation behind the Go, C/C++ and Java reference finders,
    which differ only in which line prefixes they ignore.

    Every symbol name defined in any analyzed file
    (``data['module_symbols']``) that is longer than two characters is a
    candidate. For each line that is neither skipped nor a definition line of
    this file, each candidate occurring as a whole word gets one
    ``(file_path, line_no, context)`` tuple appended to
    ``data['symbol_references'][symbol]``.

    Args:
        data: Shared state dict; reads 'module_symbols', updates
            'symbol_references'.
        file_path: File being scanned (relative path used as key).
        content: Source text of the file.
        skipped_prefixes: Tuple of prefixes; a line whose stripped text starts
            with any of them is ignored (comments, imports, directives).
    """
    # Names of one or two characters would match far too many unrelated tokens.
    candidate_symbols = {
        name
        for symbols_dict in data['module_symbols'].values()
        for name in symbols_dict
        if len(name) > 2
    }
    if not candidate_symbols or not content:
        return

    # Lines on which this file defines a symbol are definitions, not references.
    definition_lines = {
        details['line_no']
        for details in data['module_symbols'].get(file_path, {}).values()
    }

    for line_no, line in enumerate(content.splitlines(), 1):
        if line_no in definition_lines or line.strip().startswith(skipped_prefixes):
            continue

        # Tokenise the line once and intersect with the candidate set instead
        # of running one regex search per known symbol per line. For names
        # made only of word characters (what the regex analyzers in this file
        # capture) this is the same as a r'\bname\b' search.
        # NOTE(review): a Python identifier containing combining marks is not
        # a single \w+ token and would no longer be matched here.
        matched = dict.fromkeys(
            token for token in _IDENTIFIER_TOKEN.findall(line)
            if token in candidate_symbols
        )
        if not matched:
            continue

        context = get_context_around_line(data, file_path, line_no)
        for symbol_name in matched:
            data['symbol_references'].setdefault(symbol_name, []).append(
                (file_path, line_no, context)
            )


def find_references_in_go_file(data, file_path, content):
    """Find references to known symbols in a Go file.

    Lines starting with '//', '/*', 'import ' or 'package ' are ignored.
    """
    _find_symbol_references(data, file_path, content, ("//", "/*", "import ", "package "))


def find_references_in_cpp_file(data, file_path, content):
    """Find references to known symbols in a C/C++ file.

    Lines starting with '//', '/*' or '#' (preprocessor directives) are ignored.
    """
    _find_symbol_references(data, file_path, content, ("//", "/*", "#"))


def find_references_in_java_file(data, file_path, content):
    """Find references to known symbols in a Java file.

    Lines starting with '//', '/*', 'import ' or 'package ' are ignored.
    """
    _find_symbol_references(data, file_path, content, ("//", "/*", "import ", "package "))

def find_references_in_file(data, file_path, content, language):
    """Run the language-specific reference finder for one file.

    Files whose language is not one of "python", "cpp", "java" or "go" are
    ignored.
    """
    if language not in ("python", "cpp", "java", "go"):
        return

    finders = {
        "python": find_references_in_python_file,
        "cpp": find_references_in_cpp_file,
        "java": find_references_in_java_file,
        "go": find_references_in_go_file,
    }
    finders[language](data, file_path, content)

In [None]:
def build_symbol_index(data):
    """Rebuild ``data['symbol_index']`` from the definitions and references.

    The index maps each symbol name to a list of entries. Definition entries
    (from ``data['module_symbols']``) come first and carry 'file', 'type'
    ('definition'), 'symbol_type', 'line_no', 'context' and 'docstring'.
    Reference entries (from ``data['symbol_references']``) follow and carry
    'file', 'type' ('reference'), 'line_no' and 'context'. A reference located
    on the exact file and line of one of the symbol's definitions is dropped.

    Args:
        data: Shared state dict; 'symbol_index' is replaced.
    """
    index = data['symbol_index'] = {}
    # symbol -> {(file, line_no)} of its definitions. Lets the reference pass
    # test for "is this the definition line?" in O(1) instead of rescanning
    # the entry list for every reference.
    definition_sites = {}

    for file_path, symbols in data['module_symbols'].items():
        for symbol_name, details in symbols.items():
            index.setdefault(symbol_name, []).append({
                'file': file_path,
                'type': 'definition',
                'symbol_type': details['type'],
                'line_no': details['line_no'],
                'context': details.get('context', ''),
                'docstring': details.get('docstring', ''),
            })
            definition_sites.setdefault(symbol_name, set()).add(
                (file_path, details['line_no'])
            )

    for symbol_name, references in data['symbol_references'].items():
        entries = index.setdefault(symbol_name, [])
        own_definitions = definition_sites.get(symbol_name, ())

        for file_path, line_no, context in references:
            if (file_path, line_no) in own_definitions:
                continue
            entries.append({
                'file': file_path,
                'type': 'reference',
                'line_no': line_no,
                'context': context,
            })

def parse_files(data):
    """Walk ``data['root_dir']``, index supported files and analyze them.

    Pass 1 adds a graph node for every sub-directory and every file of a
    supported language, links them with 'contains_directory' /
    'contains_file' edges, caches each file's text and runs the per-language
    analyzer. Pass 2 searches every cached file for symbol references.
    Finally the symbol index is rebuilt. All paths stored in ``data`` are
    relative to the root directory.
    """
    base = data['root_dir']
    graph = data['graph']

    # Pass 1: directories, files, contents, imports and symbols.
    for current_dir, _subdirs, file_names in os.walk(base):
        rel_dir = os.path.relpath(current_dir, base)
        below_root = rel_dir != '.'

        if below_root:
            data['directories'].add(rel_dir)
            graph.add_node(rel_dir, type='directory')

            # Top-level directories have no parent node to link from.
            parent_dir = os.path.dirname(rel_dir)
            if parent_dir and parent_dir != '.':
                graph.add_edge(parent_dir, rel_dir, edge_type='contains_directory')

        for file_name in file_names:
            file_path = os.path.join(current_dir, file_name)
            rel_path = os.path.relpath(file_path, base)
            file_language = detect_language(data, file_path)

            if file_language not in data['supported_languages']:
                continue

            file_idx = get_next_index(data)
            data['file_index'][rel_path] = file_idx
            graph.add_node(rel_path, type='file', file_index=file_idx, language=file_language)

            if below_root:
                graph.add_edge(rel_dir, rel_path, edge_type='contains_file')

            try:
                # Undecodable bytes are dropped rather than failing the file.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                data['file_contents'][rel_path] = content
                analyze_file(data, rel_path, content, file_language)
            except Exception as e:
                print(f"Error parsing {file_path}: {e}")

    # Pass 2: references can only be resolved once every file's symbols are known.
    for rel_path, content in data['file_contents'].items():
        find_references_in_file(data, rel_path, content, detect_language(data, rel_path))

    build_symbol_index(data)

In [None]:
def _ends_with_parts(parts, suffix):
    """Return True when the list ``parts`` ends with the non-empty list ``suffix``."""
    return bool(suffix) and parts[-len(suffix):] == suffix


def _resolve_import_node(imp, importer_language, target_file, target_language, symbols):
    """Return the graph node inside ``target_file`` that an import points at, or None.

    Resolution is heuristic (no search path is known):

    * any language: ``imp`` equal to a symbol defined in the target links to
      that symbol node;
    * Python importing Python: the dotted name is compared component-wise
      with the tail of the target's path (``pkg.mod`` matches ``.../pkg/mod.py``
      and ``.../pkg/mod/__init__.py``); for ``pkg.mod.name`` the last component
      is treated as a name imported from ``pkg.mod`` and links to that symbol
      when the target defines it, otherwise to the file;
    * Java importing Java: the last component must equal the target's file
      stem; a wildcard import matches every file whose directory path ends
      with the package path.
    """
    if imp in symbols:
        return f"{target_file}::{imp}"
    if importer_language != target_language:
        return None

    imp_parts = imp.split('.')
    stem_parts = os.path.splitext(target_file)[0].split(os.sep)

    if importer_language == "python":
        # A package is imported under its directory name, not '__init__'.
        if stem_parts[-1] == '__init__':
            stem_parts = stem_parts[:-1]
        if _ends_with_parts(stem_parts, imp_parts):
            return target_file
        if _ends_with_parts(stem_parts, imp_parts[:-1]):
            name = imp_parts[-1]
            return f"{target_file}::{name}" if name in symbols else target_file
        return None

    if importer_language == "java":
        # '' covers wildcard imports that were captured as 'pkg.' (trailing dot).
        if imp_parts[-1] in ('*', ''):
            return target_file if _ends_with_parts(stem_parts[:-1], imp_parts[:-1]) else None
        return target_file if stem_parts[-1] == imp_parts[-1] else None

    return None


def build_graph(data):
    """Populate ``data['graph']`` with every node and edge derived from parsing.

    Adds, on top of whatever nodes already exist:

    * directory nodes and 'contains_directory' edges for every directory and
      each of its ancestors;
    * file nodes (with index, directory and language) and 'contains_file'
      edges;
    * one 'snippet' node per code chunk of each cached file, linked by
      'contains_snippet';
    * one 'symbol' node per defined symbol, linked by 'defines';
    * 'import' edges from a file to the file or symbol its imports resolve to;
    * 'references' edges from a file to every definition of a symbol it uses.

    Args:
        data: Shared state dict produced by the parsing step.

    Returns:
        The graph object stored in ``data['graph']``.
    """
    graph = data['graph']

    def ensure_directory(path):
        # Create the node with its type before any edge touches it; an edge
        # to a missing node would create it without the 'directory' type.
        if not graph.has_node(path):
            graph.add_node(path, type='directory')
            data['directories'].add(path)

    # Iterate over a snapshot: ensure_directory may add ancestors to the set,
    # and a set must not change size while it is being iterated.
    for directory in sorted(data['directories']):
        ensure_directory(directory)

        parts = directory.split(os.sep)
        for depth in range(1, len(parts)):
            parent_path = os.sep.join(parts[:depth])
            if not parent_path:
                continue
            child_path = os.sep.join(parts[:depth + 1])
            ensure_directory(parent_path)
            ensure_directory(child_path)
            graph.add_edge(parent_path, child_path, edge_type='contains_directory')

    # Files, their snippets and their symbols.
    for file_path, file_idx in data['file_index'].items():
        language = detect_language(data, file_path)
        directory = os.path.dirname(file_path)

        if graph.has_node(file_path):
            graph.nodes[file_path].update({
                'file_index': file_idx,
                'directory': directory,
                'language': language
            })
        else:
            graph.add_node(file_path,
                           type='file',
                           file_index=file_idx,
                           directory=directory,
                           language=language)

        # Files directly under the root have no directory node.
        if directory:
            ensure_directory(directory)
            if not graph.has_edge(directory, file_path):
                graph.add_edge(directory, file_path, edge_type='contains_file')

        if file_path in data['file_contents']:
            chunks = chunk_code(data['file_contents'][file_path])
            for idx, chunk_info in enumerate(chunks):
                snippet_node = f"{file_path}::snippet::{idx}"
                graph.add_node(snippet_node,
                               type='snippet',
                               code_snippet=chunk_info['code_snippet'],
                               start_line=chunk_info['start_line'],
                               end_line=chunk_info['end_line'],
                               language=language)
                graph.add_edge(file_path, snippet_node,
                               edge_type='contains_snippet',
                               start_line=chunk_info['start_line'],
                               end_line=chunk_info['end_line'])

        for symbol, details in data['module_symbols'].get(file_path, {}).items():
            symbol_node = f"{file_path}::{symbol}"
            graph.add_node(symbol_node,
                           type='symbol',
                           symbol_type=details['type'],
                           line_number=details['line_no'],
                           context=details.get('context', ''),
                           docstring=details.get('docstring', ''))
            graph.add_edge(file_path, symbol_node,
                           edge_type='defines',
                           line_number=details['line_no'])

    # Import edges. Languages are looked up once per file, not once per
    # (import, target) pair.
    target_languages = {
        target_file: detect_language(data, target_file)
        for target_file in data['module_symbols']
    }
    for file_path, imports in data['import_relations'].items():
        importer_language = detect_language(data, file_path)
        for imp, line_no in imports:
            for target_file, symbols in data['module_symbols'].items():
                target_node = _resolve_import_node(
                    imp, importer_language, target_file,
                    target_languages[target_file], symbols)
                if target_node is not None:
                    graph.add_edge(file_path,
                                   target_node,
                                   edge_type='import',
                                   line_number=line_no)

    # Reference edges. Index "which files define this symbol" once so each
    # reference does not rescan every file's symbol table.
    defining_files = {}
    for target_file, symbols in data['module_symbols'].items():
        for symbol in symbols:
            defining_files.setdefault(symbol, []).append(target_file)

    for symbol, references in data['symbol_references'].items():
        targets = defining_files.get(symbol)
        if not targets:
            continue
        for file_path, line_no, context in references:
            for target_file in targets:
                graph.add_edge(file_path,
                               f"{target_file}::{symbol}",
                               edge_type='references',
                               line_number=line_no,
                               context=context)

    return data['graph']

In [None]:
def validate_graph_and_data(data, G):
    """Cross-check the parsed data against the graph and summarise both.

    Args:
        data: Parsing state; the keys read here are 'file_index',
            'directories' and 'symbol_index'. Each 'symbol_index' value is
            a list of entry dicts carrying 'type' and 'file', and
            optionally 'symbol_type'.
        G: Graph object providing has_node(), nodes() and edges().

    Returns:
        A dict with two keys:
          * 'stats'  - counts of files, directories, symbols, nodes and
            edges, plus per-kind tallies under 'symbol_types' and
            'edge_types'.
          * 'issues' - human-readable messages for every file, directory
            or symbol definition that has no matching node in G.
    """
    symbol_index = data['symbol_index']
    problems = []

    # Every indexed file must be present as a node.
    problems.extend(
        f"File {path} in index but missing from graph"
        for path in data['file_index']
        if not G.has_node(path)
    )

    # Every known directory must be present as a node.
    problems.extend(
        f"Directory {folder} in data but missing from graph"
        for folder in data['directories']
        if not G.has_node(folder)
    )

    # Every symbol definition must have a "<file>::<symbol>" node.
    for name, records in symbol_index.items():
        owners = [rec['file'] for rec in records if rec['type'] == 'definition']
        problems.extend(
            f"Symbol {name} defined in {owner} but node missing from graph"
            for owner in owners
            if not G.has_node(f"{owner}::{name}")
        )

    # Tally definitions by their declared symbol type (entries without a
    # 'symbol_type' key are not counted).
    kind_tally = {}
    for records in symbol_index.values():
        for rec in records:
            if rec['type'] == 'definition' and 'symbol_type' in rec:
                kind = rec['symbol_type']
                kind_tally[kind] = kind_tally.get(kind, 0) + 1

    # Tally edges by their 'edge_type' attribute; untyped edges count as
    # 'unknown'.
    edge_tally = {}
    for _src, _dst, attrs in G.edges(data=True):
        label = attrs.get('edge_type', 'unknown')
        edge_tally[label] = edge_tally.get(label, 0) + 1

    return {
        'stats': {
            'files': len(data['file_index']),
            'directories': len(data['directories']),
            'symbols': len(symbol_index),
            'nodes': len(G.nodes()),
            'edges': len(G.edges()),
            'symbol_types': kind_tally,
            'edge_types': edge_tally,
        },
        'issues': problems,
    }

In [None]:
def main(root_directory):
    """Run the full pipeline for one codebase: parse, build graph, validate.

    Progress messages and the JSON-formatted validation report are printed
    to stdout.

    Args:
        root_directory: Root directory of the codebase; stored under
            data['root_dir'] before parsing starts.

    Returns:
        A (data, graph) tuple: the populated data structures and the graph
        returned by build_graph.
    """
    # Fresh state for this run, pointed at the requested codebase.
    state = initialize_data_structures()
    state['root_dir'] = root_directory

    print(f"Starting to parse files in {root_directory}...")

    # Parsing fills the state in place.
    parse_files(state)
    parsed_count = len(state['file_index'])
    print(f"Parsed {parsed_count} files")

    # Turn the parsed state into a graph.
    graph = build_graph(state)
    node_count = len(graph.nodes())
    edge_count = len(graph.edges())
    print(f"Graph has {node_count} nodes and {edge_count} edges")

    # Consistency report, dumped as pretty-printed JSON.
    print(json.dumps(validate_graph_and_data(state, graph), indent=2))

    return state, graph

In [None]:
# Root directory of the codebase to analyse; passed to main() as root_directory.
root_dir = "flask"

# Run the pipeline. `data` holds the parsed structures and `G` the resulting graph.
data, G = main(root_dir)

### Make Database


In [None]:

# Connect to the ArangoDB deployment.
# Credentials are read from the environment instead of being hard-coded in
# the notebook, so the secret is never committed or shared with the file:
#   ARANGO_HOST      - endpoint URL (defaults to the project's cloud endpoint)
#   ARANGO_USERNAME  - user name (defaults to "root")
#   ARANGO_PASSWORD  - password; prompted for interactively when unset
import getpass

db = ArangoClient(
    hosts=os.environ.get("ARANGO_HOST", "https://d2eeb8083350.arangodb.cloud:8529")
).db(
    username=os.environ.get("ARANGO_USERNAME", "root"),
    password=os.environ.get("ARANGO_PASSWORD") or getpass.getpass("ArangoDB password: "),
    verify=True,  # verify the connection immediately so bad credentials fail here
)

print(db)

In [None]:

# Create the ArangoDB-backed graph "FlaskRepv1" from the in-memory graph G.
# NOTE(review): overwrite_graph=True presumably replaces any existing graph of
# the same name - confirm against the nx-arangodb documentation before rerunning.
G_adb = nxadb.Graph(
    name="FlaskRepv1",
    db=db,
    incoming_graph_data=G,
    write_batch_size=50000, # feel free to modify
    overwrite_graph=True
)

print(G_adb)

### Run from loaded database

In [None]:
# Reconnect to the ArangoDB deployment that already holds the graph.
# Credentials are read from the environment instead of being hard-coded in
# the notebook, so the secret is never committed or shared with the file:
#   ARANGO_HOST      - endpoint URL (defaults to the project's cloud endpoint)
#   ARANGO_USERNAME  - user name (defaults to "root")
#   ARANGO_PASSWORD  - password; prompted for interactively when unset
import getpass

db = ArangoClient(
    hosts=os.environ.get("ARANGO_HOST", "https://d2eeb8083350.arangodb.cloud:8529")
).db(
    username=os.environ.get("ARANGO_USERNAME", "root"),
    password=os.environ.get("ARANGO_PASSWORD") or getpass.getpass("ArangoDB password: "),
    verify=True,  # verify the connection immediately so bad credentials fail here
)

print(db)

<StandardDatabase _system>


In [None]:
# Attach to the graph "FlaskRepv1" by name only. No incoming_graph_data is
# passed here, so this cell does not upload the local graph G.
G_adb = nxadb.Graph(
    name="FlaskRepv1",
    db=db,
    # Upload arguments left disabled for the load-from-database path:
    #incoming_graph_data=G,
    #write_batch_size=50000 # feel free to modify
)

print(type(G_adb))

[20:27:38 +0530] [INFO]: Graph 'FlaskRepv1' exists.
[20:27:39 +0530] [INFO]: Default node type set to 'FlaskRepv1_node'


<class 'nx_arangodb.classes.graph.Graph'>


In [None]:
# Wrap the database connection in LangChain's ArangoGraph (langchain_community.graphs).
arango_graph = ArangoGraph(db)

In [None]:
# Display the schema exposed by the ArangoGraph wrapper.
print( arango_graph.schema )

{'Graph Schema': [{'graph_name': 'CodebaseGraph', 'edge_definitions': [{'edge_collection': 'CodebaseGraph_node_to_CodebaseGraph_node', 'from_vertex_collections': ['CodebaseGraph_node'], 'to_vertex_collections': ['CodebaseGraph_node']}]}, {'graph_name': 'FlaskRepv1_node_to_FlaskRespv1_node', 'edge_definitions': [{'edge_collection': 'FlaskRepv1_node_to_FlaskRespv1_node_node_to_FlaskRepv1_node_to_FlaskRespv1_node_node', 'from_vertex_collections': ['FlaskRepv1_node_to_FlaskRespv1_node_node'], 'to_vertex_collections': ['FlaskRepv1_node_to_FlaskRespv1_node_node']}]}, {'graph_name': 'FlaskRepv1_node', 'edge_definitions': [{'edge_collection': 'FlaskRepv1_node_node_to_FlaskRepv1_node_node', 'from_vertex_collections': ['FlaskRepv1_node_node'], 'to_vertex_collections': ['FlaskRepv1_node_node']}]}, {'graph_name': 'code_graph', 'edge_definitions': [{'edge_collection': 'code_edges', 'from_vertex_collections': ['code_nodes'], 'to_vertex_collections': ['code_nodes']}]}, {'graph_name': 'FlaskRespv1', '