In [1]:
import os
import shutil
from pathlib import Path
import git
# Repository to analyse and the local working directory it is cloned into.
# NOTE: clone_repo() removes repo_dir if it already exists, and
# process_github_repo() removes it again after processing.
repo_url = "https://github.com/langchain-ai/langgraph"
repo_dir = "./langgraph"


In [2]:
import pandas as pd
import re
from pathlib import Path
import os
import shutil
import git

# Markdown element prefixes (kept for backward compatibility with other cells)
HEADING_PREFIXES = ['#', '##', '###', '####', '#####', '######']
NEWLINE = '\n'

# ATX heading: 1-6 '#' characters followed by end-of-line or whitespace + text.
# Lines such as '#hashtag' or '#######' are deliberately NOT headings.
_ATX_HEADING_RE = re.compile(r'(#{1,6})(?:[ \t]+(.*))?$')
# A code fence opens with a run of 3+ backticks or 3+ tildes.
_FENCE_OPEN_RE = re.compile(r'(`{3,}|~{3,})')


def _opening_fence(line):
    """Return the fence run (e.g. '```') if `line` opens a fenced code block, else None."""
    match = _FENCE_OPEN_RE.match(line)
    if match is None:
        return None
    run = match.group(1)
    # A backtick fence may not have backticks in its info string; this keeps
    # single-line inline code such as ```x``` from being mistaken for a fence.
    if run[0] == '`' and '`' in line[match.end():]:
        return None
    return run


def get_markdown_elements(filepath):
    """
    Extract structured elements (headings and paragraph lines) from a Markdown (.md) file.

    Args:
        filepath: Path (str or path-like) of the Markdown file, read as UTF-8.

    Yields:
        dict: One record per non-empty line.
            Headings:   {'type': 'heading', 'level': int, 'content': str, 'filepath': filepath}
            Other text: {'type': 'paragraph', 'content': str, 'filepath': filepath}
        Lines inside fenced code blocks (including the fence lines themselves)
        are always yielded as paragraphs, so '# comment' lines in code samples
        are not reported as headings.
    """
    open_fence = None  # fence run that opened the current code block, if any
    with open(filepath, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:  # Skip empty lines
                continue

            if open_fence is not None:
                # Inside a code block: it closes on a line made solely of the
                # same fence character, at least as long as the opening run.
                fence_char = open_fence[0]
                if not line.strip(fence_char) and len(line) >= len(open_fence):
                    open_fence = None
                yield {
                    'type': 'paragraph',
                    'content': line,
                    'filepath': filepath,
                }
                continue

            fence = _opening_fence(line)
            if fence is not None:
                open_fence = fence
                yield {
                    'type': 'paragraph',
                    'content': line,
                    'filepath': filepath,
                }
                continue

            heading = _ATX_HEADING_RE.match(line)
            if heading:
                yield {
                    'type': 'heading',
                    'level': len(heading.group(1)),
                    'content': (heading.group(2) or '').strip(),
                    'filepath': filepath,
                }
            else:
                # Treat non-heading lines as paragraph content
                yield {
                    'type': 'paragraph',
                    'content': line,
                    'filepath': filepath,
                }

def extract_markdown_from_repo(md_root):
    """
    Extract all elements from .md files found recursively under a directory.

    Args:
        md_root: Root directory to search, as a str or pathlib.Path.

    Returns:
        list[dict] | None: The elements yielded by get_markdown_elements for
        every '*.md' file under md_root, or None when no such file exists.
    """
    # Path() accepts both str and Path, so callers need not pre-convert.
    # Sorting makes the element order reproducible; raw glob order depends on
    # the filesystem.
    md_files = sorted(Path(md_root).glob('**/*.md'))

    num_files = len(md_files)
    print(f'Total number of .md files: {num_files}')

    if num_files == 0:
        print('Verify the repository exists and md_root is set correctly.')
        return None

    all_elements = [
        element
        for md_file in md_files
        for element in get_markdown_elements(md_file)
    ]

    num_elements = len(all_elements)
    print(f'Total number of elements extracted: {num_elements}')

    return all_elements

# Compiled once rather than on every call. Each pattern captures only the
# top-level package name in group 1.
_IMPORT_LINE_PATTERNS = (
    re.compile(r'^\s*import\s+(\w+)'),                # 'import pkg' / 'import pkg.sub'
    re.compile(r'^\s*from\s+(\w+)[\w.]*\s+import'),   # 'from pkg.sub import name'
)


def identify_library(content):
    """
    Identify the top-level library imported by a line of text.

    Recognises lines that start with 'import <library>' or
    'from <library>[.submodule] import ...'.

    Args:
        content: The text to inspect (a single line in this notebook).

    Returns:
        str | None: The top-level library name, or None when the text is not a
        string, is not an import statement, or is a relative import
        ('from .module import x'), which names no library.
    """
    if not isinstance(content, str):
        return None

    for pattern in _IMPORT_LINE_PATTERNS:
        match = pattern.search(content)
        if match:
            # Group 1 stops at the first dot, so it is already the top-level name.
            return match.group(1)

    return None

def add_library_column(records):
    """
    Annotate every record with the library detected in its 'content' text.

    Each record is updated in place with a 'library' key holding whatever
    identify_library returns for record['content']; the same `records`
    object is then returned.
    """
    for entry in records:
        detected = identify_library(entry['content'])
        entry['library'] = detected
    return records


In [3]:
def clone_repo(repo_url, repo_dir):
    """
    Clone a Git repository into a freshly created local directory.

    Any existing `repo_dir` is deleted first so the clone starts from an
    empty destination.

    Args:
        repo_url (str): URL of the GitHub repository to clone
        repo_dir (str): Local directory to clone the repository into

    Returns:
        The repository object returned by git.Repo.clone_from.

    Raises:
        ValueError: If repo_url is empty or not a string
        git.exc.GitCommandError: If cloning fails
        PermissionError: If the destination cannot be removed or created
    """
    # Reject empty or non-string URLs before touching the filesystem.
    if not (repo_url and isinstance(repo_url, str)):
        raise ValueError("Invalid repository URL")

    try:
        # Wipe any previous checkout, then recreate the empty target directory.
        if os.path.exists(repo_dir):
            shutil.rmtree(repo_dir)
        os.makedirs(repo_dir, exist_ok=True)

        print(f"Cloning repository from {repo_url} to {repo_dir}")
        cloned = git.Repo.clone_from(repo_url, repo_dir)
        print("Repository cloned successfully")
        return cloned

    except (git.exc.GitCommandError, PermissionError) as failure:
        # Report the failure, then let the original exception propagate.
        if isinstance(failure, git.exc.GitCommandError):
            print(f"Git clone failed: {failure}")
        else:
            print("Permission denied when creating or accessing directory")
        raise

def process_github_repo(repo_url, repo_dir):
    """
    Clone a GitHub repository, extract its Markdown elements, and clean up.

    Args:
        repo_url (str): URL of the GitHub repository
        repo_dir (str): Local directory to clone and process the repository

    Returns:
        list[dict] | None: The Markdown element records, each with a 'library'
        key added by add_library_column, or None if the inputs are invalid,
        cloning fails, or nothing could be extracted.

    Notes:
        Errors are reported via print() and turned into a None return; this
        function does not raise. `repo_dir` is removed once a clone has been
        attempted, whether or not processing succeeded.
    """
    # Validate BEFORE entering the try/finally: with bad inputs nothing was
    # cloned, so the cleanup step must not delete a pre-existing repo_dir.
    if not repo_url or not isinstance(repo_url, str) or not repo_dir:
        print("Invalid repository URL or directory")
        return None

    try:
        # Clone repository
        try:
            clone_repo(repo_url, repo_dir)
        except (ValueError, git.exc.GitCommandError, PermissionError) as clone_error:
            print(f"Repository cloning failed: {clone_error}")
            return None

        # Extract structures
        try:
            print("Extracting structures from the repository...")
            results = extract_markdown_from_repo(Path(repo_dir))

            if results is None or len(results) == 0:
                print("No structures extracted from the repository")
                return None

            # Add library column
            return add_library_column(results)

        except Exception as extract_error:
            print(f"Error extracting structures: {extract_error}")
            return None

    except Exception as unexpected_error:
        print(f"Unexpected error in repository processing: {unexpected_error}")
        return None

    finally:
        # Cleanup - always attempt to remove the cloned directory. Best-effort:
        # a cleanup failure is reported but never masks the result.
        try:
            if os.path.exists(repo_dir):
                shutil.rmtree(repo_dir)
                print("Cloned repository directory cleaned up")
        except Exception as cleanup_error:
            print(f"Error during cleanup: {cleanup_error}")


In [4]:
# Clone the repository, extract its Markdown elements and remove the clone.
# NOTE(review): despite its name, `all_functions` holds the list of Markdown
# element dicts returned by process_github_repo, or None on failure.
all_functions=process_github_repo(repo_url, repo_dir)

Cloning repository from https://github.com/langchain-ai/langgraph to ./langgraph
Repository cloned successfully
Extracting structures from the repository...
Total number of .md files: 102
Total number of elements extracted: 11040
Cloned repository directory cleaned up


In [5]:
# One row per extracted Markdown element. Paragraph records carry no 'level'
# key, which is why that column shows NaN for them.
df = pd.DataFrame(all_functions)
# The same rows as a list of dicts.
# NOTE(review): `data` is not used by any cell visible here — confirm it is still needed.
data = df.to_dict('records')
df.head()

Unnamed: 0,type,level,content,filepath,library
0,heading,1.0,🦜🕸️LangGraph,langgraph/README.md,
1,paragraph,,![Version](https://img.shields.io/pypi/v/langg...,langgraph/README.md,
2,paragraph,,[![Downloads](https://static.pepy.tech/badge/l...,langgraph/README.md,
3,paragraph,,[![Open Issues](https://img.shields.io/github/...,langgraph/README.md,
4,paragraph,,[![Docs](https://img.shields.io/badge/docs-lat...,langgraph/README.md,


In [6]:
# Distinct values assigned by identify_library; None means the line was not
# recognised as an import statement.
df['library'].unique()

array([None, 'typing', 'langchain_core', 'langchain_anthropic',
       'langgraph', 'asyncio', 'logging', 'os', 'your_lib',
       'langgraph_sdk', 'langchain_community', 'fastapi',
       'your_agent_package', 'typing_extensions', 'operator',
       'langchain_openai', 'my_agent', 'type', 'httpx'], dtype=object)

In [7]:
# Inspect the rows behind the placeholder-looking library name 'your_lib'.
df[df['library']=='your_lib']

Unnamed: 0,type,level,content,filepath,library
880,paragraph,,from your_lib import graph # graph expected to...,langgraph/libs/scheduler-kafka/README.md,your_lib
902,paragraph,,from your_lib import graph # graph expected to...,langgraph/libs/scheduler-kafka/README.md,your_lib


In [8]:
# Check for rows labelled 'our'; the recorded output is an empty frame.
df[df['library']=='our']

Unnamed: 0,type,level,content,filepath,library


In [26]:
# Inspect the rows behind the placeholder-looking library name 'your_agent_package'.
df[df['library']=='your_agent_package']

Unnamed: 0,type,level,content,filepath,library
1685,paragraph,,from your_agent_package import graph,langgraph/docs/docs/concepts/langgraph_platfor...,your_agent_package


In [77]:
import re

# Patterns are applied with re.MULTILINE, so '^' anchors at every line start.
# '[ \t]' is used instead of '\s' because '\s' also matches newlines and would
# let a match spill onto the next line (e.g. 'import\nfoo = 1' -> 'foo').
# Group 1 captures only the top-level package; relative imports
# ('from .module import x') do not match.
library_patterns = [
    r'^[ \t]*import[ \t]+(\w+)',                     # Matches 'import library_name'
    r'^[ \t]*from[ \t]+(\w+)[\w.]*[ \t]+import',     # Matches 'from library_name.submodule import'
]

def extract_libraries(code):
    """
    Collect the top-level libraries imported anywhere in a block of code.

    Args:
        code (str): Source text, possibly spanning many lines.

    Returns:
        list[str]: Unique top-level library names, sorted alphabetically so
        the result is deterministic across runs.
    """
    libraries = set()
    for pattern in library_patterns:
        # re.MULTILINE makes '^' match at the start of each line.
        libraries.update(re.findall(pattern, code, re.MULTILINE))
    return sorted(libraries)

# Example usage. The snippet deliberately mixes 'import x' and 'from x import y'
# forms; the malformed 'from anggraph...' line has no 'import' keyword and is
# not matched (see the recorded output).
code_snippet = """
import pandas
from numpy import array
from operator import add
import os
import fastapi
from langchain import langgraph
from anggraph.checkpoint.postgres,    # Matches 'from library_name.submodule import'
from my_agent.utils.nodes import call_model
"""

libraries = extract_libraries(code_snippet)
print(libraries)  # Expected members (any order): fastapi, langchain, my_agent, numpy, operator, os, pandas


['os', 'langchain', 'numpy', 'my_agent', 'pandas', 'operator', 'fastapi']
