In [54]:
import json
from bs4 import BeautifulSoup
from pathlib import Path
import pandas as pd

def convert_html_to_markdown(html_content):
    """
    Strip HTML tags from a markdown cell's source and append its hyperlinks.

    The visible text of ``html_content`` is kept (tags removed, surrounding
    whitespace trimmed). Every ``<a>`` element that carries an ``href`` is then
    listed underneath, one per line, as a markdown link ``[text](href)``.
    """
    parsed = BeautifulSoup(html_content, "html.parser")

    # Tag-free text of the whole fragment.
    body_text = parsed.get_text().strip()

    # One markdown-style link per anchor that actually has a target.
    link_lines = [
        f"[{anchor.text}]({anchor['href']})"
        for anchor in parsed.find_all('a', href=True)
    ]

    # A newline always separates the text from the (possibly empty) link list.
    return body_text + "\n" + "\n".join(link_lines)

def extract_and_format_with_full_markdown_context(file_path):
    """
    Process a single .ipynb file, extracting code cells with surrounding markdown context.

    Args:
        file_path (str | pathlib.Path): Location of the notebook file.

    Returns:
        list[dict]: One record per code cell, in notebook order, with the keys
        ``file_path``, ``file_name``, ``cell_number`` (1-indexed position among
        all cells of the notebook), ``code``, ``markdown_above`` and
        ``markdown_below``. ``markdown_above`` joins every markdown cell seen
        since the previous code cell; ``markdown_below`` is the first markdown
        cell that follows before the next code cell. Either one is the string
        "Markdown not found" when there is no such markdown.
    """
    # Accept plain strings as well as Path objects; `.name` below needs a Path.
    file_path = Path(file_path)

    with open(file_path, 'r', encoding='utf-8') as f:
        notebook_data = json.load(f)

    cells = notebook_data.get('cells', [])
    processed_cells = []
    markdown_buffer = []

    for index, cell in enumerate(cells):
        cell_type = cell.get('cell_type')
        source_content = ''.join(cell.get('source', []))

        if cell_type == 'markdown':
            # Accumulate markdown until the next code cell consumes it.
            markdown_buffer.append(convert_html_to_markdown(source_content))
        elif cell_type == 'code':
            markdown_above = '\n'.join(markdown_buffer) if markdown_buffer else "Markdown not found"

            # Look ahead for the first markdown cell; stop at the next code cell.
            # Cells of any other type (or without a type) are skipped over.
            markdown_below = "Markdown not found"
            for next_index in range(index + 1, len(cells)):
                next_cell = cells[next_index]
                next_type = next_cell.get('cell_type')
                if next_type == 'markdown':
                    markdown_below = convert_html_to_markdown(''.join(next_cell.get('source', [])))
                    break
                if next_type == 'code':
                    break

            processed_cells.append({
                "file_path": str(file_path),
                "file_name": file_path.name,
                "cell_number": index + 1,  # Cell number (1-indexed)
                "code": source_content,
                "markdown_above": markdown_above,
                "markdown_below": markdown_below
            })

            # The buffered markdown belongs to this code cell only.
            markdown_buffer = []

    return processed_cells

def extract_notebooks_from_repo(repo_path):
    """
    Collect the code cells of every notebook found under a repository checkout.

    ``repo_path`` is searched recursively for ``*.ipynb`` files. Each file is
    handed to ``extract_and_format_with_full_markdown_context`` and the
    resulting records are gathered into one pandas DataFrame. The number of
    notebooks found and of cells extracted is printed along the way.
    """
    root = Path(repo_path)
    notebooks = list(root.glob('**/*.ipynb'))
    print(f"Found {len(notebooks)} Jupyter notebooks in the repository.")

    # Flatten the per-notebook record lists into a single list.
    records = [
        record
        for notebook in notebooks
        for record in extract_and_format_with_full_markdown_context(notebook)
    ]
    print(f"Extracted {len(records)} cells from notebooks.")

    return pd.DataFrame(records)

In [50]:
import os
import shutil
from pathlib import Path
import git
# Repository to mine for notebooks, and the local directory it is cloned into.
# NOTE: clone_repo() deletes `repo_dir` if it already exists before cloning.
repo_url = "https://github.com/langchain-ai/langgraph"
repo_dir = "./langgraph"


In [55]:
def clone_repo(repo_url, repo_dir):
    """
    Clone a GitHub repository into a freshly created local directory.

    Anything already present at ``repo_dir`` is deleted first, so the clone
    always starts from an empty destination.

    Args:
        repo_url (str): URL of the GitHub repository to clone
        repo_dir (str): Local directory to clone the repository into

    Returns:
        The object returned by ``git.Repo.clone_from``.

    Raises:
        ValueError: If repo_url is empty or not a string
        git.exc.GitCommandError: If cloning fails
        PermissionError: If the destination cannot be removed or created
    """
    url_is_usable = bool(repo_url) and isinstance(repo_url, str)
    if not url_is_usable:
        raise ValueError("Invalid repository URL")

    try:
        # Start from an empty destination directory.
        if os.path.exists(repo_dir):
            shutil.rmtree(repo_dir)
        os.makedirs(repo_dir, exist_ok=True)

        print(f"Cloning repository from {repo_url} to {repo_dir}")
        cloned = git.Repo.clone_from(repo_url, repo_dir)
        print("Repository cloned successfully")
        return cloned

    except git.exc.GitCommandError as e:
        # Report, then let the caller decide how to react.
        print(f"Git clone failed: {e}")
        raise
    except PermissionError:
        print("Permission denied when creating or accessing directory")
        raise

def process_github_repo(repo_url, repo_dir):
    """
    Clone a GitHub repository, extract its notebook cells, and remove the clone.

    Args:
        repo_url (str): URL of the GitHub repository
        repo_dir (str): Local directory to clone and process the repository

    Returns:
        DataFrame or None: The result of ``extract_notebooks_from_repo`` for
        the clone, or None when the inputs are invalid, cloning fails,
        extraction fails, or nothing was extracted. Errors are printed rather
        than raised.
    """
    # Validate BEFORE entering try/finally: the cleanup below deletes
    # `repo_dir`, and a call rejected for bad arguments must not remove a
    # directory this function never cloned into.
    if not repo_url or not isinstance(repo_url, str) or not repo_dir:
        print("Invalid repository URL or directory")
        return None

    try:
        # Clone repository
        try:
            clone_repo(repo_url, repo_dir)
        except (ValueError, git.exc.GitCommandError, PermissionError) as clone_error:
            print(f"Repository cloning failed: {clone_error}")
            return None

        # Convert to Path object for consistency
        repo_dir_path = Path(repo_dir)

        # Extract notebook cells from the cloned repository
        try:
            print("Extracting structures from the repository...")
            results = extract_notebooks_from_repo(repo_dir_path)

            if results is None or len(results) == 0:
                print("No structures extracted from the repository")
                return None

            return results

        except Exception as extract_error:
            print(f"Error extracting structures: {extract_error}")
            return None

    except Exception as unexpected_error:
        print(f"Unexpected error in repository processing: {unexpected_error}")
        return None

    finally:
        # Cleanup - always attempt to remove the clone once cloning was attempted.
        # Best effort: a failure here is reported but never masks the result.
        try:
            if os.path.exists(repo_dir):
                shutil.rmtree(repo_dir)
                print("Cloned repository directory cleaned up")
        except Exception as cleanup_error:
            print(f"Error during cleanup: {cleanup_error}")

In [56]:
# Clone the repository, extract every notebook code cell, then delete the clone.
# The result is a DataFrame of cells, or None if cloning or extraction failed.
all_functions=process_github_repo(repo_url, repo_dir)

Cloning repository from https://github.com/langchain-ai/langgraph to ./langgraph
Repository cloned successfully
Extracting structures from the repository...
Found 165 Jupyter notebooks in the repository.


  soup = BeautifulSoup(html_content, "html.parser")


Extracted 1191 cells from notebooks.
Cloned repository directory cleaned up


In [57]:
# Wrap the extraction result in a DataFrame and keep a list-of-dicts copy
# (`data`, one dict per row) that a later cell turns into documents.
df = pd.DataFrame(all_functions)
data = df.to_dict('records')

df.head()

Unnamed: 0,file_path,file_name,cell_number,code,markdown_above,markdown_below
0,langgraph/examples/code_assistant/langgraph_co...,langgraph_code_assistant_mistral.ipynb,2,! pip install -U langchain_community langchain...,# Code generation with self-correction\n\nAlph...,### LLM\n\nWe'll use the Mistral API and `Code...
1,langgraph/examples/code_assistant/langgraph_co...,langgraph_code_assistant_mistral.ipynb,4,"import os\n\nos.environ[""TOKENIZERS_PARALLELIS...",### LLM\n\nWe'll use the Mistral API and `Code...,"### Tracing\n\nOptionally, we'll use LangSmith..."
2,langgraph/examples/code_assistant/langgraph_co...,langgraph_code_assistant_mistral.ipynb,6,"os.environ[""LANGCHAIN_TRACING_V2""] = ""true""\no...","### Tracing\n\nOptionally, we'll use LangSmith...",## Code Generation\n\nTest with structured out...
3,langgraph/examples/code_assistant/langgraph_co...,langgraph_code_assistant_mistral.ipynb,8,# Select LLM\nfrom langchain_core.prompts impo...,## Code Generation\n\nTest with structured out...,Markdown not found
4,langgraph/examples/code_assistant/langgraph_co...,langgraph_code_assistant_mistral.ipynb,9,"question = ""Write a function for fibonacci.""\n...",Markdown not found,Markdown not found


In [16]:
import os
from getpass import getpass

# Ask for the OpenAI API key interactively; getpass does not echo the typed
# value, so the key is not shown in the cell output.
api_key = getpass("Enter your OpenAI API key: ")

# Expose the key to this kernel's process through the environment.
os.environ["OPENAI_API_KEY"] = api_key

In [23]:
from langchain_openai import OpenAIEmbeddings

# Embedding model handle.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment and is used
# to index `documents` in a vector-store cell not visible here — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
from langchain.schema import Document

# One dict per DataFrame row, i.e. per extracted code cell.
data = df.to_dict('records')

# The cell's code is the document text; the surrounding markdown and the cell's
# location are carried along as metadata so they can be shown next to results.
documents = [
    Document(
        page_content=item['code'],
        metadata={
            'cell_number': item['cell_number'],
            'code': item['code'],
            'markdown_above': item['markdown_above'],
            'markdown_below': item['markdown_below'],
            'file_name': item['file_name'],
        }
    ) for item in data
]

In [None]:
# Inspect the source of the first extracted code cell.
df['code'].values[0]

'import os\n\nos.environ["TOKENIZERS_PARALLELISM"] = "true"\nmistral_api_key = os.getenv("MISTRAL_API_KEY")  # Ensure this is set'

In [None]:
# Show the markdown captured above each code cell (the whole column as an array).
df['markdown_above'].values

array(['# Code generation with self-correction\n\nAlphaCodium presented an approach for code generation that uses control flow.\n\nMain idea: [construct an answer to a coding question iteratively.](https://x.com/karpathy/status/1748043513156272416?s=20). \n\n[AlphaCodium](https://github.com/Codium-ai/AlphaCodium) iteravely tests and improves an answer on public and AI-generated tests for a particular question. \n\nWe will implement some of these ideas from scratch using [LangGraph](https://langchain-ai.github.io/langgraph/):\n\n1. We show how to route user questions to different types of documentation\n2. We we will show how to perform inline unit tests to confirm imports and code execution work\n3. We will show how to use LangGraph to orchestrate this\n\n![Screenshot 2024-05-23 at 2.17.51 PM.png](attachment:15d3ac32-cdf3-4800-a30c-f26d828d69c8.png)\n',
       "### LLM\n\nWe'll use the Mistral API and `Codestral` instruct model, which support tool use!\n",
       "### Tracing\n\nOption

In [None]:
# NOTE(review): `retriever` is not defined in any cell visible in this notebook.
# Presumably it came from a vector store built over `documents` with
# `embeddings` in a cell that was since removed — TODO restore that cell so the
# notebook survives Restart Kernel -> Run All.
print(retriever)

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate

from langchain_anthropic import ChatAnthropic

# System prompt: the retrieved notebook cells are injected exactly once through
# {context}. The user's question arrives in the human message below, so it is
# not repeated here.
system_prompt = """
You are an expert coding assistant. Based on the following context, provide a detailed answer including executable Python code.

Context:
{context}

Make sure your response includes:
1. A clear explanation of the solution.
2. Relevant import statements.
3. A complete code example.
"""

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{question}")
])

# Initialize the Claude 3.5 Sonnet model
anthropic_model = ChatAnthropic(model="claude-3-5-sonnet-latest", temperature=0.2, max_tokens=2048)

# Retrieval-augmented QA chain; `retriever` must already exist in the kernel.
# "document_variable_name" names the prompt variable that receives the
# retrieved documents, and must match the {context} placeholder above.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=anthropic_model,
    chain_type_kwargs={
        "prompt": chat_prompt,
        "document_variable_name": "context"
    },
    return_source_documents=True,  # Include source documents for reference
)

In [78]:
# Define your query
# query = "How to install langchain_community library?"
query="how Streaming events in python works? can you show me the code as well"

# Run the chain; `.invoke` replaces the deprecated direct call `qa_chain({...})`.
# The result dict holds the answer under "result" and, because the chain was
# built with return_source_documents=True, the retrieved cells under
# "source_documents".
result = qa_chain.invoke({"query": query})

# Print the generated response
print("Generated Code Explanation:")
print(result["result"])

# Optionally, include the retrieved documents
print("\nReferenced Documents:")
for doc in result["source_documents"]:
    print(doc.page_content[:200])  # Print a snippet of the document

Generated Code Explanation:
I'll explain how streaming events work in Python and provide some practical examples. Streaming allows you to process data in real-time or sequentially without loading everything into memory at once.

Here's a comprehensive explanation with examples:

1. Basic Event Streaming

```python
import asyncio
from typing import AsyncGenerator, Any
from datetime import datetime

class EventStream:
    """A simple event streaming implementation"""
    
    def __init__(self):
        self.listeners = []
        
    async def emit(self, event: Any):
        """Emit an event to all listeners"""
        for listener in self.listeners:
            await listener(event)
            
    def add_listener(self, listener):
        """Add a new listener to the stream"""
        self.listeners.append(listener)
        
    def remove_listener(self, listener):
        """Remove a listener from the stream"""
        self.listeners.remove(listener)

# Example usage
async def even

In [79]:
# Show each retrieved cell with its markdown context. Single quotes are used
# for the metadata keys inside the f-strings: reusing the enclosing double
# quotes is a SyntaxError on Python versions before 3.12.
print("\nTop 5 Similar Code Snippets:")
for doc in result['source_documents']:
    print(f"markdown_below: {doc.metadata['markdown_below']}")
    print(f"markdown_above: {doc.metadata['markdown_above']}")
    print(f"cell_number: {doc.metadata['cell_number']}")
    print(f"Code:\n{doc.page_content}\n")


Top 5 Similar Code Snippets:
markdown_below: ## Branch off a past state

Using LangGraph's checkpointing, you can do more than just replay past states. You can branch off previous locations to let the agent explore alternate trajectories or to let a user "version control" changes in a workflow.

Let's show how to do this to edit the state at a particular point in time. Let's update the state to instead of playing the song on Apple to play it on Spotify:

markdown_above: To replay from this place we just need to pass its config back to the agent. Notice that it just resumes from right where it left all - making a tool call.

cell_number: 19
Code:
for event in app.stream(None, to_replay.config):
    for v in event.values():
        print(v)

markdown_below: Markdown not found
markdown_above: **Resume**

We can now call the agent again with no inputs to continue.

This will run the tool as requested.

Running an interrupted graph with `None` in the inputs means to `proceed as if the inte

In [65]:
# Raw repr of the retrieved Document objects (metadata included), for debugging.
print(result['source_documents'])

[Document(metadata={'cell_number': 2, 'code': '! pip install -U langchain_community langchain-mistralai langchain langgraph', 'markdown_above': '# Code generation with self-correction\n\nAlphaCodium presented an approach for code generation that uses control flow.\n\nMain idea: [construct an answer to a coding question iteratively.](https://x.com/karpathy/status/1748043513156272416?s=20). \n\n[AlphaCodium](https://github.com/Codium-ai/AlphaCodium) iteravely tests and improves an answer on public and AI-generated tests for a particular question. \n\nWe will implement some of these ideas from scratch using [LangGraph](https://langchain-ai.github.io/langgraph/):\n\n1. We show how to route user questions to different types of documentation\n2. We we will show how to perform inline unit tests to confirm imports and code execution work\n3. We will show how to use LangGraph to orchestrate this\n\n![Screenshot 2024-05-23 at 2.17.51 PM.png](attachment:15d3ac32-cdf3-4800-a30c-f26d828d69c8.png)\n