In [15]:
import os
import sys
import argparse
import subprocess


def clone_repo(repo_url: str, dest: str = "./data") -> None:
    """
    Clone a Git repository from the given URL into the destination
    directory.

    The destination directory is created first when it does not exist.
    Failures of the ``git`` command are printed and then reported by
    raising ``SystemExit(1)``.

    Args:
        repo_url: HTTP or HTTPS URL of the Git repository to clone.
        dest:    Path to the local directory where the repo will be cloned.

    Raises:
        ValueError: If ``repo_url`` does not start with http:// or https://.
        SystemExit: With code 1 when ``git`` cannot be started or the clone
            command exits with a non-zero status.
    """
    if not repo_url.startswith(("http://", "https://")):
        raise ValueError("The repository URL must start with http:// or https://")

    if not os.path.isdir(dest):
        os.makedirs(dest)
        print(f"Created directory: {os.path.abspath(dest)}")

    try:
        subprocess.run(
            # "--" ends option parsing, so a dest that begins with "-" is
            # always treated as a path and never as a git option.
            ["git", "clone", "--", repo_url, dest],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except FileNotFoundError:
        # Raised by subprocess when the git executable cannot be found.
        print("Error cloning repository:\ngit executable not found on PATH")
        sys.exit(1)
    except subprocess.CalledProcessError as exc:
        # git output is not guaranteed to be UTF-8; escape undecodable bytes
        # so the real failure is shown instead of a UnicodeDecodeError.
        message = (exc.stderr or b"").decode("utf-8", errors="backslashreplace")
        print(f"Error cloning repository:\n{message.strip()}")
        sys.exit(1)
    else:
        print(f"Repository cloned into {os.path.abspath(dest)}")

In [17]:
# Fetch the tutorial repository that the rest of the notebook indexes.
clone_repo(
    repo_url="https://github.com/pixegami/langchain-rag-tutorial.git",
    dest="./data/project1",
)

Repository cloned into c:\Users\gabin\OneDrive\Bureau\Projets Perso\RAG_Explorer-for-repository\data\project1


In [43]:
from langchain.document_loaders import DirectoryLoader, TextLoader


def load_project_documents(project_path: str):
    """
    Load project files with specific extensions into Document objects.

    Files are decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
        project_path: Path to the cloned Git repository.

    Returns:
        A list of Document objects with metadata preserving file paths.
        If loading raises, the error is printed and an empty list is
        returned instead.
    """
    glob_patterns = [
        "**/*.py",    # Python source files
        "**/*.md",    # Markdown documentation
        "**/*.json",  # JSON config or data files
        "**/*.yaml",  # YAML config files
        "**/*.yml",   # YML config files
        "**/*.sh",    # Shell scripts
        # "**/*.ipynb", # jupyter notebook files
        "**/requirements.txt", # Only the requirements file
    ]
    try:
        loader = DirectoryLoader(
            path=project_path,
            glob=glob_patterns,
            recursive=True,
            show_progress=True,
            loader_cls=TextLoader,
            # Without an explicit encoding TextLoader opens files with the
            # platform default, which fails on UTF-8 sources on Windows.
            loader_kwargs={"encoding": "utf-8"},
        )
        return loader.load()
    except Exception as error:
        # Best-effort loader: report the problem and give callers an empty
        # list rather than propagating the failure.
        print(f"Error loading documents from '{project_path}': {error}")
        return []


In [53]:
from pathlib import Path
from langchain.schema import Document
from typing import List
import os

def load_project_documents(project_path: str) -> List["Document"]:
    """
    Manually load files, preserving all whitespace.

    Walks ``project_path`` recursively and reads every file matching one of
    the supported patterns as UTF-8 text.

    Args:
        project_path: Path to the directory to scan.

    Returns:
        One Document per matching file, with ``metadata["source"]`` set to
        the file path. An empty list is returned when nothing matches, or
        when reading fails (the error is printed).

    Raises:
        FileNotFoundError: If ``project_path`` is not an existing directory.
    """
    root = Path(project_path)
    if not root.is_dir():
        raise FileNotFoundError(f"{project_path} is not a valid directory.")

    patterns = [
        "*.py", "*.md", "*.json", "*.yaml",
        "*.yml", "*.sh", "requirements.txt",
    ]
    docs: List[Document] = []
    try:
        for pat in patterns:
            # Sorted so the document order is reproducible across runs.
            for file in sorted(root.rglob(pat)):
                if not file.is_file():
                    # A directory can match a pattern too; it has no text.
                    continue
                text = file.read_text(encoding="utf-8")  # keeps indentation
                docs.append(Document(page_content=text,
                                     metadata={"source": str(file)}))
    except Exception as error:
        print(f"Error loading documents from '{project_path}': {error}")
        return []

    # Return only after every pattern has been scanned, so all matching
    # files are collected rather than just the first one.
    return docs

    


In [47]:

# Load every supported file below the data directory, then show where each
# document came from together with the first 200 characters of its content.
path_to_repo = "./data"
documents = load_project_documents(path_to_repo)
for doc in documents:
    source = doc.metadata['source']
    print(f"Source: {source}")
    print(doc.page_content[:200], "\n.......\n")

Source: data\project1\compare_embeddings.py
from langchain_openai import OpenAIEmbeddings
from langchain.evaluation import load_evaluator
from dotenv import load_dotenv
import openai
import os

# Load environment variables. Assumes that project 
.......

Source: data\project1\create_database.py
# from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langch 
.......

Source: data\project1\query_data.py
import argparse
# from dataclasses import dataclass
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langc 
.......

Source: data\project1\README.md
# Langchain RAG Tutorial

## Install dependencies

1. Do the following before installing the dependencies found in `requirements.txt` file because of current challenges installing `onnxruntime` throug 
.......

Sou

In [39]:
# src/rag_explorer/splitter.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
splitter.py

Split Python code documents into per-function chunks using AST,
and leave other docs intact.
"""

import ast
from typing import List

from langchain.schema import Document


def split_code_by_function(doc: "Document") -> List["Document"]:
    """
    Split a Python code document into one Document per function.

    Every ``def`` and ``async def`` found anywhere in the syntax tree
    (including methods and nested functions) becomes its own chunk. A chunk
    starts at the function's first decorator, if any, and ends at the last
    line of its body.

    Args:
        doc: Document whose page_content is Python source code,
             and metadata must include "source" (file path).

    Returns:
        A list of Documents, each containing a single function's code
        and metadata["function_name"] set to that function's name.
        If parsing fails or no functions found, returns [doc].
    """
    code = doc.page_content
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError) as e:
        # ValueError covers sources containing null bytes on interpreters
        # that do not report them as SyntaxError.
        print(f"Return original document cannot be parsed:{e}")
        return [doc]

    lines = code.splitlines()
    split_docs: List[Document] = []

    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        # node.lineno points at the "def" line on Python 3.8+, so take the
        # earliest decorator line to keep decorators inside the chunk.
        first_line = min(
            [node.lineno] + [dec.lineno for dec in node.decorator_list]
        )

        end = getattr(node, "end_lineno", None)
        if end is None:
            # No end_lineno (Python < 3.8): approximate the end with the
            # largest line number of any node nested in the function.
            end = max(
                getattr(child, "lineno", node.lineno)
                for child in ast.walk(node)
            )

        func_code = "\n".join(lines[first_line - 1:end])
        # Copy so each chunk owns its metadata and the input stays untouched.
        metadata = doc.metadata.copy()
        metadata["function_name"] = node.name
        split_docs.append(Document(page_content=func_code, metadata=metadata))

    # If no functions found, keep the original doc
    return split_docs or [doc]


def split_documents_by_function(docs: List[Document]) -> List[Document]:
    """
    Expand Python documents into per-function chunks; pass the rest through.

    A document counts as Python when its ``metadata["source"]`` ends with
    ".py" (a missing source is treated as an empty string). Such documents
    are replaced by the output of split_code_by_function; every other
    document is kept unchanged, in its original position.

    Args:
        docs: List of Document objects.

    Returns:
        Expanded list of Document objects, split per Python function.
    """
    expanded: List[Document] = []
    for item in docs:
        is_python = item.metadata.get("source", "").endswith(".py")
        expanded += split_code_by_function(item) if is_python else [item]
    return expanded


In [52]:
# Split the loaded documents per function and preview each resulting chunk.
Documents_splitted = split_documents_by_function(documents)
for doc in Documents_splitted:
    print(f"Source: {doc.metadata['source']}")
    # Documents that were not split carry no "function_name" key. Check for
    # the key explicitly: a bare except would also hide unrelated errors.
    if "function_name" in doc.metadata:
        print(f"Function_name: {doc.metadata['function_name']}")
    else:
        print("no function_name metadata.")
    print(doc.page_content[:200], "\n.......\n")

Source: data\project1\compare_embeddings.py
Function_name: main
def main():
    # Get embedding for a word.
    embedding_function = OpenAIEmbeddings()
    vector = embedding_function.embed_query("apple")
    print(f"Vector for 'apple': {vector}")
    print(f"Vect 
.......

Source: data\project1\create_database.py
Function_name: main
def main():
    generate_data_store() 
.......

Source: data\project1\create_database.py
Function_name: generate_data_store
def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks) 
.......

Source: data\project1\create_database.py
Function_name: load_documents
def load_documents():
    loader = DirectoryLoader(DATA_PATH, glob="*.md")
    documents = loader.load()
    return documents 
.......

Source: data\project1\create_database.py
Function_name: split_text
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_o

In [56]:
# src/rag_explorer/embedder.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
embedder.py

Wraps OpenAI embedding model using LangChain to transform
documents into vector representations.
"""
import os 
from typing import List
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document

class OpenAIEmbedder:
    """
    Thin adapter that exposes LangChain's OpenAIEmbeddings for Documents
    and plain query strings.
    """

    def __init__(self, model: str = "text-embedding-3-large") -> None:
        """
        Create the underlying OpenAIEmbeddings client.

        Args:
            model: Name of the OpenAI embedding model to use.
        """
        self.embedder = OpenAIEmbeddings(model=model)

    def embed_documents(self, documents: List[Document]) -> List[List[float]]:
        """
        Embed the text content of each Document.

        Only ``page_content`` is sent to the model; metadata is ignored.

        Args:
            documents: List of Document objects.

        Returns:
            The embeddings produced by the underlying model for the
            documents' texts.
        """
        contents = []
        for document in documents:
            contents.append(document.page_content)
        return self.embedder.embed_documents(contents)

    def embed_query(self, query: str) -> List[float]:
        """
        Embed one query string.

        Args:
            query: The input query text.

        Returns:
            The embedding vector for the query.
        """
        vector = self.embedder.embed_query(query)
        return vector


In [59]:

# Build the embedder with its default model and embed one sample question.
embedder = OpenAIEmbedder()
embedding = embedder.embed_query(query="What is the function of this script?")

In [61]:
# Number of components in the query embedding assigned to `embedding` above.
len(embedding)

3072