In [None]:
#|default_exp rag

# RAG Support for ShellSage

This notebook implements Retrieval-Augmented Generation (RAG) support for ShellSage using local man pages.

In [None]:
#|export
import lancedb
from chonkie import RecursiveChunker
from model2vec import Model2Vec
from pathlib import Path
from typing import List, Dict, Tuple
import subprocess
import re

## Man Page Extraction

In [None]:
#|export
def _resolve_man_path(name: str, section: str) -> str:
    """Return the file path `man -w` reports for `name`, or "" if unresolved."""
    # Ask for the specific section first so that pages sharing a name across
    # sections (e.g. printf(1) vs printf(3)) resolve to their own file. Fall
    # back to the unqualified lookup when the section-qualified one fails.
    for cmd in (['man', '-w', section, name], ['man', '-w', name]):
        try:
            # stderr is discarded so failed lookups do not flood the output.
            return subprocess.check_output(cmd, stderr=subprocess.DEVNULL, text=True).strip()
        except (subprocess.SubprocessError, OSError, ValueError):
            continue
    return ""


def get_man_pages() -> List[Dict[str, str]]:
    """Get all available man pages on the system.

    Runs `apropos .` to list the entries of the manual database and resolves
    each parsed entry to a file path with `man -w`.

    Returns:
        A list of dicts with the keys `title`, `section`, `description` and
        `path`. Lines that do not look like `name(section) - description`
        and entries whose path cannot be resolved are skipped.
    """
    result = subprocess.run(['apropos', '.'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    pages = []

    for line in result.stdout.splitlines():
        if not line.strip():
            continue
        match = re.match(r'([^(]+)\(([^)]+)\)\s*-\s*(.+)', line)
        if not match:
            continue

        name, section, desc = match.groups()
        name = name.strip()

        path = _resolve_man_path(name, section.strip())
        if not path:
            # Best effort: an entry without a resolvable file is not indexable.
            continue

        pages.append({
            'title': name,
            'section': section,
            'description': desc.strip(),
            'path': path
        })

    return pages

In [None]:
#|export
def read_man_page(path: str) -> str:
    """Read a man page and return its text content.

    Args:
        path: Path of the man page file, passed to `man` as its only argument.

    Returns:
        The text `man` wrote to stdout with overstrike formatting removed,
        or "" when `path` is empty or `man` could not be run.
    """
    if not path:
        return ""
    try:
        result = subprocess.run(['man', path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best effort: a page that cannot be rendered is treated as empty.
        return ""
    # Formatted man output may encode bold/underline as "char, backspace, char".
    # Dropping every "<char>\b" pair leaves only the visible text (like `col -b`).
    return re.sub(r'.\x08', '', result.stdout)

## Text Chunking

In [None]:
#|export
_chunker = None  # shared RecursiveChunker, created on first use


def _get_chunker():
    """Return the shared RecursiveChunker, constructing it on first use."""
    global _chunker
    if _chunker is None:
        _chunker = RecursiveChunker()
    return _chunker


def chunk_text(text: str) -> List[str]:
    """Chunk text using Chonkie's RecursiveChunker.

    Args:
        text: The text to split.

    Returns:
        The `.text` of every chunk the chunker produced, in order. Empty
        input yields an empty list without touching the chunker.
    """
    if not text:
        return []
    # Reuse one chunker instead of building a new one for every page.
    return [chunk.text for chunk in _get_chunker()(text)]

## Vector Embeddings

In [None]:
#|export
_embedding_model = None  # shared embedding model, created on first use


def _get_embedding_model():
    """Return the shared embedding model, constructing it on first use."""
    global _embedding_model
    if _embedding_model is None:
        # NOTE(review): confirm that the installed model2vec exposes
        # `Model2Vec(...)` and `.embed(...)`; its documented entry point may
        # be `StaticModel.from_pretrained(...)` / `.encode(...)` instead.
        _embedding_model = Model2Vec("minishlab/M2V_base_output")
    return _embedding_model


def get_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate embeddings for a list of text chunks using Model2Vec.

    Args:
        texts: The texts to embed.

    Returns:
        One embedding per input text, in input order. Empty input yields an
        empty list without constructing the model.
    """
    texts = list(texts)
    if not texts:
        return []
    # The model is built once and reused; it used to be rebuilt on every call,
    # i.e. once per indexed page and once per query.
    model = _get_embedding_model()
    return [model.embed(text) for text in texts]

## Testing Functions

In [None]:
# Smoke-test man page discovery: report the total, then preview the first three entries
pages = get_man_pages()
print(f"Found {len(pages)} man pages\n", "Example entries:", sep="\n")
for page in pages[:3]:
    print(
        f"\nTitle: {page['title']}",
        f"Section: {page['section']}",
        f"Description: {page['description']}",
        f"Path: {page['path']}",
        sep="\n",
    )

## LanceDB Integration

In [None]:
#|export
def init_db(db_path: str = "man_index.lance") -> lancedb.db.LanceDB:
    """Connect to the LanceDB database at `db_path` and return the connection."""
    # NOTE(review): confirm `lancedb.db.LanceDB` names a real class in the
    # installed lancedb version; the annotation is evaluated at definition time.
    connection = lancedb.connect(db_path)
    return connection

def create_chunks_table(db: lancedb.db.LanceDB):
    """Create or replace the man page chunks table.

    The table is created from a single placeholder row so that the column
    types, including the embedding dimension, are inferred from real values.
    The placeholder is deleted afterwards, so the returned table is empty.

    Args:
        db: An open LanceDB connection.

    Returns:
        The newly created, empty `man_chunks` table.
    """
    placeholder = {
        "title": "",
        "section": "",
        "chunk": "",
        "vector": get_embeddings([""])[0]  # Get schema from empty embedding
    }
    # LanceDB accepts "create" or "overwrite" here; "overwrite" replaces an
    # existing table, which is the create-or-replace behavior intended.
    table = db.create_table("man_chunks", data=[placeholder], mode="overwrite")
    # Remove the schema-only row so it can never be returned by a search.
    table.delete("chunk = ''")
    return table

In [None]:
#|export
def index_man_pages(db_path: str = "man_index.lance"):
    """Index all man pages into the vector database.

    Recreates the chunks table, then for every page returned by
    `get_man_pages` reads its text, chunks it, embeds the chunks and stores
    one row per chunk (`title`, `section`, `chunk`, `vector`).

    Args:
        db_path: Location of the LanceDB database to write to.
    """
    db = init_db(db_path)
    table = create_chunks_table(db)

    for page in get_man_pages():
        text = read_man_page(page['path'])
        if not text:
            continue

        chunks = chunk_text(text)
        if not chunks:
            continue
        vectors = get_embeddings(chunks)

        # Write all rows of a page in a single add() call rather than issuing
        # one write per chunk.
        rows = [
            {
                "title": page['title'],
                "section": page['section'],
                "chunk": chunk,
                "vector": vector
            }
            for chunk, vector in zip(chunks, vectors)
        ]
        table.add(rows)

In [None]:
#|export
def query_man_pages(query: str, top_k: int = 5, db_path: str = "man_index.lance") -> List[Dict]:
    """Return the `top_k` indexed man page chunks closest to `query`.

    The query is embedded with `get_embeddings` and used for a vector search
    over the `man_chunks` table of the database at `db_path`.
    """
    chunks_table = init_db(db_path).open_table("man_chunks")

    # Embed the query text the same way the chunks were embedded
    embedded_query = get_embeddings([query])[0]

    # Nearest-neighbour lookup, capped at top_k hits
    search = chunks_table.search(embedded_query).limit(top_k)
    return search.to_list()

## Example Usage

In [None]:
# Build the index (this may take a while)
index_man_pages()

# Run a sample query and preview the first 200 characters of each hit
results = query_man_pages("how to list files in a directory")
for r in results:
    print(
        f"\nFrom: {r['title']}({r['section']})",
        f"Chunk: {r['chunk'][:200]}...",
        sep="\n",
    )