In [None]:
#|default_exp rag

# RAG Support for ShellSage

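This notebook implements retrieval-augmented generation (RAG) support for ShellSage: it indexes the system's local man pages into a LanceDB vector table and searches them by embedding similarity.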

In [None]:
#|hide
from nbdev.showdoc import *

In [None]:
#|export
import logging
import os
import re
import subprocess

import lancedb
import pyarrow as pa
from chonkie import RecursiveChunker
from model2vec import StaticModel

# NOTE(review): this runs at import time of the exported module and sets up
# the root logger at INFO level (basicConfig is a no-op if the root logger
# already has handlers). Consider leaving logging setup to the application.
logging.basicConfig(level=logging.INFO)

In [None]:
#|export
def get_man_pages():
    """List the man pages reported by `apropos` together with their file paths.

    Runs `apropos -l .` and parses every non-blank output line of the form
    `name (section) ...`. Each parsed entry is resolved to a file with
    `man -w <section> <name>`. The section is passed explicitly so the
    recorded path belongs to the same section as the entry; resolving by name
    alone can return the page of a different section (e.g. printf(1) for a
    `printf (3)` entry).

    Lines that cannot be parsed, and entries whose page cannot be located,
    are skipped (logged at DEBUG level).

    Returns:
        list[dict]: one dict per page with keys 'title', 'section' and 'path'.
    """
    result = subprocess.run(['apropos', '-l', '.'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    pages = []

    for line in result.stdout.splitlines():
        if not line.strip():
            continue
        try:
            # "name (section) - description" -> name, section
            name, section = line.split('(', 1)
            section = section.split(')', 1)[0].strip()
            name = name.strip()
            if not name or not section:
                continue

            # stderr is discarded so failed lookups do not spam the terminal;
            # a non-zero exit still raises and is handled below.
            path = subprocess.check_output(
                ['man', '-w', section, name],
                stderr=subprocess.DEVNULL,
                text=True,
            ).strip()
            if os.path.exists(path):
                pages.append({
                    'title': name,
                    'section': section,
                    'path': path
                })
        except Exception as e:
            # Best effort: one bad line must not abort the whole listing.
            logging.debug("Skipping line %s: %s", line, e)
            continue

    return pages

In [None]:
#|export
# Terminal formatting that `man` may leave in captured output: nroff
# overstrike pairs (a character followed by a backspace, used for bold and
# underline) and ANSI SGR colour/style escapes.
_MAN_FORMATTING_RE = re.compile(r'.\x08|\x1b\[[0-9;]*m')

def read_man_page(path):
    """Render the man page at `path` and return it as plain text.

    Runs `man <path>` and captures its output. Overstrike sequences and ANSI
    SGR escapes are removed so the text that gets chunked and embedded
    contains only the readable content.

    Args:
        path: file path of the man page, as passed to `man`.

    Returns:
        str: the rendered page text, or "" if `man` exits with a non-zero
        status or cannot be run (the failure is logged as a warning).
    """
    try:
        result = subprocess.run(['man', path], capture_output=True, text=True)
    except Exception as e:
        logging.warning("Failed to read man page %s: %s", path, e)
        return ""

    if result.returncode != 0:
        return ""
    return _MAN_FORMATTING_RE.sub('', result.stdout)

In [None]:
#|export
def chunk_text(text):
    """Split `text` into pieces with Chonkie's RecursiveChunker.

    A chunker configured with the "gpt2" tokenizer, a chunk size of 512 and a
    minimum of 12 characters per chunk is built for the call.

    Returns:
        list[str]: the `.text` of every chunk the chunker produced, in order.
    """
    splitter = RecursiveChunker(tokenizer="gpt2", chunk_size=512, min_characters_per_chunk=12)
    pieces = []
    for piece in splitter.chunk(text):
        pieces.append(piece.text)
    return pieces

In [None]:
#|export
_EMBEDDING_MODEL_NAME = "minishlab/M2V_base_output"
# Loaded models keyed by name, so the model is loaded once per process
# instead of once per call (indexing calls get_embeddings for every page).
_embedding_models = {}

def _load_embedding_model(name=_EMBEDDING_MODEL_NAME):
    """Return the Model2Vec model called `name`, loading it on first use only."""
    if name not in _embedding_models:
        _embedding_models[name] = StaticModel.from_pretrained(name)
    return _embedding_models[name]

def get_embeddings(texts):
    """Embed `texts` with the Model2Vec model "minishlab/M2V_base_output".

    Args:
        texts: the strings to embed (callers in this module pass a list).

    Returns:
        list[list[float]]: one vector per input text, converted with
        `.tolist()`. An empty list/tuple yields [] without touching the model.
    """
    if isinstance(texts, (list, tuple)) and not texts:
        return []
    vectors = _load_embedding_model().encode(texts)
    return [vector.tolist() for vector in vectors]

In [None]:
#|export
def init_db(db_path="man_index.lance"):
    """Return the connection that `lancedb.connect` gives for `db_path`."""
    connection = lancedb.connect(db_path)
    return connection

def create_chunks_table(db):
    """Create the "man_chunks" table on `db` in overwrite mode and return it.

    The table has three string columns (title, section, chunk) and a
    `vector` column declared as a float32 list of size 256.
    """
    columns = [
        pa.field("title", pa.string()),
        pa.field("section", pa.string()),
        pa.field("chunk", pa.string()),
        pa.field("vector", pa.list_(pa.float32(), 256)),
    ]
    chunk_schema = pa.schema(columns)
    return db.create_table("man_chunks", schema=chunk_schema, mode="overwrite")

In [None]:
#|export
def index_cmd(db_path="man_index.lance"):
    """Rebuild the man-page index stored at `db_path`.

    Recreates the chunks table, then for every page returned by
    `get_man_pages()` reads the page, chunks it, embeds the chunks and adds
    one row per chunk (title, section, chunk, vector) to the table. Pages
    with no text or no chunks are skipped; a failure on one page is logged
    and does not stop the remaining pages from being processed.
    """
    table = create_chunks_table(init_db(db_path))

    pages = get_man_pages()
    logging.info(f"Found {len(pages)} man pages to index")

    for page in pages:
        try:
            logging.info(f"Processing {page['title']}({page['section']})")

            text = read_man_page(page['path'])
            chunks = chunk_text(text) if text else None
            if not chunks:
                continue

            # One row per (chunk, vector) pair, all tagged with the page identity.
            data = []
            for chunk, vector in zip(chunks, get_embeddings(chunks)):
                data.append({
                    "title": page['title'],
                    "section": page['section'],
                    "chunk": chunk,
                    "vector": vector,
                })
            if not data:
                continue

            table.add(data)
            logging.info(f"Added {len(data)} chunks for {page['title']}({page['section']})")
        except Exception as e:
            logging.error(f"Failed to process {page['title']}({page['section']}): {str(e)}")

In [None]:
#|export
def query_man_pages(query, top_k=5, db_path="man_index.lance"):
    """Return the indexed man-page chunks closest to `query`.

    Opens the "man_chunks" table of the database at `db_path`, embeds the
    query and runs a vector search limited to `top_k` rows.

    Returns:
        list[tuple]: `(title, section, chunk, _distance)` for each hit, in
        the order the search returned them.
    """
    table = init_db(db_path).open_table("man_chunks")

    # Same model and conversion as the indexing side: encode, then .tolist().
    query_vector = get_embeddings([query])[0]

    hits = table.search(query_vector).limit(top_k).to_list()
    return [(hit["title"], hit["section"], hit["chunk"], hit["_distance"]) for hit in hits]

In [None]:
#|export
def search_cmd(query, top_k=5, db_path="man_index.lance"):
    """Search the man-page index for `query` and print every hit.

    Opens the "man_chunks" table of the database at `db_path`, embeds the
    query via `get_embeddings` (the same helper used when indexing) and runs
    a vector search limited to `top_k` rows. For each hit the page identity,
    the chunk text and the search distance are printed.

    Returns:
        list[dict]: the raw result rows as returned by the search.
    """
    db = init_db(db_path)
    table = db.open_table("man_chunks")

    # Reuse the shared embedding helper instead of duplicating model code.
    query_vector = get_embeddings([query])[0]

    results = table.search(query_vector).limit(top_k).to_list()

    for result in results:
        print(f"\n=== {result['title']}({result['section']}) ===")
        print(result['chunk'])
        # `_distance` is a distance, not a similarity: smaller means closer,
        # so it must not be labelled as a similarity score.
        print(f"Distance (lower is more similar): {result['_distance']}")

    return results

In [None]:
# Test the implementation end to end on a handful of pages.
# A throwaway database path is used so that running this cell does not
# overwrite a real index stored at the default "man_index.lance" location.
import tempfile

test_db_path = os.path.join(tempfile.mkdtemp(), "man_index.lance")

pages = get_man_pages()[:3]  # Test with first 3 pages
print(f"Testing with {len(pages)} man pages")

# Initialize database
db = init_db(test_db_path)
table = create_chunks_table(db)

# Process pages
for page in pages:
    print(f"\nProcessing {page['title']}({page['section']})")
    text = read_man_page(page['path'])
    if text:
        chunks = chunk_text(text)
        if chunks:
            vectors = get_embeddings(chunks)
            data = [{
                "title": page['title'],
                "section": page['section'],
                "chunk": chunk,
                "vector": vector
            } for chunk, vector in zip(chunks, vectors)]
            table.add(data)
            print(f"Added {len(data)} chunks")

# Test querying (against the same throwaway database)
print("\nTesting search...")
query = "how to list files"
results = query_man_pages(query, db_path=test_db_path)
for title, section, chunk, score in results[:2]:
    print(f"\n=== {title}({section}) ===")
    print(f"Score: {score}")
    print(chunk[:200] + "...")