In [None]:
#|default_exp rag
#|export
import lancedb
from chonkie import RecursiveChunker
from model2vec import StaticModel
import subprocess
import re
import logging
import os
import pyarrow as pa

logging.basicConfig(level=logging.INFO)

# RAG Support for ShellSage

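This notebook implements retrieval-augmented generation (RAG) support for ShellSage: local man pages are split into chunks, embedded, and stored in a LanceDB table for vector search.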

In [None]:
#|export
def init_db(db_path="man_index.lance"):
    """Open the LanceDB database at `db_path`, creating its parent directory first.

    A path with no directory component resolves to the current directory.
    Returns whatever `lancedb.connect` returns for `db_path`.
    """
    parent_dir = os.path.dirname(db_path) or '.'
    os.makedirs(parent_dir, exist_ok=True)
    return lancedb.connect(db_path)

In [None]:
#|export
import lancedb
from chonkie import RecursiveChunker
from model2vec import StaticModel
import subprocess
import re
import logging
import os

logging.basicConfig(level=logging.INFO)

In [None]:
#|export
def _find_man_path(name, section):
    """Return the existing on-disk path `man -w` reports for a page, or "" if none."""
    lookups = [['man', '-w', name]]
    if section:
        # Try the section-qualified form first; keep the name-only form as a fallback.
        lookups.insert(0, ['man', '-w', section, name])

    for cmd in lookups:
        try:
            # stderr is discarded so man's per-page error messages don't flood the output.
            candidate = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, text=True).strip()
        except subprocess.CalledProcessError:
            continue
        if os.path.exists(candidate):
            return candidate
    return ""


def get_man_pages():
    """Get all available man pages on the system.

    Runs ``apropos -l .`` to enumerate entries, then resolves each one to a
    file with ``man -w``. The lookup is qualified with the entry's section, so
    a name that exists in several sections (e.g. ``printf(1)`` and
    ``printf(3)``) maps to the file for its own section instead of whichever
    page ``man`` finds first. If the section-qualified lookup fails, the
    name-only lookup is used as a fallback.

    Returns:
        list[dict]: One ``{'title', 'section', 'path'}`` dict per entry whose
        resolved path exists on disk. Lines that cannot be parsed or resolved
        are skipped (logged at debug level when an exception is involved).
    """
    result = subprocess.run(['apropos', '-l', '.'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    pages = []

    for line in result.stdout.splitlines():
        if not line.strip(): continue
        try:
            # Expected shape: "<name> (<section>) - <description>"; a line
            # without "(" fails the unpacking below and is skipped.
            name, section = line.split('(', 1)
            section = section.split(')', 1)[0].strip()
            name = name.strip()

            path = _find_man_path(name, section)
            if path:
                pages.append({
                    'title': name,
                    'section': section,
                    'path': path
                })
        except Exception as e:
            logging.debug(f"Skipping line '{line}': {str(e)}")
            continue

    return pages

In [None]:
#|export
def read_man_page(path):
    """Read a man page and return its rendered text content.

    Args:
        path: Path of the man page file, passed as the argument to ``man``.

    Returns:
        str: The rendered page with backspace overstrike sequences removed,
        or "" if ``man`` exits with a non-zero status or cannot be run.
    """
    try:
        # errors='replace': one undecodable byte should not discard the whole page.
        result = subprocess.run(['man', path], capture_output=True, text=True, errors='replace')
        if result.returncode != 0:
            return ""
        # Some man implementations keep typewriter-style emphasis when piped
        # (char + backspace + char for bold, "_" + backspace + char for
        # underline). Dropping every "<char><backspace>" pair leaves plain text
        # and is a no-op on output that has no backspaces.
        return re.sub(r'.\x08', '', result.stdout)
    except Exception as e:
        logging.warning(f"Failed to read man page {path}: {str(e)}")
        return ""

In [None]:
#|export
_chunker = None


def _get_chunker():
    """Build the RecursiveChunker on first use and reuse it on later calls."""
    global _chunker
    if _chunker is None:
        _chunker = RecursiveChunker(
            tokenizer="gpt2",
            chunk_size=512,
            min_characters_per_chunk=12
        )
    return _chunker


def chunk_text(text):
    """Chunk text using Chonkie's RecursiveChunker.

    The chunker is created once and shared across calls, so indexing many
    pages does not rebuild it for every page.

    Args:
        text: The text to split.

    Returns:
        list[str]: The `.text` of each chunk the chunker produces, in order.
    """
    return [chunk.text for chunk in _get_chunker().chunk(text)]

In [None]:
#|export
_EMBEDDING_MODEL_NAME = "minishlab/M2V_base_output"
_embedding_model = None


def _get_embedding_model():
    """Load the Model2Vec model on first use and reuse it on later calls."""
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = StaticModel.from_pretrained(_EMBEDDING_MODEL_NAME)
    return _embedding_model


def get_embeddings(texts):
    """Get embeddings using Model2Vec.

    The model is loaded once and cached at module level; loading it on every
    call would repeat that work for each page during indexing.

    Args:
        texts: The texts to embed, passed to the model's `encode`.

    Returns:
        list: One entry per encoded vector, each converted with `.tolist()`.
    """
    vectors = _get_embedding_model().encode(texts)
    return [vector.tolist() for vector in vectors]

In [None]:
#|export
def create_chunks_table(db, vector_dim=256):
    """Create or replace the (empty) man page chunks table.

    The table is defined by an explicit schema instead of a placeholder row,
    so it starts empty and searches cannot return a dummy zero-vector entry.

    Args:
        db: An open LanceDB connection.
        vector_dim: Length of the embedding vectors stored in the `vector`
            column; must match the output size of the embedding model in use.

    Returns:
        The table object returned by `db.create_table` for `man_chunks`.
    """
    schema = pa.schema([
        pa.field("title", pa.string()),
        pa.field("section", pa.string()),
        pa.field("chunk", pa.string()),
        # Fixed-size float32 list: the column type used for vector search.
        pa.field("vector", pa.list_(pa.float32(), vector_dim)),
    ])
    return db.create_table("man_chunks", schema=schema, mode="overwrite")

In [None]:
#|export
def index_man_pages(db_path="man_index.lance"):
    """Build the man page vector index stored at `db_path`.

    Opens the database, recreates the chunks table, then reads, chunks and
    embeds every discovered page and appends one row per chunk. A page with no
    text or no chunks is skipped with a warning; a page that raises is logged
    as an error and skipped so the remaining pages are still processed.
    """
    table = create_chunks_table(init_db(db_path))

    all_pages = get_man_pages()
    logging.info(f"Found {len(all_pages)} man pages to index")

    for entry in all_pages:
        try:
            label = f"{entry['title']}({entry['section']})"
            logging.info(f"Processing {label}")

            content = read_man_page(entry['path'])
            if not content:
                logging.warning(f"Empty content for {label}")
                continue

            pieces = chunk_text(content)
            if not pieces:
                logging.warning(f"No chunks created for {label}")
                continue

            embeddings = get_embeddings(pieces)

            rows = []
            for piece, embedding in zip(pieces, embeddings):
                rows.append({
                    "title": entry['title'],
                    "section": entry['section'],
                    "chunk": piece,
                    "vector": embedding
                })

            if not rows:
                continue
            table.add(rows)
            logging.info(f"Added {len(rows)} chunks for {label}")
        except Exception as e:
            logging.error(f"Failed to process {entry['title']}({entry['section']}): {e!s}")
            continue

In [None]:
# Smoke test of the implementation: discover pages, then run one page through
# read -> chunk -> embed, and finally build a throwaway index on disk.
pages = get_man_pages()
print(f"Found {len(pages)} man pages")

if pages:
    # Show section distribution (number of discovered pages per manual section)
    sections = {}
    for page in pages:
        sections[page['section']] = sections.get(page['section'], 0) + 1
    print("\nSection distribution:")
    for section, count in sorted(sections.items()):
        print(f"Section {section}: {count} pages")
    
    # Test with first page
    test_page = pages[0]
    print(f"\nTesting with: {test_page['title']}({test_page['section']})")
    text = read_man_page(test_page['path'])
    if text:
        print(f"Content length: {len(text)} chars")
        chunks = chunk_text(text)
        print(f"Created {len(chunks)} chunks")
        if chunks:
            print(f"Sample chunk: {chunks[0][:100]}...")
            # Embed only the first chunk; this is enough to report the vector size.
            vectors = get_embeddings(chunks[:1])
            print(f"Vector size: {len(vectors[0])}")
            
            # Test database creation
            test_db = "test_man_index.lance"
            # Remove any index left over from a previous run so the check below is meaningful.
            if os.path.exists(test_db):
                import shutil
                shutil.rmtree(test_db)
            # NOTE: index_man_pages calls get_man_pages itself, so this indexes every
            # discovered page (not just test_page) and may be slow on systems with many pages.
            index_man_pages(test_db)
            print("\nVerifying database creation:", os.path.exists(test_db))