In [None]:
#|default_exp rag

# ShellSage Retrieval Augmented Generation

## Imports

In [None]:
#| export
from chonkie import SentenceChunker
from fastcore.all import *
from fastprogress.fastprogress import progress_bar
from lancedb import connect
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import LinearCombinationReranker
from lancedb.table import LanceTable
from pathlib import Path
from sentence_transformers import SentenceTransformer
from subprocess import check_output as co

import os, subprocess
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [None]:
import random

## Database setup

In [None]:
#| export
# The LanceDB files live in the per-user cache folder: ~/.cache/shell_sage/db
db_path = Path.home().joinpath('.cache', 'shell_sage', 'db')
# Create the folder and any missing parents; a no-op when it already exists.
db_path.mkdir(exist_ok=True, parents=True)

In [None]:
#| export
# Sentence-based splitter shared by the indexing code; consecutive chunks
# overlap and every chunk holds at least one sentence.
chunker = SentenceChunker(
    tokenizer="gpt2",
    chunk_size=2_048,
    chunk_overlap=256,
    min_sentences_per_chunk=1,
)

In [None]:
#| export
# Embedding model used for both indexing and querying.
model = SentenceTransformer('thenlper/gte-small')
# Embed one throwaway sentence and read the vector width off the last axis
# of the resulting (1, width) array.
ndim = (
    model.encode(["Example sentence"])
    .shape[-1]
)

In [None]:
# Show the embedding width measured from the model.
ndim

384

In [None]:
#| export
class EmbeddingTable(LanceModel):
    "Row schema of the `man_pages` table: one embedded chunk of a man page"
    # text of the chunk; the full-text index is built on this column
    content: str
    # command whose man page the chunk was taken from
    package_name: str
    # embedding of the chunk text, fixed at `ndim` dimensions
    embedding: Vector(ndim)

In [None]:
#| export
# Connect to the LanceDB database kept at `db_path`.
db = connect(db_path)
tbl = None
try:
    # Usual case: the table already exists from an earlier run.
    tbl = db.open_table("man_pages")
except ValueError:
    # Opening failed, so create an empty table from the schema ...
    tbl = db.create_table("man_pages", mode="create", schema=EmbeddingTable)
    # ... and give it a full-text index on `content` (for hybrid search).
    tbl.create_fts_index("content")

In [None]:
# List the contents of the database folder (`Path.ls` comes from fastcore, not pathlib).
db_path.ls()

(#1) [Path('/Users/nathan/.cache/shell_sage/db/man_pages.lance')]

## Man pages

In [None]:
#| export
def _section(cmd, section):
    s = co(f'man {cmd} | col -b | sed -n "/^{section}/,/^[A-Z]/p" | sed "$d"',
              shell=True, stderr=subprocess.DEVNULL, text=True).strip()
    return '\n'.join(s.splitlines()[:-1]).strip()

In [None]:
# Preview the first 128 characters of two sections of the `ls` man page.
for heading in ('SYNOPSIS', 'DESCRIPTION'):
    print(_section('ls', heading)[:128])

SYNOPSIS
     ls [-@ABCFGHILOPRSTUWabcdefghiklmnopqrstuvwxy1%,] [--color=when]
	[-D format] [file ...]
DESCRIPTION
     For each operand that names a file of a type other than directory, ls
     displays its name as well as any req


In [None]:
#| export
def _get_page(cmd):
    "Return `(cmd, text)` where `text` joins the NAME, SYNOPSIS, DESCRIPTION and EXAMPLES sections for `cmd`"
    headings = ('NAME', 'SYNOPSIS', 'DESCRIPTION', 'EXAMPLES')
    # One `_section` lookup per heading, kept in the order used in the text.
    parts = [_section(cmd, heading) for heading in headings]
    # Sections are separated by a blank line; the outer strip leaves ''
    # when every section came back empty.
    return cmd, '\n\n'.join(parts).strip()

In [None]:
# Build the page once and slice it twice: every `_get_page` call runs `man`
# once per section, so fetching the page again for the second slice is
# wasted work.
ls_page = _get_page('ls')[1]
print(ls_page[:256])
print('...')
print(ls_page[-512:-256])

NAME
     ls – list directory contents

SYNOPSIS
     ls [-@ABCFGHILOPRSTUWabcdefghiklmnopqrstuvwxy1%,] [--color=when]
	[-D format] [file ...]

DESCRIPTION
     For each operand that names a file of a type other than directory, ls
     displays its name as
...
used in conjunction with the -l option.

EXAMPLES
     List the contents of the current working directory in long format:

	   $ ls -l

     In addition to listing the contents of the current working directory in
     long format, show inode numbers, file 


In [None]:
#| export
def _manpages(lim=None):
    "Return an iterator over two tuples, `(cmds, pages)`, built from section 1 man pages; at most `lim` commands are read"
    # Ask `apropos` for section 1 entries matching the pattern `.`.
    lines = L(co(['apropos', '-s', '1', '.'], text=True).strip().splitlines())
    # The command name is whatever precedes the first "(" on each line.
    lines = lines.map(lambda s: s.split("(")[0].strip())
    cmds = lines.filter(lambda s: s).unique()[:lim]
    # Fetch pages in parallel and drop the ones whose text came back empty.
    pages = parallel(_get_page, cmds, progress=progress_bar).filter(lambda x: x[1])
    # `zip(*[])` yields nothing, which would make `cmds, pages = _manpages()`
    # fail while unpacking; hand back two empty tuples instead.
    if len(pages) == 0: return iter(((), ()))
    return zip(*pages)

In [None]:
# Sample run: read at most 128 commands. Pages whose text comes back empty
# are filtered out, so fewer than 128 may remain.
cmds, pages = _manpages(lim=128)
len(pages)

127

In [None]:
# Pick a random page to eyeball. `randrange` excludes its upper bound, whereas
# `randint(0, len(pages))` can return `len(pages)` itself and raise IndexError.
rid = random.randrange(len(pages))
cmd, page = cmds[rid], pages[rid]
print(cmd)
print(page[:128])
print('...')
print(page[-512:])

git-p4
NAME
       git-p4 - Import from and submit to Perforce repositories

SYNOPSIS
       git p4 clone [<sync-options>] [<clone-opti
...
 does a sync plus
       rebases the current branch onto the updated p4 remote branch.

EXAMPLES
       •   Clone a repository:

	       $ git p4 clone //depot/path/project


       •   Do some work in the newly created Git repository:

	       $ cd project
	       $ vi foo.h
	       $ git commit -a -m "edited foo.h"


       •   Update the Git repository with recent changes from p4, rebasing
	   your work on top:

	       $ git p4 rebase


       •   Submit your commits back to p4:

	       $ git p4 submit


## Indexing

In [None]:
#| export
def index_manpages(cmds, pages):
    "Chunk every page, embed the chunks and add them to `tbl`, each row tagged with its command from `cmds`"
    chunked_pages = chunker.chunk_batch(pages)
    for name, page_chunks in zip(cmds, chunked_pages):
        texts = [chunk.text for chunk in page_chunks]
        # Embed all chunks of one page in a single call.
        vectors = model.encode(texts)
        rows = []
        for text, vector in zip(texts, vectors):
            rows.append(EmbeddingTable(content=text, package_name=name, embedding=vector))
        # One insert per page.
        tbl.add(rows)

In [None]:
# Index the sample pages, then show how many rows the table holds.
# NOTE(review): `index_manpages` calls `tbl.add`, so re-running this cell
# presumably inserts the same chunks again — confirm before re-executing.
index_manpages(cmds, pages)
len(tbl)

🦛 choooooooooooooooooooonk 100% • 127/127 docs chunked [00:00<00:00, 1093.21doc/s] 🌱


140

In [None]:
#| export
# Reranker applied to every hybrid query issued by `search` below.
rerank = LinearCombinationReranker(weight=0.75)

@patch
@delegates(LanceTable.search)
def search(self:LanceTable, q: str = None, limit: int = 2, threshold: float=0.5, **kwargs):
    "Hybrid search for `q`: return up to `limit` reranked rows whose `score` is above `threshold`"
    q_emb = model.encode([q])
    # `_orig_search` is presumably the original `LanceTable.search` that
    # fastcore's `@patch` keeps around when it overrides the method.
    query = self._orig_search(query_type='hybrid', **kwargs)
    # The same text feeds the full-text side and (as an embedding) the vector side.
    query = query.text(q).vector(q_emb)
    query = query.metric('cosine').rerank(rerank).limit(limit)
    hits = query.to_pandas().rename(columns={'_relevance_score': 'score'})
    # Keep only rows scoring strictly above the threshold.
    keep = hits.score > threshold
    return hits[keep]

In [None]:
# Try a natural-language query. Up to 8 hits are requested; rows whose score
# is not above the default threshold (0.5) are filtered out of the result.
df = tbl.search('How can I change my current branch?', limit=8)
df.head()

Unnamed: 0,content,package_name,embedding,score
0,\t Note the quotes around *.c. The file hell...,git-checkout,"[-0.041774247, -0.044317152, 0.0674934, 0.0194...",1.0
1,NAME\n git-checkout - Switch branches or...,git-checkout,"[-0.048017204, -0.049874607, 0.05528505, -0.00...",0.955922
2,"Thus you can, e.g., turn a library subd...",git-filter-branch,"[-0.043320876, -0.013584119, 0.04771582, 0.008...",0.64205
3,not to fetch them again. See also the p...,git-branch,"[-0.061778784, -0.031455133, 0.05729562, -0.01...",0.505731


In [None]:
# Inspect the second hit: the start of its text and the command it belongs to.
doc = df.iloc[1]
print(doc.content[:512])
print(doc.package_name)

NAME
       git-checkout - Switch branches or restore working tree files

SYNOPSIS
       git checkout [-q] [-f] [-m] [<branch>]
       git checkout [-q] [-f] [-m] --detach [<branch>]
       git checkout [-q] [-f] [-m] [--detach] <commit>
       git checkout [-q] [-f] [-m] [[-b|-B|--orphan] <new-branch>] [<start-point>]
       git checkout [-f|--ours|--theirs|-m|--conflict=<style>] [<tree-ish>] [--] <pathspec>...
       git checkout [-f|--ours|--theirs|-m|--conflict=<style>] [<tree-ish>] --pathspec-from-fil
git-checkout


## CLI

In [None]:
#| export
@call_parse
def index():
    "Index man pages for RAG"
    # Gather every available page (no limit), then chunk, embed and store them.
    names, texts = _manpages()
    index_manpages(names, texts)