# Extending an existing pipeline


In [None]:
#| default_exp tutorials.extend

In [None]:
#| hide
from nbdev.showdoc import show_doc
from IPython.display import display, HTML
%load_ext autoreload
%autoreload 2

In [None]:
#| export
# importing dependencies
import re
import pandas as pd
from pandas import DataFrame
from pathlib import Path
from spannerlib import get_magic_session,Session,Span
import ast

## IE functions and logic from previous implementations



In [None]:
#| export
from spannerlib.tutorials.basic import llm_ie,format_ie,string_schema
from spannerlib.tutorials.copilot import ast_xpath,ast_to_span,lex_concat

In [None]:
# Grab the session used by the notebook magic so that Python-side registrations
# and the `%%spannerlog` cells below operate on the same state
# (presumably a shared singleton — confirm in the spannerlib docs).
sess = get_magic_session()
# Register the IE functions imported from the earlier tutorials under the names
# the rules use (`llm`, `format`, `ast_xpath`, `ast_to_span`). The two trailing
# arguments look like the input and output schemas — TODO confirm against spannerlib.
sess.register('llm',llm_ie,[str,str],[str])
sess.register('format', format_ie, string_schema,[str])
sess.register('ast_xpath',ast_xpath,[(str,Path,Span),str],[ast.AST])
sess.register('ast_to_span',ast_to_span,[(str,Span,Path),ast.AST],[Span])
# `lex_concat` goes through `register_agg` instead of `register`: the rules use it
# inside rule heads (e.g. `Mentions(lex_concat(...), ...)`) to combine many values into one.
sess.register_agg('lex_concat',lex_concat,[(str,Span)],[str])

In [None]:
# The source file the pipeline analyses.
code_file = Path('copilot_data/example_code.py')

# Single-row, single-column DataFrames: one Span built from the file alone and one
# Span built with the offsets 16 and 17, which stands in for the user's cursor.
example_files = pd.DataFrame([(Span(code_file),)])
cursors =pd.DataFrame([(Span(code_file,16,17),)])

# Make the DataFrames available to the rules as the `Files` and `Cursors` relations.
sess.import_rel('Files',example_files)
sess.import_rel('Cursors',cursors)

# Prompt template with two `{}` slots; the `DocumentFunctionPrompt` rule fills them
# with the collected mentions (context) and the source of the function to document.
func_document_prompt = """system: based on the following context:
{}
Explain the following function:
{}
In the format of a doc string.
"""
# Expose the template to spannerlog, where it is referenced as `$func_document_prompt`.
sess.import_var('func_document_prompt',func_document_prompt)

In [None]:
# Drop every rule currently in the session before (re)defining the pipeline below —
# presumably so that re-running the notebook does not stack duplicate rules.
sess.remove_all_rules()

In [None]:
%%spannerlog
# FuncDefSpan: for every imported file, the span and name of each `FunctionDef` AST node.
FuncDefSpan(span,name)<-\
    Files(text),\
    ast_xpath(text, "//FunctionDef")->(node),\
    ast_to_span(text,node)->(span),\
    expr_eval("{0}.name",node)->(name)

# FuncCallSpan: the span of each called function's `Name` node; the name is the span's text.
FuncCallSpan(span,name)<-\
    Files(text),\
    ast_xpath(text, "//Call/func/Name")->(node),\
    ast_to_span(text,node)->(span),\
    as_str(span)->(name)

# CallingFunc: the name of the function definition whose span contains the cursor.
CallingFunc(cursor,name)<-\
    Cursors(cursor),\
    FuncDefSpan(span,name),\
    span_contained(cursor,span)->(True)

# Mentions: per called function name, the concatenated source of every function
# definition that contains a call to it.
Mentions(lex_concat(caller_span),called_name)<-\
    FuncCallSpan(called_span,called_name),\
    FuncDefSpan(caller_span,caller_name),\
    span_contained(called_span,caller_span)->(True)

# Model name handed to the `llm` IE function as `$model`.
model = 'gpt-3.5-turbo'
# DocumentFunctionPrompt: fill the prompt template with the mentions of the function
# under the cursor (context) and that function's own source text.
DocumentFunctionPrompt(cursor,prompt)<-\
    CallingFunc(cursor,name),\
    Mentions(mentions,name),\
    FuncDefSpan(def_span,name),\
    as_str(def_span)->(def_string),\
    format($func_document_prompt,mentions,def_string)->(prompt)

# DocumentFunction: send the prompt to the LLM and keep its answer per cursor.
DocumentFunction(cursor,answer)<-\
    DocumentFunctionPrompt(cursor,prompt),\
    llm($model,prompt)->(answer)


## Adding RAG

### Building a vector DB IE function

In [None]:
#| export
import faiss
import numpy as np
import openai
from collections import defaultdict
from openai import OpenAI
client = OpenAI()


def get_openai_embeddings(texts, model="text-embedding-ada-002"):
    """Embed `texts` with the OpenAI embeddings endpoint.

    Args:
        texts: The string(s) to embed; forwarded to the API as `input`.
        model: Name of the embedding model. The default is the model this
            tutorial was written against. Callers that switch models must make
            sure any index they feed uses that model's output dimension.

    Returns:
        A NumPy array with one row per embedding in the API response.
    """
    # Uses the module-level `client` created next to the imports.
    response = client.embeddings.create(model=model, input=texts)
    return np.array([item.embedding for item in response.data])

In [None]:
#| export
class VecDB():
    """Minimal in-memory vector store with one FAISS flat L2 index per namespace.

    Every namespace owns an index and a parallel list of document strings, so
    row ``i`` of ``index_map[ns]`` always corresponds to ``doc_map[ns][i]``.
    Embeddings are produced by `get_openai_embeddings`.
    """

    def __init__(self, dim=1536):
        """Create an empty store.

        Args:
            dim: Dimension of the embedding vectors. The default of 1536
                matches the output size of ``text-embedding-ada-002``.
        """
        self.index_map={}# namespace: index
        self.doc_map=defaultdict(list)# namespace: list of docs
        self.dim = dim

    def add_index(self,namespace):
        """Create a fresh, empty index for `namespace`.

        If the namespace already exists, its stored documents are discarded as
        well; otherwise the document list would no longer line up with the
        rows of the new, empty index.
        """
        self.index_map[namespace] = faiss.IndexFlatL2(self.dim)
        self.doc_map[namespace] = []

    def add_docs(self,documents,namespace='default'):
        """Embed `documents` and append them to `namespace`.

        Args:
            documents: Iterable of documents; each one is converted with
                `str()` before embedding. A single string is treated as one
                document rather than as an iterable of characters.
            namespace: Target namespace; created on first use.
        """
        if namespace not in self.index_map:
            self.add_index(namespace)
        if isinstance(documents, str):
            documents = [documents]
        documents = [str(doc) for doc in documents]
        if not documents:
            # Nothing to embed; avoid sending an empty request to the API.
            return
        embeddings = get_openai_embeddings(documents)
        self.index_map[namespace].add(embeddings.astype('float32'))
        self.doc_map[namespace].extend(documents)

    def search(self, query, k=1,namespace='default'):
        """Return up to `k` stored documents closest to `query`.

        Args:
            query: Text to search for; converted with `str()` so that
                non-string inputs (e.g. spans) are handled like in `add_docs`.
            k: Maximum number of results.
            namespace: Namespace to search.

        Returns:
            A list of ``(document, distance)`` tuples in the order FAISS
            returns them (closest first). The list is shorter than `k` when
            the namespace holds fewer documents, and empty when it holds none
            or when `k` is not positive.

        Raises:
            KeyError: If `namespace` has no index.
        """
        # Look the index up first so an unknown namespace fails before any API call.
        index = self.index_map[namespace]
        documents = self.doc_map[namespace]
        if k <= 0 or not documents:
            return []
        # Shape (1, dim): a single query row, as FAISS expects a 2-D batch.
        query_embedding = get_openai_embeddings([str(query)])
        D, I = index.search(query_embedding.astype('float32'), k)
        # FAISS pads missing neighbours with label -1 when k exceeds the number of
        # stored vectors; skip those instead of silently indexing documents[-1].
        return [
            (documents[i], float(d))
            for d, i in zip(D[0], I[0])
            if 0 <= i < len(documents)
        ]

In [None]:
# Small toy corpus for trying out VecDB before wiring it into the pipeline.
documents = [
    "FAISS is a library for efficient similarity search.",
    "Vector databases are crucial for RAG pipelines.",
    "FAISS was developed by Facebook AI Research.",
    "RAG combines retrieval and generation for better results."
]

In [None]:
# One shared store; it is later registered with the session as `vector_search`.
db=VecDB()

In [None]:
# No namespace given, so the toy corpus lands in the 'default' namespace.
db.add_docs(documents)

In [None]:
# Ask for 4 neighbours, i.e. the whole toy corpus; the result is a list of
# (document, distance) pairs where a smaller distance means a closer match.
db.search("RAG?",4)

[('RAG combines retrieval and generation for better results.',
  0.22323353588581085),
 ('Vector databases are crucial for RAG pipelines.', 0.3760342001914978),
 ('FAISS was developed by Facebook AI Research.', 0.5168014168739319),
 ('FAISS is a library for efficient similarity search.', 0.5336617231369019)]

In [None]:
# Expose `db.search` to spannerlog as `vector_search`:
# (query, k, namespace) -> one (document, distance) row per hit.
sess.register('vector_search',db.search,[(str,Span),int,str],[str,float])

### Adding Stack Overflow posts to the vector DB

In [None]:
# Read the whole posts file and cut it into individual posts on the literal
# marker 'DELIM'.
docs = Path('copilot_data/stackoverflow_posts.txt').read_text().split('DELIM')
# Trim the whitespace left around each post by the split.
docs = [doc.strip() for doc in docs]
# Bare expression: lets the notebook display the parsed posts.
docs

['1. **Use clear and concise language**\n   Always strive for clarity in your documentation. Use simple, straightforward language and provide examples:\n\n   ```python\n   def calculate_area(length, width):\n       """\n       Calculate the area of a rectangle.\n\n       :param length: The length of the rectangle\n       :param width: The width of the rectangle\n       :return: The area of the rectangle\n       """\n       return length * width\n   ```',
 '2. **Include code examples with comments**\n   Provide relevant code snippets with inline comments to explain each step:\n\n   ```javascript\n   // Function to calculate factorial\n   function factorial(n) {\n       if (n === 0 || n === 1) {\n           return 1; // Base case: 0! and 1! are 1\n       } else {\n           return n * factorial(n - 1); // Recursive case\n       }\n   }\n   ```',
 "3. **Structure your documentation with markdown**\n   Use markdown to structure your documentation for better readability:\n\n   ```markdown\

In [None]:
# Keep the posts in their own namespace, separate from the toy corpus in 'default';
# the RagContext rule searches this namespace by name.
db.add_docs(docs,namespace='stackoverflow')

### Extending our pipeline

In [None]:
%%spannerlog
?DocumentFunctionPrompt(C,P)

'?DocumentFunctionPrompt(C,P)'

Unnamed: 0,C,P
0,"[@example_code.py,16,17) ""x""","system: based on the following context: def g(x,y):  return f(x,y)**2 def method(self, y):  return f(self.x, y) Explain the following function: def f(x,y):  x+y In the format of a doc string."


In [None]:
# NOTE(review): no rule with head `Test` is defined anywhere in the visible notebook;
# this looks like cleanup left over from experimentation — confirm it is still needed.
sess.remove_head('Test')

In [None]:
%%spannerlog
# RagContext: use the documentation prompt itself as the search query, fetch up to 4
# posts from the 'stackoverflow' namespace and concatenate them into one context per cursor.
RagContext(cursor,lex_concat(context))<-\
    DocumentFunctionPrompt(cursor,prompt),\
    vector_search(prompt,4,'stackoverflow')->(context,similarity_score)
?RagContext(C,context)

'?RagContext(C,context)'

Unnamed: 0,C,context
0,"[@example_code.py,16,17) ""x""","1. **Use clear and concise language**  Always strive for clarity in your documentation. Use simple, straightforward language and provide examples:  ```python  def calculate_area(length, width):  """"""  Calculate the area of a rectangle.  :param length: The length of the rectangle  :param width: The width of the rectangle  :return: The area of the rectangle  """"""  return length * width  ``` 12. **Provide examples of input and output**  When documenting functions or APIs, include examples of expected inputs and outputs:  ```python  def square(n):  """"""  Return the square of a number.  :param n: The number to square  :return: The square of the input number  Example:  >>> square(4)  16  >>> square(-3)  9  """"""  return n ** 2  ``` 13. **Use docstrings for inline documentation**  Use docstrings to provide inline documentation:  ```python  class MyClass:  """"""  A class that represents MyClass.  Attributes:  attr1 (int): Description of attr1  attr2 (str): Description of attr2  """"""  def __init__(self, attr1, attr2):  self.attr1 = attr1  self.attr2 = attr2  def my_method(self, param1):  """"""  Description of my_method.  :param param1: Description of param1  :return: Description of return value  """"""  pass  ``` 15. **Provide context and explanations in comments**  Don't just describe what something does, explain why it's important:  ```python  # We use a cache to store expensive computation results  # This significantly improves performance for repeated calls  cache = {}  def expensive_function(n):  if n in cache:  return cache[n]  result = # ... some expensive computation  cache[n] = result  return result  ```"


In [None]:
# Outer RAG template with two `{}` slots; the `RagPrompt` rule fills them with the
# retrieved context and the original documentation prompt, in that order.
rag_prompt = """system: Based on the following context
{}
answer the following question
{}
"""

# Expose the template to spannerlog, where it is referenced as `$rag_prompt`.
sess.import_var('rag_prompt',rag_prompt)

In [None]:
%%spannerlog
# RagPrompt: wrap the original documentation prompt together with the retrieved
# context into the outer RAG template.
RagPrompt(cursor,prompt)<-\
    RagContext(cursor,context),\
    DocumentFunctionPrompt(cursor,document_prompt),\
    format($rag_prompt,context,document_prompt)->(prompt)

# RagCompletion: send the combined prompt to the LLM and keep its answer per cursor.
RagCompletion(cursor,answer)<-\
    RagPrompt(cursor,prompt),\
    llm($model,prompt)->(answer)

?RagCompletion(cursor,answer)

In [None]:
# TODO: RAG is done up to here; next, show few-shot prompting

In [None]:
# Deliberately abort a "Run All" at this point; per the TODO in the previous cell,
# the tutorial is unfinished beyond here.
raise Exception("Stop here")

In [None]:
#|hide
import nbdev; nbdev.nbdev_export()