# Code Documentation Task


In [None]:
#| default_exp tutorials.copilot

In [None]:
#| hide
from nbdev.showdoc import show_doc
from IPython.display import display, HTML
%load_ext autoreload
%autoreload 2

In [None]:
#| export
# importing dependencies
import re
import pandas as pd
from pandas import DataFrame
from pathlib import Path
from spannerlib import get_magic_session,Session,Span


In this tutorial we will show how to augment LLM context using formal, rule-based information extraction (IE) techniques, by presenting a Copilot-like code documentation pipeline.

Normally, when running a model like Copilot in an IDE like PyCharm or VS Code, the model is given the last $k$ files in our editor history as context for the code completion.

In this tutorial, we will see how we can use information extraction of spans of code, based on the structure of the code, to improve the context given to the LLM.
First, let us get the `llm` and `format` IE functions from the basic tutorial.

In [None]:
#| export
from spannerlib.tutorials.basic import llm_ie,format_ie,string_schema

In [None]:
# Obtain the session used by the %%spannerlog cells below.
sess = get_magic_session()
# Register the IE functions from the basic tutorial under the names used in the rules.
# NOTE(review): the last two arguments look like the input and output schemas
# (e.g. `llm` takes two strings and yields one string) — confirm against spannerlib.
sess.register('llm',llm_ie,[str,str],[str])
sess.register('format', format_ie, string_schema,[str])


In order to analyze the structure of the code, we will be using python's `ast` module.
We will write a very generic IE function that gets a piece of code and an XPath query string, and returns the spans of all matches of the query over the AST of the given code.

To do so we will use the `pyastgrep` library, which allows us to look for XPath matches in Python ASTs.
We will write a modified version of its main function that returns Spans of the AST nodes.

In [None]:
#| export
# ! pip install pyastgrep
import ast
from functools import cache
from pyastgrep.search import search_python_files,Match
from pyastgrep.asts import ast_to_xml
from lxml import etree


In [None]:
#| export
@cache
def py_to_xml(py: str) -> tuple:
    """Parse Python source and build the XML view of its AST.

    Args:
        py: Python source code as a string.

    Returns:
        A 3-tuple ``(xml_tree, ast_tree, node_mappings)``:
        ``xml_tree`` is the value returned by pyastgrep's ``ast_to_xml``,
        ``ast_tree`` is the result of ``ast.parse(py)``, and
        ``node_mappings`` is the dict that ``ast_to_xml`` was given to fill in
        (``ast_xpath`` uses it to look up the AST node for an XML match).

    Raises:
        SyntaxError: if ``py`` is not valid Python (raised by ``ast.parse``).

    Note:
        The result is memoized per source string, so every caller passing the
        same source receives the *same* tree and dict objects; treat them as
        read-only.
    """
    ast_tree = ast.parse(py)
    # ast_to_xml populates this dict in place while it builds the XML tree.
    node_mappings = {}
    xml_tree = ast_to_xml(ast_tree, node_mappings)
    return xml_tree, ast_tree, node_mappings

def xml_to_string(xml_tree):
    """Serialize an lxml tree to pretty-printed text, decoded as UTF-8."""
    serialized = etree.tostring(xml_tree, pretty_print=True)
    return serialized.decode('utf-8')

def print_file_xml(file_path):
    """Print the pretty-printed XML form of the AST of the Python file at `file_path`."""
    source = Path(file_path).read_text()
    # py_to_xml returns (xml_tree, ast_tree, node_mappings); only the XML tree is needed.
    xml_tree = py_to_xml(source)[0]
    print(xml_to_string(xml_tree))


def ast_to_string(ast_tree):
    """Return the unparsed source of an AST node; return any other value unchanged.

    Non-node values (for example the strings produced by attribute queries
    such as ``//FunctionDef/@name``) are passed through as-is.
    """
    return ast.unparse(ast_tree) if isinstance(ast_tree, ast.AST) else ast_tree

def ast_xpath(py_str,xpath_query):
    """Run an XPath query over the AST of some Python code.

    Args:
        py_str: the code to query, given as a source string, a ``Path`` to a
            file (its text is read), or a ``Span`` (converted with ``str``).
        xpath_query: XPath expression evaluated on the XML form of the AST.

    Returns:
        A list with one entry per XPath match: the corresponding AST node when
        the match is a key of the node mapping built by ``py_to_xml``,
        otherwise the raw match itself.
    """
    source = py_str
    if isinstance(source, Path):
        source = source.read_text()
    if isinstance(source, Span):
        source = str(source)

    xml_tree, _, node_mappings = py_to_xml(source)

    results = []
    for hit in xml_tree.xpath(xpath_query):
        # Matches without a mapped AST node are returned untouched.
        results.append(node_mappings.get(hit, hit))
    return results

In [None]:
# Load the example source file used throughout the rest of the tutorial.
code_file = Path('copilot_data/example_code.py')
code_text = code_file.read_text()
# Show the raw source text as the cell output.
code_text

'def f(x,y):\n    x+y \n\ndef g(x,y):\n    return f(x,y)**2\n\nclass A:\n    def __init__(self, x):\n        self.x = x\n    def method(self, y):\n        return f(self.x, y)\n\nprint(f(2,3))'

In [None]:
# Parse the example once: XML tree, AST, and the node mapping filled in by ast_to_xml.
x_t,a_t,n_m = py_to_xml(code_text)
# print_file_xml(code_file)

In [None]:
print(ast_to_string(a_t))

def f(x, y):
    x + y

def g(x, y):
    return f(x, y) ** 2

class A:

    def __init__(self, x):
        self.x = x

    def method(self, y):
        return f(self.x, y)
print(f(2, 3))


In [None]:
# Query by file path: every FunctionDef node, printed back as source code.
for match in ast_xpath(code_file,'//FunctionDef'):
    print(ast_to_string(match))

# Query by source text: the @name attribute matches are not AST nodes,
# so ast_to_string passes them through unchanged.
for match in ast_xpath(code_text,'//FunctionDef/@name'):
    print(ast_to_string(match))


def f(x, y):
    x + y
def g(x, y):
    return f(x, y) ** 2
def __init__(self, x):
    self.x = x
def method(self, y):
    return f(self.x, y)
f
g
__init__
method


In [None]:
#| export
@cache
def _get_lines(path):
    if isinstance(path,Path):
        tuple(path.read_text().split('\n'))
    else:
        return tuple(path.split('\n'))

def get_character_position(path, line_number, column_offset):
    """Convert a (line, column) location into an absolute character offset.

    Args:
        path: the text (or path) handed to ``_get_lines`` to obtain the lines.
        line_number: 1-based line number.
        column_offset: 0-based offset within that line; may equal the line
            length (position just past the last character).

    Returns:
        The 0-based offset of the location from the start of the text.

    Raises:
        ValueError: if the line number or the column offset is out of range.
    """
    all_lines = _get_lines(path)
    if not 1 <= line_number <= len(all_lines):
        raise ValueError("Invalid line number")
    if not 0 <= column_offset <= len(all_lines[line_number - 1]):
        raise ValueError("Invalid column offset")
    preceding = all_lines[:line_number - 1]
    # Each preceding line contributes its own length plus one newline character.
    return sum(map(len, preceding)) + len(preceding) + column_offset

def _utf8_col_to_char_col(line, byte_col):
    """Convert a UTF-8 byte offset within `line` into a character offset."""
    if line.isascii():
        # One byte per character: the two offsets coincide.
        return byte_col
    encoded = line.encode('utf-8')
    if byte_col < 0 or byte_col > len(encoded):
        # Leave out-of-range values untouched so the caller's validation rejects them.
        return byte_col
    return len(encoded[:byte_col].decode('utf-8', errors='ignore'))


def ast_to_span(string,node):
    """Locate an AST node inside its source text and return it as a Span.

    Args:
        string: the source the node was parsed from. A ``Path`` is read from
            disk and its file name is used as the span name; any other value
            is used as the text directly (via ``str``) and the span is unnamed.
        node: an AST node carrying ``lineno`` and ``col_offset``.

    Returns:
        A one-element list holding the ``Span`` over ``[start, end)`` of the
        node in the text.

    Raises:
        ValueError: if the node's line or column lies outside the text
            (raised by ``get_character_position``).
    """
    if isinstance(string,Path):
        text = string.read_text()
        name = string.name
    else:
        text = string
        name = None
    source = str(text)
    lines = _get_lines(source)

    def position(lineno, byte_col):
        # The ast module reports columns as UTF-8 byte offsets, while spans are
        # indexed by character; convert before computing the absolute offset.
        if 1 <= lineno <= len(lines):
            byte_col = _utf8_col_to_char_col(lines[lineno - 1], byte_col)
        return get_character_position(source, lineno, byte_col)

    start = position(node.lineno, node.col_offset)
    end_lineno = getattr(node, 'end_lineno', None)
    end_col_offset = getattr(node, 'end_col_offset', None)
    if end_lineno is not None and end_col_offset is not None:
        end = position(end_lineno, end_col_offset)
    else:
        # No end position recorded: approximate it from the unparsed node length.
        end = start + len(ast.unparse(node))
    return [Span(text,start,end,name=name)]

In [None]:
# Take the first FunctionDef node of the example to demonstrate ast_to_span.
matches = ast_xpath(code_text,'//FunctionDef')
m = matches[0]

In [None]:
# ast_to_span returns a one-element list; unwrap it and show the span next to its text.
span = ast_to_span(code_file,m)[0]
span,str(span)

([@example_code.py,0,19) "def f(x,y)...", 'def f(x,y):\n    x+y')

In [None]:
# Register the two AST IE functions so the spannerlog rules below can call them by name.
# NOTE(review): a tuple such as (str,Path,Span) presumably means "any of these types"
# is accepted for that argument — confirm against spannerlib's register documentation.
sess.register('ast_xpath',ast_xpath,[(str,Path,Span),str],[ast.AST])
sess.register('ast_to_span',ast_to_span,[(str,Span,Path),ast.AST],[Span])

In [None]:
code_file

Path('copilot_data/example_code.py')

In [None]:
# One-row, one-column table holding a Span built from the example file
# (displayed below as covering the whole file).
example_files = pd.DataFrame(
    [(Span(code_file),)]
)
# Show the spans through their repr.
example_files.map(repr)

Unnamed: 0,0
0,"[@example_code.py,0,178) ""def f(x,y)..."""


In [None]:
# A single cursor position: the span [16,17) of the example file.
cursors =pd.DataFrame([(Span(code_file,16,17),)])
cursors.map(repr)


Unnamed: 0,0
0,"[@example_code.py,16,17) ""x"""


In [None]:
# Expose the two DataFrames to the spannerlog rules as the relations Files and Cursors.
sess.import_rel('Files',example_files)
sess.import_rel('Cursors',cursors)

In [None]:
#| hide
# used when debugging to delete rules
sess.remove_all_rules()

In [None]:
%%spannerlog

FuncDefSpan(span,name)<-\
    Files(text),\
    ast_xpath(text, "//FunctionDef")->(node),\
    ast_to_span(text,node)->(span),\
    expr_eval("{0}.name",node)->(name)

?FuncDefSpan(span,name)


'?FuncDefSpan(span,name)'

Unnamed: 0,span,name
0,"[@example_code.py,0,19) ""def f(x,y)...""",f
1,"[@example_code.py,22,54) ""def g(x,y)...""",g
2,"[@example_code.py,69,110) ""def __init...""",__init__
3,"[@example_code.py,115,163) ""def method...""",method


In [None]:
%%spannerlog

FuncCallSpan(span,name)<-\
    Files(text),\
    ast_xpath(text, "//Call/func/Name")->(node),\
    ast_to_span(text,node)->(span),\
    as_str(span)->(name)

?FuncCallSpan(span,name)

'?FuncCallSpan(span,name)'

Unnamed: 0,span,name
0,"[@example_code.py,45,46) ""f""",f
1,"[@example_code.py,151,152) ""f""",f
2,"[@example_code.py,165,170) ""print""",print
3,"[@example_code.py,171,172) ""f""",f


In [None]:
%%spannerlog

CallingFunc(cursor,name)<-\
    Cursors(cursor),\
    FuncDefSpan(span,name),\
    span_contained(cursor,span)->(True)

?CallingFunc(cursor,name)

'?CallingFunc(cursor,name)'

Unnamed: 0,cursor,name
0,"[@example_code.py,16,17) ""x""",f


In [None]:
#| export
def lex_concat(strings):
    """Join the string forms of `strings` in lexicographic order, one per line."""
    as_text = list(map(str, strings))
    as_text.sort()
    return '\n'.join(as_text)

In [None]:
# Register lex_concat as an aggregation function; the Mentions rule below uses it in its head.
sess.register_agg('lex_concat',lex_concat,[(str,Span)],[str])

In [None]:
sess.remove_head('Mentions')

In [None]:
%%spannerlog
Mentions(lex_concat(caller_span),called_name)<-\
    FuncCallSpan(called_span,called_name),\
    FuncDefSpan(caller_span,caller_name),\
    span_contained(called_span,caller_span)->(True)

?Mentions(caller,called)

#TODO from here debug of relation, do it in an agg func where the first column is agged
    

'?Mentions(caller,called)'

Unnamed: 0,caller,called
0,"def g(x,y):  return f(x,y)**2 def method(self, y):  return f(self.x, y)",f


In [None]:
# TODO error checking on roles in prompt string

In [None]:
# Prompt template with two positional `{}` placeholders; the DocumentFunction rule
# fills them with the mentions of the function and the function's definition, in that order.
func_document_prompt = """
system: based on the following context:
{}
Explain the following function:
{}
In the format of a doc string.
"""
# Make the template available inside spannerlog as $func_document_prompt.
sess.import_var('func_document_prompt',func_document_prompt)

In [None]:
sess.remove_head('DocumentFunction')

In [None]:
%%spannerlog

model = 'gpt-3.5-turbo'
DocumentFunction(cursor,prompt,string)<-\
    CallingFunc(cursor,name),\
    Mentions(mentions,name),\
    FuncDefSpan(def_span,name),\
    as_str(def_span)->(def_string),\
    format($func_document_prompt,mentions,def_string)->(prompt),\
    llm($model,prompt)->(string)

?DocumentFunction(cursor,prompt,doc_string)

'?DocumentFunction(cursor,prompt,doc_string)'

Unnamed: 0,cursor,prompt,doc_string
0,"[@example_code.py,16,17) ""x""","system: based on the following context: def g(x,y):  return f(x,y)**2 def method(self, y):  return f(self.x, y) Explain the following function: def f(x,y):  x+y In the format of a doc string.",""""""" This function calculates the sum of two inputs x and y. """""""


In [None]:
#|hide
import nbdev; nbdev.nbdev_export()