# Spanner Workbench Sneak Peek

## installation

In [1]:
%%bash
# git clone https://github.com/DeanLight/spanner_workbench
# cd spanner_workbench
# pip install -e src/rgxlog-interpreter 

## Datalog magic

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import rgxlog
from rgxlog import magic_client,Session
import rgxlog.magic.rgxlog_magic

In [3]:
%%spanner
# define relation and some facts about it
new parent(str ,str)
parent("bob", "greg")
parent("greg", "alice")
parent("greg", "janice")
# now add a rule that deduces that bob is a grandparent of alice
grandparent(X,Z) <- parent(X,Y), parent(Y,Z) # ',' is shorthand for the 'and' operator

In [4]:
%%spanner
# now for the queries
?parent("greg", ChildsOfGreg) # returns all children of greg
?grandparent(X, Y) # returns all tuples in the grandparent relation

printing results for query 'parent("greg", ChildsOfGreg)':
  ChildsOfGreg
----------------
     janice
     alice

printing results for query 'grandparent(X, Y)':
  X  |   Y
-----+--------
 bob | janice
 bob | alice



## python IO

In [5]:
! cat sons.csv

bob,adam
adam,oren
jane,bob
elmo,bernie
bernie,casie

In [6]:
magic_client.import_relation_from_csv("sons.csv",relation_name="parent",delimiter=",")

In [7]:
%%spanner
?parent(X,Y) # returns all tuples in the parent relation, including the rows imported from sons.csv

printing results for query 'parent(X, Y)':
   X    |   Y
--------+--------
 bernie | casie
  elmo  | bernie
  jane  |  bob
  adam  |  oren
  bob   |  adam
  greg  | janice
  greg  | alice
  bob   |  greg



In [8]:
%%spanner
?grandparent(X, Y)

printing results for query 'grandparent(X, Y)':
  X   |   Y
------+--------
 elmo | casie
 jane |  greg
 jane |  adam
 bob  |  oren
 bob  | janice
 bob  | alice



In [9]:
grand_parents=magic_client.query_into_df("?grandparent(Grandparent, Grandchild)")
grand_parents

Unnamed: 0,Grandparent,Grandchild
0,elmo,casie
1,jane,greg
2,jane,adam
3,bob,oren
4,bob,janice
5,bob,alice


## Adding ie functions - regexes

In [10]:
%load_ext autoreload
%autoreload 2

from rgxlog.engine.datatypes.primitive_types import DataTypes
from rgxlog.stdlib.regex import rgx_string_out_types
import re


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### defining ie function in an imperative language

In [11]:
def rgx_string(text, regex_formula):
    """
    Apply a regular expression to a text and yield the strings captured by each match.

    Args:
        text: the string that is searched
        regex_formula: the regular expression pattern to apply to the text

    Returns: a generator that yields one list per non-overlapping match, scanning
        left to right. The list holds the match's capture groups, or, when the
        pattern defines no capture groups, the single whole matched string.
    """
    pattern = re.compile(regex_formula)
    has_groups = pattern.groups != 0
    for found in pattern.finditer(text):
        # a pattern without capture groups contributes the whole match as its only column
        yield list(found.groups()) if has_groups else [found.group()]

### registering it with the engine

In [12]:
# Expose the Python generator above to the engine under the name used in rules.
magic_client.register(
    ie_function=rgx_string,
    ie_function_name='RGXString',
    # two string inputs, matching rgx_string(text, regex_formula)
    in_rel=[DataTypes.string, DataTypes.string],
    # output schema is supplied by the stdlib regex module
    out_rel=rgx_string_out_types,
)

### Executing queries with the ie function

In [13]:
%%spanner
bank_records= "bob 100 elmo 80 jane 79 alice 60"
money(P, Money) <- \
RGXString(bank_records, "(\w+).*?(\d+)")->(P, Money)



In [14]:
%%spanner
?money(P,Money)

printing results for query 'money(P, Money)':
   P   |   Money
-------+---------
 alice |      60
 jane  |      79
 elmo  |      80
  bob  |     100



### A more complete example

In [15]:
%%spanner
new lecturer(str, str)
new enrolled(str, str)


In [16]:
# Each row is (lecturer name, course taught); columns keep pandas' default labels 0 and 1.
lecturers = pd.DataFrame(data=[
    ["walter", "chemistry"],
    ["linus", "operation systems"],
    ["rick", "physics"],
])

# Each row is (student name, course enrolled in); a student may appear in several rows.
enrolled = pd.DataFrame(data=[
    ["abigail", "chemistry"],
    ["abigail", "operation systems"],
    ["jordan", "chemistry"],
    ["gale", "operation systems"],
    ["howard", "chemistry"],
    ["howard", "physics"],
])

In [17]:
magic_client.import_relation_from_df(lecturers,relation_name="lecturer")
magic_client.import_relation_from_df(enrolled,relation_name="enrolled")

In [18]:
%%spanner
?lecturer(X,Y)
?enrolled(X,Y)

enrolled_in_chemistry(X) <- enrolled(X, "chemistry")
?enrolled_in_chemistry("jordan") # returns empty tuple ()
?enrolled_in_chemistry("gale") # returns nothing
?enrolled_in_chemistry(X) # returns "abigail", "jordan" and "howard"

enrolled_in_physics_and_chemistry(X) <- enrolled(X, "chemistry"), enrolled(X, "physics")
?enrolled_in_physics_and_chemistry(X) # returns "howard"

lecturer_of(X,Z) <- lecturer(X,Y), enrolled(Z,Y)
?lecturer_of(X,"abigail") # returns "walter" and "linus"

gpa_str = "\n abigail 100\n jordan 80\n gale 79\n howard 60\n"
gpa_of_chemistry_students(Student, Grade) <- RGXString(gpa_str,"(\w+).*?(\d+)")->(Student, Grade),enrolled_in_chemistry(Student)
?gpa_of_chemistry_students(X, "100") # returns "abigail"

printing results for query 'lecturer(X, Y)':
   X    |         Y
--------+-------------------
  rick  |      physics
 linus  | operation systems
 walter |     chemistry

printing results for query 'enrolled(X, Y)':
    X    |         Y
---------+-------------------
 howard  |      physics
 howard  |     chemistry
  gale   | operation systems
 jordan  |     chemistry
 abigail | operation systems
 abigail |     chemistry

printing results for query 'enrolled_in_chemistry("jordan")':
[()]

printing results for query 'enrolled_in_chemistry("gale")':
[]

printing results for query 'enrolled_in_chemistry(X)':
    X
---------
 howard
 jordan
 abigail

printing results for query 'enrolled_in_physics_and_chemistry(X)':
   X
--------
 howard

printing results for query 'lecturer_of(X, "abigail")':
   X
--------
 linus
 walter

printing results for query 'gpa_of_chemistry_students(X, "100")':
    X
---------
 abigail



In [19]:
%%spanner
?gpa_of_chemistry_students(X, Y) 

printing results for query 'gpa_of_chemistry_students(X, Y)':
    X    |   Y
---------+-----
 howard  |  60
 jordan  |  80
 abigail | 100



## NLP example

In [20]:
%load_ext autoreload
%autoreload 2

from rgxlog.stdlib.nlp import   lemma_wrapper, ner_wrapper, dependency_parse_wrapper                                


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
sentence = "John fathered Greg wonderfully"

# Run each stdlib NLP wrapper on the same sentence and print its label followed by
# the materialised list of tuples it produces.
for label, wrapper in (
    ("NER", ner_wrapper),
    ("lemma", lemma_wrapper),
    ("dep_parse", dependency_parse_wrapper),
):
    print(label)
    print(list(wrapper(sentence)))


NER
[('John', 'PERSON', (0, 4)), ('Greg', 'PERSON', (14, 18))]
lemma
[('John', 'John', (0, 4)), ('fathered', 'father', (5, 13)), ('Greg', 'Greg', (14, 18)), ('wonderfully', 'wonderfully', (19, 30))]
dep_parse
[('ROOT', 0, 'ROOT', 2, 'fathered'), ('nsubj', 2, 'fathered', 1, 'John'), ('obj', 2, 'fathered', 3, 'Greg'), ('advmod', 2, 'fathered', 4, 'wonderfully')]


In [22]:
# Register the three NLP wrappers as IE functions. Each takes one string (the text);
# the output schemas follow the shape of the tuples printed by the wrappers above.

# Lemma tuples look like ('fathered', 'father', (5, 13)): string, string, span.
magic_client.register(
    ie_function=lemma_wrapper,
    ie_function_name='Lemma',
    in_rel=[DataTypes.string],
    out_rel=[DataTypes.string, DataTypes.string, DataTypes.span],
)

# NER tuples look like ('John', 'PERSON', (0, 4)): string, string, span.
magic_client.register(
    ie_function=ner_wrapper,
    ie_function_name='NER',
    in_rel=[DataTypes.string],
    out_rel=[DataTypes.string, DataTypes.string, DataTypes.span],
)

# DepParse tuples look like ('nsubj', 2, 'fathered', 1, 'John'):
# string, integer, string, integer, string.
magic_client.register(
    ie_function=dependency_parse_wrapper,
    ie_function_name='DepParse',
    in_rel=[DataTypes.string],
    out_rel=[
        DataTypes.string,
        DataTypes.integer,
        DataTypes.string,
        DataTypes.integer,
        DataTypes.string,
    ],
)




In [23]:
%%spanner
sentence="This is John. John fathered Greg wonderfully. Greg turned out ok"

#TODO multiline rule doesn't parse

father_of(F,S)<- NER(sentence)->(F,"PERSON",T1),NER(sentence)->(S,"PERSON",T2),Lemma(sentence)->(FATHERLIKE ,"father",T3),DepParse(sentence)->("nsubj",T41,FATHERLIKE,T42,F),DepParse(sentence)->("obj",T51,FATHERLIKE,T52,S)
#NER(sentence)->(F,"PERSON",T1)
#NER(sentence)->(S,"PERSON",T2)
#Lemma(sentence)->(FATHERLIKE ,"father",T3)
#DepParse(sentence)->("nsubj",T41,FATHERLIKE,T42,F)
#DepParse(sentence)->("obj",T51,FATHERLIKE,T52,S)


In [24]:
%%spanner

?father_of(F,S)

printing results for query 'father_of(F, S)':
  F   |  S
------+------
 John | Greg



# Open questions

## Could be other declarative frameworks

In [25]:
%%spanner-path
MATCH (F,*)-["nsubj"]->(FATHERLIKE)-["obj"]->(S)
WHERE Lemma(FATHERLIKE)="father"
AND ...
FROM DepParse(sentence)
RETURN(F,S)

UsageError: Cell magic `%%spanner-path` not found.


In [None]:
%%spanner-sql
SELECT (F,S)
FROM Sentence
WHERE NER("sentence")==(F,"PERSON",*)
AND ...
GROUPBY F

## User given constraints on ie function


In [None]:
%%spanner
gpa_str = "\n abigail 100\n jordan 80\n gale 79\n howard 60\n"

RGX(gpa_str,".*[\n](?<Student>[a-z]+).*(?<Grade>\d+).*[\n]")->(Student, Grade) 
# split by \n and run in parallel


In [None]:
# Sketch of a proposed extension to `register` (an open question, not an existing API):
# extra keyword arguments would let the user state constraints on the IE function,
# e.g. that its input can be split on "\n" and the pieces processed in parallel.
# `f` and the `...` values are placeholders.
magic_client.register(
    ie_function=f,
    in_rel=...,
    out_rel=...,
    splitable="\n",          # proposed: delimiter on which the input may be split
    splitable_distance=50,   # proposed: exact meaning still to be defined
    # ... further constraint options
)

## optimization options

Extractors pass over the same data

In [None]:
%%spanner
father_of(F,S)<- 
NER(sentence)->(F,"PERSON",T1)
NER(sentence)->(S,"PERSON",T2)
Lemma(sentence)->(FATHERLIKE ,"father",T3)
DepParse(sentence)->("nsubj",T41,FATHERLIKE,T42,F)
DepParse(sentence)->("obj",T51,FATHERLIKE,T52,S)


How do we derive execution order from the join structure and extraction cost metadata?

In [None]:
%%spanner
text="...."

mentioned_together(P1,P2)<-
  InSameSentence(text)->(P1,P2),
  NER(P1,"PERSON"),
  NER(P2,"PERSON")


## Interface for adding optimizations

In [None]:
class Session:
    """
    Excerpt of the session object, shown here to illustrate where new passes could be plugged in.

    On construction it creates a symbol table, a term graph and an execution engine,
    defines the list of passes, loads the grammar file and builds the parser.

    NOTE(review): this cell relies on names (os, Lark, SymbolTable, NetxTermGraph,
    execution and the pass classes) that are not imported in the visible notebook
    cells; it reads as an illustration of library code rather than a runnable cell.
    """

    def __init__(self, debug=False):
        """
        Args:
            debug: forwarded to the execution engine constructor
        """
        self._symbol_table = SymbolTable()
        self._term_graph = NetxTermGraph()
        self._execution = execution.PydatalogEngine(debug)

        # List of pass classes. Presumably each pass is applied to the parsed input in
        # this order (checks and transformations first, execution last) - TODO confirm
        # against the code that consumes _pass_stack. Custom passes for constraint
        # checking or optimization would be inserted into this list.
        self._pass_stack = [
            RemoveTokens,
            FixStrings,
            CheckReservedRelationNames,
            ConvertSpanNodesToSpanInstances,
            ConvertStatementsToStructuredNodes,
            CheckDefinedReferencedVariables,
            CheckReferencedRelationsExistenceAndArity,
            CheckReferencedIERelationsExistenceAndArity,
            CheckRuleSafety,
            TypeCheckAssignments,
            TypeCheckRelations,
            SaveDeclaredRelationsSchemas,
            ReorderRuleBody,
            ResolveVariablesReferences,
            ExecuteAssignments,
            AddStatementsToNetxTermGraph,
            GenericExecution
        ]

        # The grammar is read from 'grammar.lark', located in the directory of the
        # rgxlog.grammar package.
        grammar_file_path = os.path.dirname(rgxlog.grammar.__file__)
        grammar_file_name = 'grammar.lark'
        with open(f'{grammar_file_path}/{grammar_file_name}', 'r') as grammar_file:
            self._grammar = grammar_file.read()

        # NOTE(review): the parser is always built with debug=True, independent of the
        # `debug` argument above - confirm this is intended.
        self._parser = Lark(self._grammar, parser='lalr', debug=True)
        # _register_default_functions is not defined in this excerpt.
        self._register_default_functions()

* Add own passes for constraint checking
* Add optimizations as execution graph transform passes?
* Add callbacks to execution for recording metadata
* Add passes for optimizing execution graph based on metadata