# Translating ESSP rules to Gremlin

In [1]:
# Boilerplate
import numpy as np
import pandas as pd
import pandas_text as pt

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# Initialize the spaCy deep parser
parser = spacy.load("en_core_web_sm")

## Adjectives
Original rules:
```yaml
-
  condition: { /node/pos: ADJ }
  actions:
    -
     output:  { form: normal, start: /node, head: /node, span: /node/span_of }
     subcases:
       -
         condition: { /node/left/form: more }
         outputs: [{ form: comparative, start: /node/left, head: /node, span: /node/span_of }]
       -
         condition: { /node/left/form: most }
         outputs: [{ form: superlative, start: /node/left, head: /node, span: /node/span_of }]
       - 
         condition: { /node/feats/Degree: Cmp }
         outputs: [{ form: comparative, start: /node, head: /node, span: /node/span_of }]
       -
         condition: { /node/feats/Degree: Sup }
         outputs: [{ form: superlative, start: /node, head: /node, span: /node/span_of }]
  outputs: [{ view: Adjective, id: /node/id, form: /form, start: /start, head: /node, span: /node/span_of, headNF: /node/lemma, noun: /node/parent }]
```
English translation:
1. Start with all nodes with the part of speech tag "ADJ". Use this node to fill the "head" and "span" fields of the result.
1. Populate the output fields "form", "start", and "span" as follows:
   * **Case 1: The token to the left of the head is "more".** Set the "form" output to "comparative", set "start" to the token to the left of the head.
   * **Case 2: The token to the left of hte head is "most".** "form" ==> "superlative", "start" ==> token to left of head
   * **Case 3: The head's degree is "Cmp" ("JJR" tag in SpaCy output).** "form" ==> "comparative", "start" ==> head
   * **Case 4: The head's degree is "Sup" ("JJS" tag in SpaCy output).** "form" ==> "superlative", "start" ==> head
   * **Case 5: None of the previous conditions hold.** "form" ==> "normal", "start" => head
1. Also return the lemmatized form of the head as "headNF" and the parent of the head as "noun"

In [13]:
target_text = """
This is a big red balloon. 
That is a bigger one.
This is the biggest sentence.
This is a more big sentence.
It is the most beautiful.
This big house is bright."""

# Parse the text with SpaCy
token_features = pt.make_tokens_and_features(target_text, parser, add_left_and_right=True)
token_features.loc[0:15]

Unnamed: 0,token_num,char_span,token_span,lemma,pos,tag,dep,head,shape,is_alpha,is_stop,sentence,left,right
0,0,"[0, 1): ' '","[0, 1): ' '",\n,SPACE,_SP,,1,\n,False,False,"[0, 9): ' This is a big red balloon. '",,1
1,1,"[1, 5): 'This'","[1, 2): 'This'",this,DET,DT,nsubj,2,Xxxx,True,True,"[0, 9): ' This is a big red balloon. '",0.0,2
2,2,"[6, 8): 'is'","[2, 3): 'is'",be,AUX,VBZ,ROOT,2,xx,True,True,"[0, 9): ' This is a big red balloon. '",1.0,3
3,3,"[9, 10): 'a'","[3, 4): 'a'",a,DET,DT,det,6,x,True,True,"[0, 9): ' This is a big red balloon. '",2.0,4
4,4,"[11, 14): 'big'","[4, 5): 'big'",big,ADJ,JJ,amod,6,xxx,True,False,"[0, 9): ' This is a big red balloon. '",3.0,5
5,5,"[15, 18): 'red'","[5, 6): 'red'",red,ADJ,JJ,amod,6,xxx,True,False,"[0, 9): ' This is a big red balloon. '",4.0,6
6,6,"[19, 26): 'balloon'","[6, 7): 'balloon'",balloon,NOUN,NN,attr,2,xxxx,True,False,"[0, 9): ' This is a big red balloon. '",5.0,7
7,7,"[26, 27): '.'","[7, 8): '.'",.,PUNCT,.,punct,2,.,False,False,"[0, 9): ' This is a big red balloon. '",6.0,8
8,8,"[28, 29): ' '","[8, 9): ' '",\n,SPACE,_SP,,7,\n,False,False,"[0, 9): ' This is a big red balloon. '",7.0,9
9,9,"[29, 33): 'That'","[9, 10): 'That'",that,DET,DT,nsubj,10,Xxxx,True,True,"[9, 16): 'That is a bigger one. '",8.0,10


In [39]:
g = pt.token_features_to_traversal(token_features)
adj_traversal = (
    g.V()
    # Start with all nodes with the part of speech tag "ADJ". 
    # Use this node to fill the "head", "headNF" and "span" fields of the result.
    .has("pos", pt.Within("ADJ")).as_("head", "headNF", "span")
    # Populate "form" field
    .coalesce( # Cases 1-5 described above
        pt.__.out("left").has("lemma", "more").constant("comparative"),
        pt.__.out("left").has("lemma", "most").constant("superlative"),
        pt.__.has("tag", "JJR").constant("comparative"),
        pt.__.has("tag", "JJS").constant("superlative"),
        pt.__.constant("normal")
    ).as_("form")
    # Populate "start" field
    .select("head")
    .coalesce(
        pt.__.out("left").has("lemma", "more"),  # Case 1 above
        pt.__.out("left").has("lemma", "most"),  # Case 2
        pt.__.select("head")                     # Cases 3-5
    )
    # Populate "noun" field (outer join)
    .select("head")
    .coalesce(
        pt.__.out("head").values("token_span"),
        pt.__.constant(None)
    )
    .as_("noun")
    .select("head", "headNF", "span", "form", "noun")
        .by("token_span").by("lemma").by("token_span").by().by()
).compute()
adj_traversal.toDataFrame()

Unnamed: 0,head,headNF,span,form,noun
0,"[27, 28): 'big'",big,"[27, 28): 'big'",comparative,"[28, 29): 'sentence'"
1,"[35, 36): 'beautiful'",beautiful,"[35, 36): 'beautiful'",superlative,"[32, 33): 'is'"
2,"[12, 13): 'bigger'",big,"[12, 13): 'bigger'",comparative,"[13, 14): 'one'"
3,"[19, 20): 'biggest'",big,"[19, 20): 'biggest'",superlative,"[20, 21): 'sentence'"
4,"[4, 5): 'big'",big,"[4, 5): 'big'",normal,"[6, 7): 'balloon'"
5,"[5, 6): 'red'",red,"[5, 6): 'red'",normal,"[6, 7): 'balloon'"
6,"[39, 40): 'big'",big,"[39, 40): 'big'",normal,"[40, 41): 'house'"
7,"[42, 43): 'bright'",bright,"[42, 43): 'bright'",normal,"[41, 42): 'is'"


## Conditionals
Original rules:
```yaml
-
  condition : { /node/lower : [ if ] }
  actions:
    - function: subtract
      input : { a: /node/parent, b: /node }
      output : { antecedent: /c }
    - function: subtract
      input : { a: /node/parent/parent, b: /node/parent }
      output : { consequent: /c }
  outputs : [{ view: Conditional, id: /node/id, type: /node/lemma, antecedent : /antecedent, consequent: /consequent, span: /node/span_of }]
  
-
  condition : { /node/lower : [ when, whenever ] }
  actions:
    - function: subtract
      input : { a: /node/parent, b: /node }
      output : { consequent: /c }
  outputs : [{ view: Conditional, id: /node/id, type: /node/lemma, antecedent : /node/children/pos=VERB/span_of, consequent: /consequent, span: /node/span_of }]
-
  condition : { /node/lower : [ unless, then, in case ] }
  actions:
    - function: subtract
      input : { a: /node/parent, b: /node }
      output : { antecedent: /c }
    - function: subtract
      input : { a: /node/parent/parent, b: /node/parent }
      output : { consequent: /c }
  outputs : [{ view: Conditional, id: /node/id, type: /node/lemma, antecedent : /antecedent, consequent: /consequent, span: /node/span_of }]
```

In [None]:
target_text = """
I will do it if you ask me.
This happens whenever we hear the noise.
This is true as long as we stick to the point.
It will continue to move forward unless stopped by an external force.
Keep monitoring the gauge in case it exceeds threshold.
If you ask, I will answer.
If computer has any damage, it will need to be repaired.
If computer has any damage, that issue will need to be resolved.
If your 13-inch MacBook Pro has any damage which impairs the replacement of the battery, that issue will need to be resolved prior to the battery replacement.
Note: If your 13-inch MacBook Pro has any damage which impairs the replacement of the battery, that issue will need to be resolved prior to the battery replacement.
If I were the man that owned that car, I would have been angry."""

# Parse the text with SpaCy
token_features = pt.make_tokens_and_features(target_text, parser)

# Add a field "lower" with the lowercase form of each token
token_features["lower"] = np.char.lower(token_features["token_span"].values.covered_text)

# Build a dataframe of sentences
sentences = pd.DataFrame({"sentence": token_features["sentence"].unique()})

# Show the first sentence's data
token_features[token_features["sentence"] == sentences["sentence"].loc[0]]

Rule 1:
```yaml
-
  condition : { /node/lower : [ if ] }
  actions:
    - function: subtract
      input : { a: /node/parent, b: /node }
      output : { antecedent: /c }
    - function: subtract
      input : { a: /node/parent/parent, b: /node/parent }
      output : { consequent: /c }
  outputs : [{ view: Conditional, id: /node/id, type: /node/lemma, antecedent : /antecedent, consequent: /consequent, span: /node/span_of }]
```
Source code of the `subtract()` built-in function:
```java

English translation:
1. Start with all tokens whose lowercase form is "if"
1. 

In [None]:
# Use the first example sentence 
first_sentence = token_features[token_features["sentence"] == sentences["sentence"].loc[0]]

g = pt.token_features_to_traversal(first_sentence)

first_sentence

In [None]:
g.V().toList()

Original rule is:
```yaml
-
  condition: { /node/pos: NOUN }
  actions:
    - function: strip_phrase
 #     input : { node: /node}
      params: { excludePos: '[ "DET", "ADJ" ]' }
      output : { strippedPhrase: /strippedSpan, strippedPhraseNF: /normalForm }
  outputs: [{ view: NounPhrase, id: /node/id, span: /node/span_of, 
          head: /node, headPOS: /node/pos, headNF: /node/lemma,
          determiner: /node/children/pos=DET,
          strippedPhrase: /strippedPhrase, strippedPhraseNF: /strippedPhraseNF }]     
```
Here's the source code for the `strip_phrase` UDF that this rule depends on:
```java
	private DataObj stripPhrase(Data inputObj, Map<String, List<String>> paramSet, Trace trace) {
		if (_trace) trace.push("getStringExcludePos");
		Node node = (Node) inputObj.get("node", trace);
		
		// If node already has required output, return it
		DataObj output = (DataObj) node.stringRecursive;
		if (output != null) {
			if (_trace) trace.pop();
			return output;
		}

		// Preprocess 
		List<String> excludePosList = paramSet.get("excludePos");
		Set<String> excludePos = new HashSet<>(excludePosList);
		
		DataObj result = stripPhrase(node, excludePos, trace);

		if (_trace) trace.pop();
		return result;
	}

	private DataObj stripPhrase(Node node, Set<String> excludePos, Trace trace) {
		if (_trace) trace.push("stringRecursive for " + node.id);
		List<Span> spans = new ArrayList<>();
		List<String> lemmas = new ArrayList<>();
		
		// From left
		for (Node child: node.children) {
			if (child.id < node.id ) {
				DataObj results = stripPhrase(child, excludePos, trace);
				spans.add((Span) results.get("strippedSpan"));
				lemmas.add(results.getBareString("normalForm"));
			}
		}
		
		// From self
		if (! excludePos.contains(node.pos)) {
			spans.add(node.getNodeSpan(trace));
			lemmas.add(node.lemma);
		}

		// Combine to form results
		Span span = Span.combine(spans, trace);
		String normalForm = StringUtils.join(lemmas, " ").trim();
		DataObj results = new DataObj();
		results.put("strippedSpan", span);
		results.put("normalForm", normalForm);
		node.stringRecursive = results;

		if (_trace) trace.pop();
		return results;
	}

```

English language translation of the above:
1. Start with every token tagged `NOUN`
1. Find every child of each `NOUN` token that is to the left of the `NOUN` and is not tagged with `DET` or `ADJ`
1. Recursively repeat the previous step until a fixed point is reached.
1. Also return the set of children of the head noun tagged with DET as the "determiner".  Leave the "determiner" field blank of no such children are present.
1. For each set of children, find the smallest span that covers all children. Return that span as the "stripped" span.


In [None]:
# Parse a sentence
target_text = "The luxury auto maker bot last year sold 1,214 cars in the U.S."
token_features = pt.make_tokens_and_features(target_text, parser)

# Wrap a Gremlin GraphTraversal around our token features DataFrame 
g = pt.token_features_to_traversal(token_features)

# The parts of the rule that naturally translate to Gremlin we do in Gremlin.
noun_phrase_traversal = (
    g.V()
    # 1. Start with every token tagged `NOUN`
    .has("pos", "NOUN").as_("head", "headPOS", "headNF")
    # 2. Find every child of each NOUN token that is to the left of the NOUN and is not 
    #    tagged with DET or ADJ
    # 3. Recursively repeat the previous step until a fixed point is reached
    .repeat(pt.__.in_()
            .where(pt.lt("head")).by("token_span")
            .has("pos", pt.without("DET", "ADJ"))).emit().as_("child")
    # 4. Also return the set of children of the head noun tagged with DET as the 
    #   "determiner" field.  Leave the field blank if no such children are present.
    .coalesce(
        pt.__.select("head").in_().has("pos", "DET").values("token_span"),
        pt.__.constant(None)).as_("determiner")
    .select("head", "headPOS", "headNF", "child", "determiner")
        .by("token_span").by("pos").by("lemma").by("token_span").by()
    .compute()
)
# The aggregation and formatting parts of the rule are in Pandas.
# 5. For each set of children, find the smallest span that covers all children. 
#    Return that span as the "stripped" span.
noun_phrase_df = (noun_phrase_traversal
                  .toDataFrame()
                  .groupby(["head"]).aggregate({"headPOS": "first", "headNF": "first", 
                                                "determiner": "first",
                                                "child": pt.combine_agg})
                  .reset_index())
noun_phrase_df["strippedSpan"] = pt.combine_spans(noun_phrase_df["child"], noun_phrase_df["head"])
noun_phrase_df["normalForm"] = pt.lemmatize(noun_phrase_df["strippedSpan"], token_features)
noun_phrase_df

In [None]:
noun_phrase_traversal = (
    g.V()
     # 1. Start with every token tagged `NOUN`
     .has("pos", "NOUN").as_("head", "headPOS", "headNF")
     # 2. Find every child of each NOUN token that is to the left of the NOUN and is not 
     #    tagged with DET or ADJ
     # 3. Recursively repeat the previous step until a fixed point is reached
     .repeat(pt.__.in_()
             .where(pt.lt("head")).by("token_span")
             .has("pos", pt.without("DET", "ADJ"))).emit().as_("child")
#     # 4. Also return the set of children of the head noun tagged with DET as the 
#     #   "determiner" field.  Leave the field blank if no such children are present.
#     .coalesce(
#         pt.__.select("head").in_().has("pos", "DET").values("token_span"),
#         pt.__.constant(None)).as_("determiner")
#     .select("head", "headPOS", "headNF", "child", "determiner")
#         .by("token_span").by("pos").by("lemma").by("token_span").by()
     .compute()
)

In [None]:
noun_phrase_traversal.paths

In [None]:
noun_phrase_traversal.aliases

In [None]:
print(pt.token_features_to_gremlin(token_features, include_begin_and_end=True))