In [1]:
# Boilerplate
import numpy as np
import pandas as pd
import pandas_text as pt

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# Initialize the spaCy deep parser
# ("en_core_web_sm" is the model name handed to spacy.load; the resulting
# object is passed to pandas_text below).
parser = spacy.load("en_core_web_sm")

# Parse a sentence
# The result is the token-features table shown in the cell output: one row per
# token with span, lemma, POS/tag, dependency label and head token number.
target_text = "The luxury auto maker last year sold 1,214 cars in the U.S."
token_features = pt.make_tokens_and_features(target_text, parser)
# Bare last expression so the table is rendered as the cell's output.
token_features

Unnamed: 0,token_num,char_span,token_span,lemma,pos,tag,dep,head_token_num,shape,is_alpha,is_stop,sentence
0,0,"[0, 3): 'The'","[0, 1): 'The'",the,DET,DT,det,3,Xxx,True,True,"[0, 12): 'The luxury auto maker last year sold..."
1,1,"[4, 10): 'luxury'","[1, 2): 'luxury'",luxury,NOUN,NN,compound,3,xxxx,True,False,"[0, 12): 'The luxury auto maker last year sold..."
2,2,"[11, 15): 'auto'","[2, 3): 'auto'",auto,NOUN,NN,compound,3,xxxx,True,False,"[0, 12): 'The luxury auto maker last year sold..."
3,3,"[16, 21): 'maker'","[3, 4): 'maker'",maker,NOUN,NN,nsubj,6,xxxx,True,False,"[0, 12): 'The luxury auto maker last year sold..."
4,4,"[22, 26): 'last'","[4, 5): 'last'",last,ADJ,JJ,amod,5,xxxx,True,True,"[0, 12): 'The luxury auto maker last year sold..."
5,5,"[27, 31): 'year'","[5, 6): 'year'",year,NOUN,NN,npadvmod,6,xxxx,True,False,"[0, 12): 'The luxury auto maker last year sold..."
6,6,"[32, 36): 'sold'","[6, 7): 'sold'",sell,VERB,VBD,ROOT,6,xxxx,True,False,"[0, 12): 'The luxury auto maker last year sold..."
7,7,"[37, 42): '1,214'","[7, 8): '1,214'",1214,NUM,CD,nummod,8,"d,ddd",False,False,"[0, 12): 'The luxury auto maker last year sold..."
8,8,"[43, 47): 'cars'","[8, 9): 'cars'",car,NOUN,NNS,dobj,6,xxxx,True,False,"[0, 12): 'The luxury auto maker last year sold..."
9,9,"[48, 50): 'in'","[9, 10): 'in'",in,ADP,IN,prep,6,xx,True,True,"[0, 12): 'The luxury auto maker last year sold..."


In [2]:
# Show the dependency parse of the sentence, rendered from the token features
# table built in the first cell.
# Note that this is a bit different from the gold-standard parse tree.
pt.render_parse_tree(token_features)

Original rule is:
```yaml
-
  condition: { /node/pos: NOUN }
  actions:
    - function: strip_phrase
 #     input : { node: /node}
      params: { excludePos: '[ "DET", "ADJ" ]' }
      output : { strippedPhrase: /strippedSpan, strippedPhraseNF: /normalForm }
  outputs: [{ view: NounPhrase, id: /node/id, span: /node/span_of, 
          head: /node, headPOS: /node/pos, headNF: /node/lemma,
          determiner: /node/children/pos=DET,
          strippedPhrase: /strippedPhrase, strippedPhraseNF: /strippedPhraseNF }]     
```
Here's the source code for the `strip_phrase` UDF that this rule depends on:
```java
	/**
	 * Parameter-unpacking overload of {@code stripPhrase} (presumably the one
	 * invoked for the rule's {@code strip_phrase} action -- confirm against the
	 * rule engine).
	 *
	 * Reads the "node" entry from {@code inputObj}. If that node already carries
	 * a cached result in {@code node.stringRecursive}, the cached object is
	 * returned unchanged. Otherwise the "excludePos" parameter list is turned
	 * into a set and the work is delegated to
	 * {@code stripPhrase(Node, Set, Trace)}.
	 *
	 * @param inputObj  rule input; must provide a "node" entry that is a Node
	 * @param paramSet  action parameters; the "excludePos" list is read from it
	 * @param trace     trace sink, only touched when {@code _trace} is set
	 * @return the cached or freshly computed result object for the node
	 */
	private DataObj stripPhrase(Data inputObj, Map<String, List<String>> paramSet, Trace trace) {
		if (_trace) trace.push("getStringExcludePos");
		Node node = (Node) inputObj.get("node", trace);
		
		// If node already has required output, return it
		// NOTE(review): the cached value is returned regardless of which
		// excludePos set it was originally computed with.
		DataObj output = (DataObj) node.stringRecursive;
		if (output != null) {
			if (_trace) trace.pop();
			return output;
		}

		// Preprocess: convert the POS list into a set for the contains() checks
		// done during the recursion.
		// NOTE(review): if "excludePos" is absent, get() yields null and the
		// HashSet constructor would throw NullPointerException -- confirm the
		// parameter is always supplied.
		List<String> excludePosList = paramSet.get("excludePos");
		Set<String> excludePos = new HashSet<>(excludePosList);
		
		DataObj result = stripPhrase(node, excludePos, trace);

		if (_trace) trace.pop();
		return result;
	}

	/**
	 * Recursive worker: builds the "stripped" span and normal form for
	 * {@code node} and caches the result on the node.
	 *
	 * Every child whose id is lower than the node's own id is processed
	 * recursively and its "strippedSpan" / "normalForm" are collected; children
	 * with an id greater than or equal to the node's id are ignored. The node's
	 * own span and lemma are appended only when its POS is not in
	 * {@code excludePos}. The collected spans are merged with
	 * {@code Span.combine} and the lemmas are joined with single spaces.
	 *
	 * @param node        node to process
	 * @param excludePos  POS tags whose nodes contribute no span/lemma of their own
	 * @param trace       trace sink, only touched when {@code _trace} is set
	 * @return a new DataObj with keys "strippedSpan" and "normalForm"; the same
	 *         object is also stored in {@code node.stringRecursive}
	 */
	private DataObj stripPhrase(Node node, Set<String> excludePos, Trace trace) {
		if (_trace) trace.push("stringRecursive for " + node.id);
		List<Span> spans = new ArrayList<>();
		List<String> lemmas = new ArrayList<>();
		
		// From left: only children with a lower id than this node are visited.
		// A child is recursed into even if its own POS is excluded; the exclusion
		// is applied inside the recursive call, to that child's own span/lemma.
		// The recursion does not consult child.stringRecursive; it always recomputes.
		for (Node child: node.children) {
			if (child.id < node.id ) {
				DataObj results = stripPhrase(child, excludePos, trace);
				spans.add((Span) results.get("strippedSpan"));
				lemmas.add(results.getBareString("normalForm"));
			}
		}
		
		// From self: skipped entirely when this node's POS is in excludePos
		if (! excludePos.contains(node.pos)) {
			spans.add(node.getNodeSpan(trace));
			lemmas.add(node.lemma);
		}

		// Combine to form results
		// NOTE(review): spans can be empty here (excluded node with no left
		// children); what Span.combine returns in that case is not visible -- confirm.
		Span span = Span.combine(spans, trace);
		String normalForm = StringUtils.join(lemmas, " ").trim();
		DataObj results = new DataObj();
		results.put("strippedSpan", span);
		results.put("normalForm", normalForm);
		// Cache on the node; the Data/Map overload returns this cached value directly.
		node.stringRecursive = results;

		if (_trace) trace.pop();
		return results;
	}

```

English language translation of the above:
1. Start with every token tagged `NOUN`
1. Find every child of each `NOUN` token that is to the left of the `NOUN` and is not tagged with `DET` or `ADJ`
1. Recursively repeat the previous step until a fixed point is reached.
1. For each head noun, find the smallest span that covers the head itself and all of the children collected for it. Return that span as the "stripped" span. Also return the set of children of the head noun tagged with `DET`.

In [3]:
# Wrap a Gremlin GraphTraversal around our token features DataFrame
g = pt.token_features_to_traversal(token_features)

# The parts of the rule that naturally translate to Gremlin we do in Gremlin.
# The displayed result has one row per (head noun, matched child) pair.
noun_phrase_traversal = (
    g.V()
    # 1. Start with every token tagged `NOUN`
    # The same vertex gets three labels so that select() below can project it
    # three different ways (span, POS, lemma).
    .has("pos", "NOUN").as_("head", "headPOS", "headNF")
    # 2. Find every child of each NOUN token that is to the left of the NOUN and is not tagged with DET or ADJ
    # 3. Recursively repeat the previous step until a fixed point is reached
    # NOTE(review): no explicit "left of the head" filter is visible in this
    # step; a later cell compares child < head after the fact. Confirm that
    # in_() follows child -> head edges and whether position must be filtered.
    .repeat(pt.__.in_().has("pos", pt.Without("DET", "ADJ"))).emit().as_("child")
    
    # One by() per selected label, in the same order as the labels:
    # head -> token_span, headPOS -> pos, headNF -> lemma, child -> token_span
    .select("head", "headPOS", "headNF", "child").by("token_span").by("pos").by("lemma").by("token_span")
    .compute()
)
# Convert the computed traversal's result to a DataFrame and display it.
noun_phrase_df = noun_phrase_traversal.toDataFrame()
noun_phrase_df

Unnamed: 0,head,headPOS,headNF,child
0,"[3, 4): 'maker'",NOUN,maker,"[1, 2): 'luxury'"
1,"[3, 4): 'maker'",NOUN,maker,"[2, 3): 'auto'"
2,"[8, 9): 'cars'",NOUN,car,"[7, 8): '1,214'"


In [5]:
# Element-wise comparison of the child span column against the head span column.
# Presumably "<" on spans means "starts before", which would make this a check
# that every matched child lies to the left of its head -- TODO confirm the
# span ordering semantics.
noun_phrase_df["child"] < noun_phrase_df["head"]

0    True
1    True
2    True
dtype: bool

In [None]:
# Inspect the `edges` attribute of the computed traversal
# (presumably the dependency edges of the underlying graph -- TODO confirm).
noun_phrase_traversal.edges

In [None]:
# Print what pandas_text produces when converting the token features to Gremlin
# (presumably a Gremlin script as text, hence print() rather than rich display).
print(pt.token_features_to_gremlin(token_features))

In [None]:
# Scratch 2x2 integer frame with the default integer column labels (0 and 1);
# it is the input for the DataFrame.insert() experiments in the cells below.
rows = [[1, 3], [2, 4]]
df = pd.DataFrame(data=rows)
df

In [None]:
# Try to insert a new first column labelled 0. `df` already has a column with
# that label, and DataFrame.insert() rejects duplicate labels by default with a
# ValueError. Catch and show that error so the notebook still survives
# "Restart & Run All"; when the label is free the insert happens as before.
try:
    df.insert(0, 0, np.array([5, 6]))
except ValueError as err:
    # Only the duplicate-label rejection is expected here; `df` is left unchanged.
    print(f"DataFrame.insert raised ValueError: {err}")
df

In [None]:
# Same experiment with `None` as the label of the new first column.
# Note that insert() mutates `df` in place.
new_values = np.array([5, 6])
df.insert(loc=0, column=None, value=new_values)
df