In [28]:
from treelib import Node, Tree
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.tokens import Span
import pandas as pd
nlp = spacy.load('en_core_web_sm')

In [29]:
# Load the tab-separated query file. It has no header row, so pandas numbers
# the columns 0, 1, ...; the query text lives in column 1.
queries_df = pd.read_csv("../queries/query_20.tsv", sep="\t", header=None)


def getq(i):
    """Return the query text (column 1) stored at row label ``i`` of ``queries_df``."""
    return queries_df.loc[i, 1]


queries_df.head()

Unnamed: 0,0,1
0,0,I want a red car with heated seats.
1,1,I want a blue car with CarPlay.
2,2,I want an old and cheap car with emission volu...
3,3,I want an expensive car with all configurations.
4,4,I want a black car with navigation system and ...


In [116]:
from spacy.matcher import DependencyMatcher
from spacy.util import compile_infix_regex
from spacy.tokenizer import Tokenizer

def custom_tokenizer(nlp):
    """Build a tokenizer for ``nlp`` with hyphen-friendly infix rules.

    Starting from ``nlp.Defaults.infixes`` it:

    * drops the generic "operator between digits" rule and re-adds it without
      the plain hyphen case (``+``, ``*``, ``^`` between digits still split, as
      does a hyphen that is followed by another hyphen);
    * drops the rule that splits on a hyphen/dash between letters.

    Prefix/suffix handling, ``token_match`` and the tokenizer exceptions are
    taken over from the pipeline unchanged.

    Args:
        nlp: loaded spaCy pipeline; its ``Defaults``, ``vocab`` and current
            ``tokenizer`` are read but not modified.

    Returns:
        A new ``Tokenizer`` bound to ``nlp.vocab``. The caller is responsible
        for assigning it to ``nlp.tokenizer``.
    """
    # Known spellings of the default numeric-operator infix rule. The rule is
    # removed only if present, so a spaCy release that spells it differently
    # no longer makes this function raise ValueError.
    numeric_op_rules = (
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        # NOTE(review): presumably the spelling used by newer spaCy releases — confirm
        # against the installed version's spacy/lang/punctuation.py.
        r"(?<=[0-9])[+\\-\\*^](?=[0-9-])",
    )
    # Substring that identifies the "hyphen between letters" rule.
    letter_hyphen_marker = '-|–|—|--|---|——|~'

    infixes = [
        rule
        for rule in nlp.Defaults.infixes
        if rule not in numeric_op_rules and letter_hyphen_marker not in rule
    ]
    # Re-add the numeric rule minus the digit-hyphen-digit case.
    infixes.extend([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                rules=nlp.Defaults.tokenizer_exceptions)
    
class FeatureExtractor:
    """Find (head, modifier) token pairs in a query using spaCy dependency patterns.

    The class works on the module-level ``nlp`` pipeline. ``extract`` installs
    the tokenizer returned by ``custom_tokenizer`` on that pipeline, so calling
    it changes how every later ``nlp(...)`` call in the notebook tokenizes.
    """

    def __init__(self):
        """Register the main patterns and prepare the per-pipeline cache."""
        # Main matcher: a dobj/pobj token with an ``amod`` child.
        # It is built here but not consulted by ``extract``.
        self.dep_matcher = DependencyMatcher(vocab=nlp.vocab)
        main_patterns = [
                # [
                #     {'RIGHT_ID': 'p_subject', 'RIGHT_ATTRS': {'TEXT': 'car', 'DEP': 'dobj'}},
                #     {'LEFT_ID': 'p_subject', 'REL_OP': '>', 'RIGHT_ID': 'p_prep_to_subject', 'RIGHT_ATTRS': {'DEP': 'prep'}},
                #     {'LEFT_ID': 'p_prep_to_subject', 'REL_OP': '>', 'RIGHT_ID': 'p_prep_object', 'RIGHT_ATTRS': {'DEP': {"REGEX": "\s*"}}}
                # ],
                [
                {'RIGHT_ID': 'p_object', 'RIGHT_ATTRS': {'DEP': {"IN": ['dobj', 'pobj']}}},
                {'LEFT_ID': 'p_object', 'REL_OP': '>', 'RIGHT_ID': 'p_object_mod', 'RIGHT_ATTRS': {'DEP': 'amod'}},
                ]
            ] # TODO: CONFIG HERE
        self.dep_matcher.add("main_patterns", patterns=main_patterns)

        # Tokenizer and secondary matcher are built lazily on the first
        # ``extract`` call and reused afterwards. ``_cached_for`` remembers
        # which ``nlp`` object they belong to, so re-running the cell that
        # loads the pipeline invalidates them.
        self._cached_for = None
        self._tokenizer = None
        self._secondary_matcher = None

    def _components_for_current_nlp(self):
        """Return ``(tokenizer, secondary_matcher)`` for the current ``nlp``, building them once."""
        if self._cached_for is not nlp:
            self._tokenizer = custom_tokenizer(nlp)
            matcher = DependencyMatcher(vocab=nlp.vocab)
            matcher.add("secondary_patterns", FeatureExtractor.compile_secondary_patterns())
            self._secondary_matcher = matcher
            self._cached_for = nlp
        return self._tokenizer, self._secondary_matcher

    @staticmethod
    def traverse(s, tree):
        """Recursively add every descendant of token ``s`` to ``tree``.

        Each token is used as node tag, node identifier and node data, so
        ``s`` itself must already be a node of ``tree``.

        Args:
            s: token whose ``children`` are walked depth-first.
            tree: ``treelib.Tree`` that is extended in place.

        Returns:
            The same ``tree`` object, for call chaining.
        """
        for child in s.children:
            tree.create_node(child, child, parent=s, data=child)
            FeatureExtractor.traverse(child, tree=tree)
        return tree

    @staticmethod
    def build_tree(query):
        """Parse ``query`` and return the dependency tree of its first sentence.

        Uses the module-level ``nlp`` with whatever tokenizer is installed at
        call time. Raises ``IndexError`` if the parsed text has no sentence.
        """
        doc = nlp(query)
        first_root = list(doc.sents)[0].root
        return FeatureExtractor.build_tree_from_node(first_root)

    @staticmethod
    def build_tree_from_node(node):
        """Return a new ``treelib.Tree`` rooted at token ``node`` containing its whole subtree."""
        sent_tree = Tree()
        sent_tree.create_node(node, node, data=node)
        return FeatureExtractor.traverse(node, sent_tree)

    @staticmethod
    def compile_secondary_patterns(main_subject=""):
        """Return the dependency patterns used by ``extract``.

        Two patterns are produced, each anchoring on a head token and one
        direct child:

        * a ``dobj``/``pobj`` token with an ``amod`` or ``compound`` child;
        * a ``conj`` token with an ``amod`` or ``advmod`` child.

        Args:
            main_subject: currently unused; kept for interface compatibility.
        """
        object_with_modifier = [
            {'RIGHT_ID': 'p_object', 'RIGHT_ATTRS': {'DEP': {"IN": ['dobj', 'pobj']}}},
            {'LEFT_ID': 'p_object', 'REL_OP': '>', 'RIGHT_ID': 'p_object_mod', 'RIGHT_ATTRS': {'DEP': {"IN": ['amod', 'compound']}}},
        ]
        conjunct_with_modifier = [
            {'RIGHT_ID': 'p_conj', 'RIGHT_ATTRS': {'DEP': 'conj'}},
            {'LEFT_ID': 'p_conj', 'REL_OP': '>', 'RIGHT_ID': 'p_conj_mod', 'RIGHT_ATTRS': {'DEP': {"IN": ['amod', 'advmod']}}},
        ]
        return [object_with_modifier, conjunct_with_modifier]

    def extract(self, query):
        """Print ``query`` followed by one ``-> head modifier`` line per pattern match.

        Side effect: replaces ``nlp.tokenizer`` with the custom tokenizer.
        Nothing is returned.

        Args:
            query: raw query text.
        """
        tokenizer, secondary_matcher = self._components_for_current_nlp()
        # Re-assigned on every call (as before) so the custom tokenizer is in
        # effect even if another cell swapped the tokenizer in the meantime.
        nlp.tokenizer = tokenizer
        doc = nlp(query)
        dep_matches = secondary_matcher(doc)
        print(f"{doc}")
        for match in dep_matches:
            # match is (match_id, token_indices); indices follow pattern order:
            # head first, then its modifier.
            token_ids = match[1]
            head_idx, modifier_idx = token_ids[0], token_ids[1]
            print(f"\t-> {doc[head_idx]} {doc[modifier_idx]}")
            
        

In [117]:
# Instantiate the extractor; the constructor registers the main dependency patterns on its matcher.
fe = FeatureExtractor()

In [118]:
# Smoke test on a single sentence: prints the query and one "-> head modifier" line per match.
fe.extract("I want a red car with heated seats.")

I want a red car with heated seats.
	-> car red
	-> seats heated


In [119]:
# Same sentence with "SUV" as the head noun instead of "car".
fe.extract("I want a red SUV with heated seats.")

I want a red SUV with heated seats.
	-> SUV red
	-> seats heated


In [120]:
# Sentence whose prepositional object is a two-word noun ("wind shield").
fe.extract("I want a black car with wind shield.")

I want a black car with wind shield.
	-> car black
	-> shield wind


In [121]:
# Run the extractor over query rows 0-19, printing the row index before each result.
for i in range(20):
    print(f"extract {i}")
    fe.extract(getq(i))

extract 0
I want a red car with heated seats.
	-> car red
	-> seats heated
extract 1
I want a blue car with CarPlay.
	-> car blue
extract 2
I want an old and cheap car with emission volume of 3.0.
	-> car old
	-> volume emission
extract 3
I want an expensive car with all configurations.
	-> car expensive
extract 4
I want a black car with navigation system and moderate price.
	-> car black
	-> system navigation
	-> price moderate
extract 5
I want a car with high performance.
	-> performance high
extract 6
I want a Tesla electric car at lowest price.
	-> car electric
	-> price lowest
extract 7
I want a cheap V-8 engine car.
	-> car cheap
	-> car V-8
	-> car engine
extract 8
I want a white car with rich technology.
	-> car white
	-> technology rich
extract 9
I want a diesel-driven car with high engine power.
	-> car diesel-driven
	-> power high
	-> power engine
extract 10
I want a small emission volume car with CarPlay.
	-> car small
	-> car volume
extract 11
I want a light-color 2018 For

In [55]:
# Dependency tree (first sentence only) for each of query rows 0-19, in row order.
sent_trees = list(map(FeatureExtractor.build_tree, map(getq, range(20))))

In [64]:
# Show the dependency tree of query 13 (token text and dependency label per node).
sent_trees[13].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: keep, dep_: xcomp
│   ├── text: to, dep_: aux
│   ├── text: warm, dep_: oprd
│   ├── text: in, dep_: prep
│   │   └── text: winter, dep_: pobj
│   ├── text: and, dep_: cc
│   └── text: start, dep_: conj
│       ├── text: car, dep_: dobj
│       │   └── text: the, dep_: det
│       └── text: remotely, dep_: advmod
└── text: ., dep_: punct



In [56]:
# Show the dependency tree of query 0 (token text and dependency label per node).
sent_trees[0].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: a, dep_: det
│   ├── text: red, dep_: amod
│   └── text: with, dep_: prep
│       └── text: seats, dep_: pobj
│           └── text: heated, dep_: amod
└── text: ., dep_: punct



In [33]:
# Show the dependency tree of query 1 (token text and dependency label per node).
sent_trees[1].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: a, dep_: det
│   ├── text: blue, dep_: amod
│   └── text: with, dep_: prep
│       └── text: CarPlay, dep_: pobj
└── text: ., dep_: punct



In [34]:
# Show the dependency tree of query 2 (token text and dependency label per node).
sent_trees[2].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: an, dep_: det
│   ├── text: old, dep_: amod
│   │   ├── text: and, dep_: cc
│   │   └── text: cheap, dep_: conj
│   └── text: with, dep_: prep
│       └── text: volume, dep_: pobj
│           ├── text: emission, dep_: compound
│           └── text: of, dep_: prep
│               └── text: 3.0, dep_: pobj
└── text: ., dep_: punct



In [35]:
# Show the dependency tree of query 3 (token text and dependency label per node).
sent_trees[3].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: an, dep_: det
│   ├── text: expensive, dep_: amod
│   └── text: with, dep_: prep
│       └── text: configurations, dep_: pobj
│           └── text: all, dep_: det
└── text: ., dep_: punct



In [36]:
# Show the dependency tree of query 4 (token text and dependency label per node).
sent_trees[4].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: a, dep_: det
│   ├── text: black, dep_: amod
│   └── text: with, dep_: prep
│       └── text: system, dep_: pobj
│           ├── text: navigation, dep_: compound
│           ├── text: and, dep_: cc
│           └── text: price, dep_: conj
│               └── text: moderate, dep_: amod
└── text: ., dep_: punct



In [37]:
# Show the dependency tree of query 5 (token text and dependency label per node).
sent_trees[5].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: a, dep_: det
│   └── text: with, dep_: prep
│       └── text: performance, dep_: pobj
│           └── text: high, dep_: amod
└── text: ., dep_: punct



In [45]:
# Stand-alone experiment: match a direct object (dobj) that has a preposition
# child, together with a child of that preposition (dobj -> prep -> child).
#
# NOTE(review): `doc` was not defined in any visible cell. The saved output of
# the following cell prints the text of query 4, so that query is parsed here
# to make the cell runnable on a fresh kernel — confirm this was the intent.
doc = nlp(getq(4))

dep_matcher = DependencyMatcher(vocab=nlp.vocab)
dep_pattern = [
    {'RIGHT_ID': 'p_object', 'RIGHT_ATTRS': {'DEP': 'dobj'}},
    {'LEFT_ID': 'p_object', 'REL_OP': '>', 'RIGHT_ID': 'p_prep_to_object', 'RIGHT_ATTRS': {'DEP': 'prep'}},
    # Raw string avoids the invalid "\s" escape; the pattern can match the empty
    # string, so it presumably places no restriction on the child's dependency label.
    {'LEFT_ID': 'p_prep_to_object', 'REL_OP': '>', 'RIGHT_ID': 'p_prep_subject', 'RIGHT_ATTRS': {'DEP': {"REGEX": r"\s*"}}}
]
dep_matcher.add('object_descpt', patterns=[dep_pattern])
dep_matches = dep_matcher(doc)
dep_matches

[(10529335708013777914, [4, 5, 7])]

In [48]:
# Print, for every match, the parsed text followed by "<object>: <child of its preposition>".
# Each match is a (match_id, token_indices) pair; the indices follow the order of
# the pattern's nodes (object, preposition, preposition's child).
# NOTE(review): the saved output shows a generator repr in the last field, which
# indexing `doc` with an integer would not print — the cell was presumably edited
# after its last run; re-run to refresh the output.
for match in dep_matches:
    pattern_name = match[0]  # match id; not used below
    matches = match[1]
    p_object, p_prep_to_object, p_prep_subject = matches[0], matches[1], matches[2]
    print(f"{doc}\n\t{doc[p_object]}: {doc[p_prep_subject]}")

I want a black car with navigation system and moderate price.
	car: <generator object at 0x7fc1d8c95cb0>
