In [2]:
from treelib import Node, Tree
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.tokens import Span
import pandas as pd
from sense2vec import Sense2VecComponent
nlp = spacy.load('en_core_web_lg')
s2v = nlp.add_pipe("sense2vec")
s2v.from_disk("../../s2v_reddit_2019_lg")

<sense2vec.component.Sense2VecComponent at 0x7fde71ad53d0>

In [7]:
d = {'x': 1, 'y': 2}
import json
json.dumps(d, indent=2)

'{\n  "x": 1,\n  "y": 2\n}'

In [3]:
queries_df = pd.read_csv("../queries/query_20.tsv", sep="\t", header=None)
getq = lambda i: queries_df.loc[i, 1]
queries_df.head()

Unnamed: 0,0,1
0,0,I want a red car with heated seats.
1,1,I want a blue car with CarPlay.
2,2,I want an old and cheap car with emission volu...
3,3,I want an expensive car with all configurations.
4,4,I want a black car with navigation system and ...


In [4]:
from spacy.matcher import DependencyMatcher
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

def custom_tokenizer(nlp):
    inf = list(nlp.Defaults.infixes)               # Default infixes
    inf.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")    # Remove the generic op between numbers or between a number and a -
    inf = tuple(inf)                               # Convert inf to tuple
    infixes = inf + tuple([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])  # Add the removed rule after subtracting (?<=[0-9])-(?=[0-9]) pattern
    infixes = [x for x in infixes if '-|–|—|--|---|——|~' not in x] # Remove - between letters rule
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                rules=nlp.Defaults.tokenizer_exceptions)
class FeatureExtractor:
    def __init__(self):
        self.dep_matcher = DependencyMatcher(vocab=nlp.vocab)
        main_patterns = [
                # [
                #     {'RIGHT_ID': 'p_subject', 'RIGHT_ATTRS': {'TEXT': 'car', 'DEP': 'dobj'}},
                #     {'LEFT_ID': 'p_subject', 'REL_OP': '>', 'RIGHT_ID': 'p_prep_to_subject', 'RIGHT_ATTRS': {'DEP': 'prep'}},
                #     {'LEFT_ID': 'p_prep_to_subject', 'REL_OP': '>', 'RIGHT_ID': 'p_prep_object', 'RIGHT_ATTRS': {'DEP': {"REGEX": "\s*"}}}
                # ],
                [
                {'RIGHT_ID': 'p_object', 'RIGHT_ATTRS': {'DEP': {"IN": ['dobj', 'pobj']}}},
                {'LEFT_ID': 'p_object', 'REL_OP': '>', 'RIGHT_ID': 'p_object_mod', 'RIGHT_ATTRS': {'DEP': 'amod'}},
                ]
            ] # TODO: CONFIG HERE
        self.dep_matcher.add(f"main_patterns", patterns=main_patterns)
    
    @staticmethod
    def traverse(s, tree):
        for c in s.children:
            tree.create_node(c, c,  parent=s, data=c)
            tree = FeatureExtractor.traverse(c, tree=tree)
        return tree
    
    @staticmethod   
    def build_tree(query):
        doc = nlp(query)
        sent_tree = Tree()
        s = list(doc.sents)[0].root
        sent_tree.create_node(s, s, data=s)
        sent_tree = FeatureExtractor.traverse(s, sent_tree)
        return sent_tree
    
    @staticmethod  
    def build_tree_from_node(node):
        sent_tree = Tree()
        sent_tree.create_node(node, node, data=node)
        sent_tree = FeatureExtractor.traverse(node, sent_tree)
        return sent_tree
    
    @staticmethod
    def compile_secondary_patterns(main_subject=""):
        secondary_patterns = [
            [
                {'RIGHT_ID': 'p_object', 'RIGHT_ATTRS': {'DEP': {"IN": ['dobj', 'pobj']}}},
                {'LEFT_ID': 'p_object', 'REL_OP': '>', 'RIGHT_ID': 'p_object_mod', 'RIGHT_ATTRS': {'DEP': {"IN": ['amod', 'compound']}}},
            ],
            [
                {'RIGHT_ID': 'p_conj', 'RIGHT_ATTRS': {'DEP': 'conj'}},
                {'LEFT_ID': 'p_conj', 'REL_OP': '>', 'RIGHT_ID': 'p_conj_mod', 'RIGHT_ATTRS': {'DEP': {"IN": ['amod', 'advmod']}}},
            ]
        ]
        return secondary_patterns
    
    def extract(self, query):
        nlp.tokenizer = custom_tokenizer(nlp)
        doc = nlp(query)
        secondary_matcher = DependencyMatcher(vocab=nlp.vocab)
        secondary_matcher.add("secondary_patterns", FeatureExtractor.compile_secondary_patterns())
        dep_matches = secondary_matcher(doc)
        matches_str = []
        for match in dep_matches:
            matches = match[1]
            p_1, p_2 = matches[0], matches[1]
            # print(f"\t-> {doc[p_1]} {doc[p_2]}")
            matches_str.append(f"{doc[p_2]} {doc[p_1]}")
        return matches_str        
        

In [6]:
import spacy
from spacy import displacy
from pathlib import Path
nlp = spacy.load("en_core_web_lg")
doc = nlp("I want a blue car with strong horsepower.")
svg = displacy.render(doc, style="dep", jupyter=False)
file_name = "dependency.svg"
output_path = Path(f"{file_name}")
output_path.open("w", encoding="utf-8").write(svg)

6349

In [5]:
fe = FeatureExtractor()

In [6]:
fe.extract("I want a red car with heated seats.")

['red car', 'heated seats']

In [7]:
fe.extract("I want a red SUV with heated seats.")

['red SUV', 'heated seats']

In [8]:
fe.extract("I want a black car with wind shield.")

['black car', 'wind shield']

In [9]:
for i in range(20):
    print(f"extract {i}")
    print(fe.extract(getq(i)))

extract 0
['red car', 'heated seats']
extract 1
['blue car']
extract 2
['old car', 'emission volume']
extract 3
['expensive car']
extract 4
['black car', 'navigation system', 'moderate price']
extract 5
['high performance']
extract 6
['electric car', 'lowest price']
extract 7
['cheap car', 'V-8 car', 'engine car']
extract 8
['white car', 'rich technology']
extract 9
['diesel-driven car', 'high power', 'engine power']
extract 10
['small car', 'volume car']
extract 11
['Ford SUV']
extract 12
['red car', 'high configurations', 'safety configurations']
extract 13
['remotely start']
extract 14
['cost-effective car']
extract 15
['limited budget']
extract 16
['rich levels', 'gear levels', 'high power']
extract 17
['four-wheels-drive sedan', 'BMW sedan']
extract 18
['moderate car', 'volume car', 'moderate price']
extract 19
['good performance', 'snowy weather']


In [10]:
i = 8
for d in fe.extract(getq(i)):
    print(d)
    doc = nlp(d)
    print(doc[0::]._.s2v_most_similar(3))

white car
[(('silver car', 'NOUN'), 0.9035), (('black car', 'NOUN'), 0.8969), (('cammer', 'NOUN'), 0.8732)]
rich technology


ValueError: Can't find key rich_technology|NOUN in table

In [11]:
sent_trees = [
    FeatureExtractor.build_tree(getq(i)) for i in range(20)
]

In [12]:
sent_trees[13].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: keep, dep_: xcomp
│   ├── text: to, dep_: aux
│   ├── text: warm, dep_: acomp
│   ├── text: in, dep_: prep
│   │   └── text: winter, dep_: pobj
│   ├── text: and, dep_: cc
│   └── text: start, dep_: conj
│       ├── text: car, dep_: dobj
│       │   └── text: the, dep_: det
│       └── text: remotely, dep_: advmod
└── text: ., dep_: punct



In [13]:
sent_trees[0].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: a, dep_: det
│   ├── text: red, dep_: amod
│   └── text: with, dep_: prep
│       └── text: seats, dep_: pobj
│           └── text: heated, dep_: amod
└── text: ., dep_: punct



In [14]:
sent_trees[1].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: a, dep_: det
│   ├── text: blue, dep_: amod
│   └── text: with, dep_: prep
│       └── text: CarPlay, dep_: pobj
└── text: ., dep_: punct



In [15]:
sent_trees[2].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: an, dep_: det
│   ├── text: old, dep_: amod
│   │   ├── text: and, dep_: cc
│   │   └── text: cheap, dep_: conj
│   └── text: with, dep_: prep
│       └── text: volume, dep_: pobj
│           ├── text: emission, dep_: compound
│           └── text: of, dep_: prep
│               └── text: 3.0, dep_: pobj
└── text: ., dep_: punct



In [16]:
sent_trees[3].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: an, dep_: det
│   ├── text: expensive, dep_: amod
│   └── text: with, dep_: prep
│       └── text: configurations, dep_: pobj
│           └── text: all, dep_: det
└── text: ., dep_: punct



In [17]:
sent_trees[4].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: a, dep_: det
│   ├── text: black, dep_: amod
│   └── text: with, dep_: prep
│       └── text: system, dep_: pobj
│           ├── text: navigation, dep_: compound
│           ├── text: and, dep_: cc
│           └── text: price, dep_: conj
│               └── text: moderate, dep_: amod
└── text: ., dep_: punct



In [18]:
sent_trees[5].show(data_property=["text", "dep_"])

text: want, dep_: ROOT
├── text: I, dep_: nsubj
├── text: car, dep_: dobj
│   ├── text: a, dep_: det
│   └── text: with, dep_: prep
│       └── text: performance, dep_: pobj
│           └── text: high, dep_: amod
└── text: ., dep_: punct

