### Using a pretrained word2vec-style model (GloVe, trained on general corpora) with vecner

In [1]:
import gensim.downloader as api
from spacy import displacy
from vecner import ExactMatcher, ExtendedMatcher

# Ask the gensim downloader for its catalogue and show the top-level
# sections it offers (here: 'corpora' and 'models').
catalogue = api.info()
catalogue.keys()

dict_keys(['corpora', 'models'])

In [2]:
# Identifier of the pretrained, general-corpus embedding to fetch through
# the gensim downloader (the name indicates 100-dimensional GloVe vectors
# trained on Wikipedia + Gigaword).
pretrained_name = "glove-wiki-gigaword-100"
model = api.load(pretrained_name)

# Sanity check: the nearest neighbours of a common word should look sensible.
model.most_similar('food')

[('foods', 0.7469059824943542),
 ('supplies', 0.7264691591262817),
 ('products', 0.7225049138069153),
 ('meat', 0.7138239145278931),
 ('supply', 0.6732637882232666),
 ('feed', 0.670415461063385),
 ('medicines', 0.6687098145484924),
 ('meals', 0.6630423069000244),
 ('coffee', 0.6627735495567322),
 ('goods', 0.6610530614852905)]

In [3]:
# Sample restaurant review used as the input document for both matchers.
# Written as explicit pieces so the leading/trailing newlines and the
# trailing space after each comma are visible and preserved exactly.
text = (
    "\n"
    "The burger had absolutely no flavor, \n"
    "the place itself was totally dirty, \n"
    "the burger was overcooked and the staff incredibly rude.\n"
)

print(text)


The burger had absolutely no flavor, 
the place itself was totally dirty, 
the burger was overcooked and the staff incredibly rude.



In [4]:
# Hand-written lexicon: maps a category name to the list of terms that
# belong to it. The same dict is handed to both matchers.
food_lexicon = dict(
    service=['rude', 'service', 'friendly'],
    general=['clean', 'dirty', 'decoration', 'atmosphere'],
    food=['warm', 'cold', 'flavor', 'tasty', 'stale', 'disgusting', 'delicious'],
)

# Both matchers are given the same spaCy pipeline name.
spacy_pipeline = 'en_core_web_sm'

# Exact matcher: run first so that lexicon terms appearing verbatim in the
# text are not missed.
matcher = ExactMatcher(food_lexicon, spacy_model=spacy_pipeline)

# Extended matcher: expands the lexicon with terms the word-vector model
# considers similar, then matches those in the sequence.
# NOTE(review): `in_pipeline=True` presumably signals that it runs after the
# exact matcher, and `sensitivity=20` presumably controls how far the lexicon
# is expanded -- confirm both against the vecner documentation.
extendedmatcher = ExtendedMatcher(
    food_lexicon,
    w2vec_model=model,
    spacy_model=spacy_pipeline,
    chunking_method='edge_chunking',
    sensitivity=20,
    in_pipeline=True,
)

In [5]:
# Stage 1: exact lexicon matching on the raw text.
exact_output = matcher.map(text=text)

# Stage 2: extended matching, seeded with the document, entities and ids
# produced by the exact stage. `output` holds the final result.
output = extendedmatcher.map(
    document=exact_output['doc'],
    ents=exact_output['ents'],
    ids=exact_output['ids'],
)

In [6]:
# Visualise the matched entities with displaCy.
# `displacy` is imported in the notebook's top import cell, so every
# dependency is declared in one place.
#
# With manual=True, displaCy renders from a plain dict instead of a spaCy
# Doc: the document text plus the entity spans to highlight.
# NOTE(review): assumes output['ents'] is already in the span format that
# displaCy's manual mode expects -- confirm against the vecner documentation.
example = {
    'text': output['doc'].text,
    'ents': output['ents'],
}

displacy.render(example, style='ent', manual=True)