### Using a custom-trained word2vec model with vecner

In [1]:
from vecner import ExactMatcher, ThresholdMatcher
from gensim.models import KeyedVectors

# Path to the custom word2vec vectors previously saved with Gensim.
MODEL_PATH = '../models/custom-w2vec.model'

# Load the saved vectors into memory.
model = KeyedVectors.load(MODEL_PATH)

# Sanity check: show the nearest neighbours of a known term together with
# the type of the loaded object, so a bad load is obvious right away.
(model.most_similar('pfs'), type(model))

([('mpfs', 0.7197115421295166),
  ('pfs/os', 0.6994421482086182),
  ('dor', 0.6904733180999756),
  ('os', 0.6577452421188354),
  ('survival', 0.6368557810783386),
  ('rpfs', 0.6007528305053711),
  ('idfs', 0.5724086761474609),
  ('pfs2', 0.5723546147346497),
  ('km', 0.5712378621101379),
  ('orr', 0.5679236054420471)],
 gensim.models.keyedvectors.KeyedVectors)

In [2]:
# Sample free text used throughout the notebook. It is spelled out line by
# line so the leading newline and the trailing spaces on the first two
# sentences are visible and preserved exactly.
text = (
    "\n"
    "I am really amazed by the overall survival, \n"
    "however still concerned with 3-year treatment duration. \n"
    "Would be interesting to see the predictive biomarkers in NSCLC advanced patients.\n"
)

# Echo the sample so the reader can see what will be annotated.
print(text)


I am really amazed by the overall survival, 
however still concerned with 3-year treatment duration. 
Would be interesting to see the predictive biomarkers in NSCLC advanced patients.



In [3]:
# Hand-written lexicon: each key is an entity label and each value is the
# list of seed terms that belong to that label.
bio_lexicon = {
    'efficacy': ['overall survival', 'pfs'],
    'diagnostics': ['marker'],
    'time': ['year', 'month'],
    'patient groups': ['squamous', 'resectable'],
}

# Both matchers are built on the same spaCy pipeline.
SPACY_MODEL = 'en_core_web_sm'

# Minimum cosine similarity for the threshold matcher to accept a candidate.
SIMILARITY_THRESHOLD = 0.55

# Exact matcher: tags entities that appear exactly as written in the lexicon.
matcher = ExactMatcher(bio_lexicon, spacy_model=SPACY_MODEL)

# Threshold matcher: tags entities based on a cosine similarity threshold
# against the word2vec model. `in_pipeline=True` is set because it is chained
# after the exact matcher rather than run on raw text.
# NOTE(review): exact semantics of `in_pipeline` / `chunking_method` are
# defined by vecner — confirm against its documentation.
thresholdmatcher = ThresholdMatcher(
    bio_lexicon,
    w2vec_model=model,
    in_pipeline=True,
    spacy_model=SPACY_MODEL,
    chunking_method='noun_chunking',
    threshold=SIMILARITY_THRESHOLD,
)

In [4]:
# Stage 1: tag the entities that match the lexicon exactly.
exact_output = matcher.map(text=text)

# Stage 2: feed the exact-match result (document, entities and ids) into the
# threshold matcher so it can add the inexact entities on top.
output = thresholdmatcher.map(
    document=exact_output['doc'],
    ents=exact_output['ents'],
    ids=exact_output['ids'],
)

In [5]:
from spacy import displacy

# Build the plain-dict payload for displacy: the raw document text plus the
# entity spans collected by the matchers.
annotated_text = output['doc'].text
annotated_ents = output['ents']
example = {'text': annotated_text, 'ents': annotated_ents}

# Render the entities inline in the notebook. `manual=True` is used because a
# dict is passed instead of a spaCy Doc.
# NOTE(review): with `jupyter=True` displacy displays the markup itself, so
# `html` is presumably None here — confirm before relying on its value.
html = displacy.render(example, style='ent', manual=True, jupyter=True)