# Named Entity Recognition.

spacy is used as key library

# 1)- Importing key Modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
import warnings
warnings.filterwarnings('ignore')

In [2]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [3]:
nlp = spacy.load('en_core_web_sm')

# 2)- NER

In [4]:
doc = nlp(u"Linus Benedict Torvalds is a Finnish-American software engineer who is the creator, and for a long time, principal developer of the Linux kernel, which became the kernel for operating systems such as the Linux operating systems, Android, and Chrome OS.")

In [5]:
doc

Linus Benedict Torvalds is a Finnish-American software engineer who is the creator, and for a long time, principal developer of the Linux kernel, which became the kernel for operating systems such as the Linux operating systems, Android, and Chrome OS.

In [6]:
for entity in doc.ents:
    print(entity.text,entity.label_)

Linus Benedict Torvalds PERSON
Finnish NORP
Linux PERSON
Linux PERSON
Chrome OS ORG


We can see Linus correct but, Linux is not a person. So, spacy does not give 100% correct.

In [7]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Linus Linus PROPN NNP compound Xxxxx True False
Benedict Benedict PROPN NNP compound Xxxxx True False
Torvalds Torvalds PROPN NNP nsubj Xxxxx True False
is be AUX VBZ ROOT xx True True
a a DET DT det x True True
Finnish finnish ADJ JJ amod Xxxxx True False
- - PUNCT HYPH punct - False False
American american ADJ JJ amod Xxxxx True False
software software NOUN NN compound xxxx True False
engineer engineer NOUN NN attr xxxx True False
who who PRON WP nsubj xxx True True
is be AUX VBZ relcl xx True True
the the DET DT det xxx True True
creator creator NOUN NN attr xxxx True False
, , PUNCT , punct , False False
and and CCONJ CC cc xxx True True
for for ADP IN prep xxx True True
a a DET DT det x True True
long long ADJ JJ amod xxxx True False
time time NOUN NN pobj xxxx True False
, , PUNCT , punct , False False
principal principal ADJ JJ amod xxxx True False
developer developer NOUN NN pobj xxxx True False
of of ADP IN prep xx True True
the the DET DT det xxx True True
Linux Linux PROPN N

- ext: The original word text.
- Lemma: The base form of the word.
- POS: The simple part-of-speech tag.
- Tag: The detailed part-of-speech tag.
- Dep: Syntactic dependency, i.e. the relation between tokens.
- Shape: The word shape – capitalization, punctuation, digits.
- is alpha: Is the token an alpha character?
- is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [8]:
spacy.explain('NORP')

'Nationalities or religious or political groups'

In [9]:
spacy.explain('PROPN')

'proper noun'

In [10]:
 #Visualize With DiSplaCy
from spacy import displacy
displacy.render(doc,style='ent',jupyter=True)

# 3)- Custom NER

In [11]:
from spacy.pipeline import EntityRuler

In [12]:
nlp = spacy.load('en_core_web_sm')

In [13]:
ruler = EntityRuler(nlp)

**create custom pattern**

In [14]:
patterns = [{"label": "ORG", "pattern": "InnoLab"},
            {"label": "GPE", "pattern": [{"LOWER": "greater"}, {"LOWER": "berlin"}]}]

In [15]:
patterns

[{'label': 'ORG', 'pattern': 'InnoLab'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'greater'}, {'LOWER': 'berlin'}]}]

In [16]:
ruler.add_patterns(patterns)

In [17]:
nlp.add_pipe(ruler)

In [18]:
doc = nlp("InnoLab is a startup based in Greater Berlin region.")

In [19]:
doc

InnoLab is a startup based in Greater Berlin region.

In [20]:
for ent in doc.ents:
    print(ent.text, ent.label_)

InnoLab ORG
Greater Berlin GPE


In [21]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

InnoLab InnoLab PROPN NNP nsubj XxxxXxx True False
is be AUX VBZ ROOT xx True True
a a DET DT det x True True
startup startup NOUN NN attr xxxx True False
based base VERB VBN acl xxxx True False
in in ADP IN prep xx True True
Greater Greater PROPN NNP compound Xxxxx True False
Berlin Berlin PROPN NNP compound Xxxxx True False
region region NOUN NN pobj xxxx True False
. . PUNCT . punct . False False


In [22]:
displacy.render(doc,style='ent',jupyter=True)

### Adding Titles to names

In [23]:
doc = nlp('Dr. Jan Van Ande is new director of innovation at Rotterdam Business School')

In [24]:
doc

Dr. Jan Van Ande is new director of innovation at Rotterdam Business School

In [25]:
for entity in doc.ents:
    print(entity.text,entity.label_)

Jan Van Ande PERSON
Rotterdam Business School ORG


We didnt get title of Jan Van Ande and being Dutch, it might be an issue for him. Professor like titles as they have earned them so we need to modify our model.

In [26]:
def add_title(doc):
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and ent.start!=0:
            prev_token = doc[ent.start-1]
            if prev_token.text in ('Dr', 'Dr.', 'Mr', 'Mr.'):
                new_ent = Span(doc, ent.start-1, ent.end, label=ent.label)
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
    doc.ents = new_ents
    return doc

In [27]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(add_title, after='ner')

In [28]:
doc = nlp('Dr. Jan Van Ande is new director of innovation at Rotterdam Business School')

In [29]:
for entity in doc.ents:
    print(entity.text,entity.label_)

Dr. Jan Van Ande PERSON


In [30]:
doc = nlp('Mr. Jan Van Ande is new director of innovation at Rotterdam Business School')

In [31]:
for entity in doc.ents:
    print(entity.text,entity.label_)

Mr. Jan Van Ande PERSON
