In [4]:
# imports and load spacy english language package
import spacy
from spacy import displacy
from spacy import tokenizer
nlp = spacy.load('en_core_web_lg')

# Sample passage about Nikola Tesla (per the original note, copied from a wiki page).
text = "Nikola Tesla (Serbian Cyrillic: Никола Тесла) was a Serbian-American inventor, electrical engineer, mechanical engineer, and futurist best known for his contributions to the design of the modern alternating current (AC) electricity supply system."

# Run the loaded pipeline over the passage.
doc = nlp(text)

# Keep the sentence spans around as a plain list.
sentences = [sentence for sentence in doc.sents]

# Describe each detected entity as (text, start offset, end offset, label).
ents = list(
    (span.text, span.start_char, span.end_char, span.label_)
    for span in doc.ents
)
print(ents)

# Print the raw entity spans as well.
print(doc.ents)

# displacy shows the text in a reader-friendly form with the entities tagged.
displacy.render(doc, style='ent', jupyter=True)

[('Nikola Tesla', 0, 12, 'PERSON'), ('Serbian', 14, 21, 'NORP'), ('Тесла', 39, 44, 'PERSON'), ('Serbian', 52, 59, 'NORP')]
(Nikola Tesla, Serbian, Тесла, Serbian)


In [6]:
# Display the entity labels exposed by the pipeline's "ner" component.
nlp.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [7]:
# spaCy entity types: each entity label mapped to its short description.
spacy_entity_types_dict = dict(
    PERSON="People, including fictional.",
    NORP="Nationalities or religious or political groups.",
    FAC="Buildings, airports, highways, bridges, etc.",
    ORG="Companies, agencies, institutions, etc.",
    GPE="Countries, cities, states.",
    LOC="Non-GPE locations, mountain ranges, bodies of water.",
    PRODUCT="Objects, vehicles, foods, etc. (Not services.)",
    EVENT="Named hurricanes, battles, wars, sports events, etc.",
    WORK_OF_ART="Titles of books, songs, etc.",
    LAW="Named documents made into laws.",
    LANGUAGE="Any named language.",
    DATE="Absolute or relative dates or periods.",
    TIME="Times smaller than a day.",
    PERCENT="Percentage, including '%'.",
    MONEY="Monetary values, including unit.",
    QUANTITY="Measurements, as of weight or distance.",
    ORDINAL="'first', 'second', etc.",
    CARDINAL="Numerals that do not fall under another type.",
)



In [8]:
# Coarse three-class mapping from spaCy entity labels to DBpedia ontology
# classes, stored as {spacy_label: dbpedia_class}.
# Target classes: Agent, TimePeriod, Place.
# FAC/GPE/LOC map to Place, DATE/TIME map to TimePeriod, and every other
# label is mapped to Agent.
# The time class is spelled "TimePeriod" (capital P), which is the casing of
# the DBpedia ontology class name; the previous "Timeperiod" would never
# compare equal to a DBpedia type string.
dbpedia_ontology_classes_3 = {
    "PERSON": "Agent",
    "NORP": "Agent",
    "FAC": "Place",
    "ORG": "Agent",
    "GPE": "Place",
    "LOC": "Place",
    "PRODUCT": "Agent",
    "EVENT": "Agent",
    "WORK_OF_ART": "Agent",
    "LAW": "Agent",
    "LANGUAGE": "Agent",
    "DATE": "TimePeriod",
    "TIME": "TimePeriod",
    "PERCENT": "Agent",
    "MONEY": "Agent",
    "QUANTITY": "Agent",
    "ORDINAL": "Agent",
    "CARDINAL": "Agent"
}


# agent, timeperiod, place, work, personFunction

# agent, timeperiod, place, work, personFunction, Species, Event







['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [2]:
import spacy
import spacy_dbpedia_spotlight
# Start from the regular English pipeline ...
nlp = spacy.load('en_core_web_lg')
# ... and append the DBpedia Spotlight stage to it.
nlp.add_pipe('dbpedia_spotlight')

# Annotate a sample sentence.
doc = nlp('The president of USA is calling Boris Johnson to decide what to do about coronavirus')

# List each entity as (surface text, label, knowledge-base id).
print(
    'Entities',
    [(span.text, span.label_, span.kb_id_) for span in doc.ents],
)

# Inspect the raw DBpedia Spotlight data stored on the first entity.
print(doc.ents[0]._.dbpedia_raw_result)

Entities [('USA', 'DBPEDIA_ENT', 'http://dbpedia.org/resource/United_States'), ('Boris Johnson', 'DBPEDIA_ENT', 'http://dbpedia.org/resource/Boris_Johnson'), ('coronavirus', 'DBPEDIA_ENT', 'http://dbpedia.org/resource/Coronavirus')]
{'@URI': 'http://dbpedia.org/resource/United_States', '@support': '553243', '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country', '@surfaceForm': 'USA', '@offset': '17', '@similarityScore': '0.9965798615743495', '@percentageOfSecondRank': '0.0023641208721890935'}


In [None]:
# First-level mapping from spaCy entity labels to DBpedia ontology classes.
# Allowed target classes: Thing, Agent, Place, Work.
# ORG maps to Agent (DBpedia's Organisation is an Agent subclass) and
# WORK_OF_ART maps to Work; these two values were previously swapped.
# NOTE(review): LANGUAGE -> "Place" looks questionable (a language is not a
# location); left unchanged pending confirmation of the intended class.
first_level_mapping = {
    "PERSON": "Agent",
    "NORP": "Agent",
    "FAC": "Place",
    "ORG": "Agent",
    "GPE": "Place",
    "LOC": "Place",
    "PRODUCT": "Work",
    "EVENT": "Thing",
    "WORK_OF_ART": "Work",
    "LAW": "Work",
    "LANGUAGE": "Place",
    "DATE": "Thing",
    "TIME": "Thing",
    "PERCENT": "Thing",
    "MONEY": "Thing",
    "QUANTITY": "Thing",
    "ORDINAL": "Thing",
    "CARDINAL": "Thing"
}

# Second-level mapping from spaCy entity labels to DBpedia ontology classes.
# Allowed target classes: Thing, Agent, Place, Work, Person, Organisation, Event.
# WORK_OF_ART maps to Work (it was previously mapped to Agent even though
# Work is one of the allowed classes).
# NOTE(review): LANGUAGE -> "Place" looks questionable; left unchanged
# pending confirmation of the intended class.
second_level_mapping = {
    "PERSON": "Person",
    "NORP": "Agent",
    "FAC": "Place",
    "ORG": "Organisation",
    "GPE": "Place",
    "LOC": "Place",
    "PRODUCT": "Work",
    "EVENT": "Event",
    "WORK_OF_ART": "Work",
    "LAW": "Work",
    "LANGUAGE": "Place",
    "DATE": "Thing",
    "TIME": "Thing",
    "PERCENT": "Thing",
    "MONEY": "Thing",
    "QUANTITY": "Thing",
    "ORDINAL": "Thing",
    "CARDINAL": "Thing"
}

# Third-level mapping from spaCy entity labels to DBpedia ontology classes.
# Allowed target classes: Thing, Agent, Place, Work, Person, Organisation,
# Event, TimePeriod, TopicalConcept.
# Named third_level_mapping so that it no longer overwrites the
# second_level_mapping dictionary defined just above it in this cell.
# WORK_OF_ART maps to Work (it was previously mapped to Agent even though
# Work is one of the allowed classes).
# NOTE(review): LANGUAGE -> "Place" looks questionable; left unchanged
# pending confirmation of the intended class.
third_level_mapping = {
    "PERSON": "Person",
    "NORP": "Agent",
    "FAC": "Place",
    "ORG": "Organisation",
    "GPE": "Place",
    "LOC": "Place",
    "PRODUCT": "Work",
    "EVENT": "Event",
    "WORK_OF_ART": "Work",
    "LAW": "Work",
    "LANGUAGE": "Place",
    "DATE": "TimePeriod",
    "TIME": "TimePeriod",
    "PERCENT": "Thing",
    "MONEY": "Thing",
    "QUANTITY": "Thing",
    "ORDINAL": "TopicalConcept",
    "CARDINAL": "TopicalConcept"
}