In [4]:
import nltk
import spacy

nltk.download('wordnet')

# Load NER model
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package wordnet to /Users/yyh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# load in Questions/Answers
QaA = []

with open('QuestionsAndAnswers.txt', mode='r') as file:
    for line in file:
        q, a = line.strip().split(':/:')

        QaA.append({'question': q, 'answer': a})


# List of named entities
    PERSON:      People, including fictional.
    NORP:        Nationalities or religious or political groups.
    FAC:         Buildings, airports, highways, bridges, etc.
    ORG:         Companies, agencies, institutions, etc.
    GPE:         Countries, cities, states.
    LOC:         Non-GPE locations, mountain ranges, bodies of water.
    PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
    EVENT:       Named hurricanes, battles, wars, sports events, etc.
    WORK_OF_ART: Titles of books, songs, etc.
    LAW:         Named documents made into laws.
    LANGUAGE:    Any named language.
    DATE:        Absolute or relative dates or periods.
    TIME:        Times smaller than a day.
    PERCENT:     Percentage, including ”%“.
    MONEY:       Monetary values, including unit.
    QUANTITY:    Measurements, as of weight or distance.
    ORDINAL:     “first”, “second”, etc.
    CARDINAL:    Numerals that do not fall under another type.

In [6]:
NER = []

for qa in QaA:
    q_doc, a_doc = nlp(qa['question']), nlp(qa['answer'])
    
    dont_include_labels = ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

    print([q_doc]+[(entity.text, entity.label_) for entity in q_doc.ents if entity.label_ not in dont_include_labels])
    print([a_doc]+[(entity.text, entity.label_) for entity in a_doc.ents if entity.label_ not in dont_include_labels])

[Is it correct that Berlin is the capital of Germany?, ('Berlin', 'GPE'), ('Germany', 'GPE')]
[surely it should be Bonn? Quote from: Snoopy on December 09, 2017, 12:3, ('Bonn', 'GPE'), ('Snoopy', 'PERSON')]
[Does Japan have the highest life expectancy in the world?, ('Japan', 'GPE')]
[февруари 30, 2017 A study by a team of researchers from the Institute for Health Metrics and E, ('the Institute for Health Metrics', 'ORG')]
[True or False: Brazil’s official language is Portuguese., ('Brazil', 'GPE'), ('Portuguese', 'NORP')]
[Hinweis: Diese Aussage ist falsch! True or False: It is possible to take a bus from Cuiaba to Brasilia. Hinweis:, ('Hinweis', 'PERSON'), ('Diese Aussage', 'NORP'), ('Cuiaba', 'GPE'), ('Brasilia', 'GPE'), ('Hinweis', 'PERSON')]
[Would it be accurate to say the Eiffel Tower is in Paris?, ('the Eiffel Tower', 'LOC'), ('Paris', 'GPE')]
[surely. Would it be acceptable to say that the Eiffel tower is in Paris, France or in Paris, France? What you have, ('Eiffel', 'LOC'), 

In [None]:
import requests

def generate_candidates_api(mention, language="en", limit=10):
    url = "https://www.wikidata.org/w/api.php"

    params = {
        "action": "wbsearchentities",
        "search": mention,
        "language": language,
        "format": "json",
        "limit": limit,
    }

    response = requests.get(url, params=params)
    data = response.json()
    
    candidates = []

    for entity in data.get("search", []):
        candidates.append(entity["id"])
    return candidates



def get_entity_info(id, languages='en'):

    url = "https://www.wikidata.org/w/api.php"

    params = {
        "action": "wbgetentities",
        "ids": id,
        "languages": languages,
        "format": "json",
    }

    response = requests.get(url, params=params)
    data = response.json()
    
    print(data)

    label = data['entities'][id]['labels']['en']['value']
    description = data['entities'][id]['descriptions']['en']['value']
    
    claims = len(data['entities'][id]['claims'].keys())
    sitelinks =  len(data['entities'][id]['sitelinks'].keys())
    
    url = ''

    if data['entities'][id]['sitelinks'].get('enwiki'):
        base_url = 'https://en.wikipedia.org/wiki/'
        url = base_url + data['entities'][id]['sitelinks']['enwiki']['title'].replace(' ', '_')

    return {'label': label, 'description': description, 'claims': claims, 'sitelinks': sitelinks, 'url': url}

In [8]:
query = "Homer"

candidates = generate_candidates_api(query)

for id in candidates:
    get_entity_info(id)
    break

{'entities': {'Q19826483': {'pageid': 21428480, 'ns': 0, 'title': 'Q19826483', 'lastrevid': 2254804851, 'modified': '2024-09-30T20:59:50Z', 'type': 'item', 'id': 'Q19826483', 'labels': {'en': {'language': 'en', 'value': 'Homer'}}, 'descriptions': {'en': {'language': 'en', 'value': 'male given name'}}, 'aliases': {'en': [{'language': 'en', 'value': 'Homer (given name)'}, {'language': 'en', 'value': 'Homer (first name)'}]}, 'claims': {'P31': [{'mainsnak': {'snaktype': 'value', 'property': 'P31', 'hash': '477ef35019874affa392a3f6d7c7062f10a6d132', 'datavalue': {'value': {'entity-type': 'item', 'numeric-id': 12308941, 'id': 'Q12308941'}, 'type': 'wikibase-entityid'}, 'datatype': 'wikibase-item'}, 'type': 'statement', 'id': 'Q19826483$1c6b48be-4010-5fb1-fa74-7361d3152692', 'rank': 'normal'}], 'P1533': [{'mainsnak': {'snaktype': 'value', 'property': 'P1533', 'hash': '39adac6959180a96201a8c5120dbf82eebaa6f58', 'datavalue': {'value': {'entity-type': 'item', 'numeric-id': 16276929, 'id': 'Q1627