<a href="https://colab.research.google.com/github/Danzigerrr/MultiClass-Entity-Linking-System/blob/NER-datasets/NED_simple_NED_with_DBpedia_Wikidata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests

In [2]:
# Mocked Flair NER output, used here instead of running a real tagger.
sentence = (
    "Notre Dame, the iconic medieval cathedral in Paris, "
    "reopens after five years of speedy reconstruction work."
)
# One dict per recognised span: surface text, entity type label and score.
ner_spans = [
    dict(text=surface, type=label, score=value)
    for surface, label, value in (
        ("Notre Dame", "FAC", 1.0),
        ("Paris", "GPE", 1.0),
        ("five years", "DATE", 1.0),
    )
]

## DBpedia


In [3]:
# Maps each OntoNotes entity label to the DBpedia classes tried for it.
# "DBpedia" is ALWAYS a list of class names: consumers iterate over it, and a
# bare string would be iterated character by character.
ontonotes_to_dbpedia_mapping = [
    {"OntoNotes": "CARDINAL", "DBpedia": ["Identifier"]},
    {"OntoNotes": "DATE", "DBpedia": ["TimePeriod"]},
    {"OntoNotes": "EVENT", "DBpedia": ["Event", "Activity"]},
    {"OntoNotes": "FAC", "DBpedia": ["ArchitecturalStructure"]},
    {"OntoNotes": "GPE", "DBpedia": ["Place", "EthnicGroup"]},
    {"OntoNotes": "LANGUAGE", "DBpedia": ["Language"]},
    {"OntoNotes": "LAW", "DBpedia": ["TopicalConcept", "Work"]},
    {"OntoNotes": "LOC", "DBpedia": ["Place", "TopicalConcept"]},
    {"OntoNotes": "MONEY", "DBpedia": ["Currency"]},
    {"OntoNotes": "NORP", "DBpedia": ["EthnicGroup", "Agent"]},
    {"OntoNotes": "ORDINAL", "DBpedia": ["Identifier"]},
    {"OntoNotes": "ORG", "DBpedia": ["Agent", "PersonFunction"]},
    {"OntoNotes": "PERCENT", "DBpedia": ["UnitOfWork"]},
    {"OntoNotes": "PERSON", "DBpedia": ["Agent", "PersonFunction"]},
    {"OntoNotes": "PRODUCT", "DBpedia": ["Device", "Work", "MeanOfTransportation"]},
    {"OntoNotes": "QUANTITY", "DBpedia": ["UnitOfWork", "Identifier"]},
    {"OntoNotes": "TIME", "DBpedia": ["TimePeriod"]},
    {"OntoNotes": "WORK_OF_ART", "DBpedia": ["Work", "Award"]},

    # Remaining DBpedia classes, listed explicitly; they have no OntoNotes
    # label assigned (empty string).
    {"OntoNotes": "", "DBpedia": ["Species"]},
    {"OntoNotes": "", "DBpedia": ["SportsSeason"]},
    {"OntoNotes": "", "DBpedia": ["ChemicalSubstance"]},
    {"OntoNotes": "", "DBpedia": ["Biomolecule"]},
    {"OntoNotes": "", "DBpedia": ["Disease"]},
    {"OntoNotes": "", "DBpedia": ["Food"]},
    {"OntoNotes": "", "DBpedia": ["AnatomicalStructure"]},
    {"OntoNotes": "", "DBpedia": ["Name"]},
    {"OntoNotes": "", "DBpedia": ["Colour"]},
    {"OntoNotes": "", "DBpedia": ["Pandemic"]},
    {"OntoNotes": "", "DBpedia": ["SportCompetitionResult"]},
    {"OntoNotes": "", "DBpedia": ["MedicalSpecialty"]},
]


In [4]:
# URL of the DBpedia Lookup search API that search_dbpedia sends its GET requests to.
DBPEDIA_LOOKUP_ENDPOINT = "https://lookup.dbpedia.org/api/search"

In [5]:
def search_dbpedia(entity_text, dbpedia_type=None, max_results=3, timeout=10):
    """
    Query the DBpedia Lookup API and return the highest-scoring hit for an entity.

    Args:
        entity_text: Text to search for.
        dbpedia_type: Optional DBpedia class name. When given it is sent as
            ``typeName`` together with ``typeNameRequired=true``.
        max_results: Value sent as ``maxResults``.
        timeout: Seconds to wait for the HTTP response; without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        dict with keys "Label", "URI", "Description" and "score". If the
        request fails, the body is not valid JSON, or no document has a usable
        score, a placeholder with Label "No match found" and score 0 is
        returned.
    """
    def _first(doc, key, default):
        # Fields are read as lists; tolerate missing, empty or scalar values.
        values = doc.get(key)
        if values is None:
            return default
        if isinstance(values, (list, tuple)):
            return values[0] if values else default
        return values

    def _strip_highlight(text):
        # Remove the <B>...</B> markup around matched terms.
        return str(text).replace('<B>', '').replace('</B>', '')

    no_match = {"Label": "No match found", "URI": "", "Description": "No description available", "score": 0}

    params = {
        "query": entity_text,
        "format": "JSON",
        "maxResults": max_results,
    }
    if dbpedia_type:
        params["typeName"] = dbpedia_type
        params["typeNameRequired"] = "true"

    try:
        response = requests.get(DBPEDIA_LOOKUP_ENDPOINT, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where
        # they are not a RequestException subclass.
        print(f"Error querying DBpedia for type {dbpedia_type}: {e}")
        return no_match

    docs = data.get('docs') if isinstance(data, dict) else None

    best_result = None
    highest_score = float('-inf')
    for doc in docs or []:
        try:
            score = float(_first(doc, 'score', 0))
        except (TypeError, ValueError):
            # A malformed score should not discard the other documents.
            continue
        if score > highest_score:
            highest_score = score
            best_result = {
                "Label": _strip_highlight(_first(doc, 'label', 'Unknown')),
                "URI": _first(doc, 'resource', 'Unknown'),
                "Description": _strip_highlight(_first(doc, 'comment', 'No description available')),
                "score": highest_score,
            }

    return best_result if best_result else no_match


In [6]:
print("The following entities were disambiguated using DBpedia Lookup:")
for span in ner_spans:
    entity_text, entity_type = span["text"], span["type"]

    # DBpedia classes of the first mapping row whose OntoNotes label matches.
    matching_rows = [
        row["DBpedia"]
        for row in ontonotes_to_dbpedia_mapping
        if row["OntoNotes"] == entity_type
    ]
    dbpedia_classes = matching_rows[0] if matching_rows else []

    # One lookup per mapped class; keep only hits carrying a usable score.
    scored_hits = []
    for dbpedia_class in dbpedia_classes:
        hit = search_dbpedia(entity_text, dbpedia_type=dbpedia_class)
        hit_score = hit.get("score")
        if hit_score and hit_score > -1:
            scored_hits.append(hit)

    # Highest score wins; on a tie the hit from the earlier class is kept.
    best_result = max(scored_hits, key=lambda hit: hit["score"], default=None)

    # Report the winner, or the absence of one.
    if not best_result or best_result["Label"] == "No match found":
        print(f'"{entity_text}" → No match found')
    else:
        print(f'"{entity_text}" → {best_result}')


The following entities were disambiguated using DBpedia Lookup:
"Notre Dame" → {'Label': 'Notre-Dame de Paris', 'URI': 'http://dbpedia.org/resource/Notre-Dame_de_Paris', 'Description': 'Notre-Dame de Paris (; French: [nɔtʁə dam də paʁi] (); meaning "Our Lady of Paris"), referred to', 'score': 8339.885}
"Paris" → {'Label': 'Paris', 'URI': 'http://dbpedia.org/resource/Paris', 'Description': 'Paris (French pronunciation: \u200b[paʁi] ()) is the capital and most populous city of France, with a', 'score': 59193.582}
"five years" → {'Label': '1905', 'URI': 'http://dbpedia.org/resource/1905', 'Description': '1905 (MCMV) was a common year starting on Sunday of the Gregorian calendar and a common year', 'score': 537.7726}


## Wikidata

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [8]:
# Both constants hold the same Wikidata API URL; the "action" request parameter
# (wbsearchentities vs. wbgetentities) selects between searching and fetching.
WIKIDATA_SEARCH_ENDPOINT = "https://www.wikidata.org/w/api.php"
WIKIDATA_GET_ENTITY_ENDPOINT = "https://www.wikidata.org/w/api.php"

In [9]:
def search_wikidata(entity_text, max_results=3, timeout=10):
    """
    Search Wikidata for entities matching a text and describe each hit.

    Args:
        entity_text: Text sent as the ``search`` parameter of ``wbsearchentities``.
        max_results: Value sent as ``limit``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list of dicts with keys "Label", "Description", "URL", "ID" and "Type",
        in the order returned by the API. "Type" comes from get_entity_type,
        which issues further requests per hit. Returns [] if the search request
        fails or the response body is not valid JSON.
    """
    params = {
        "action": "wbsearchentities",
        "search": entity_text,
        "format": "json",
        "language": "en",
        "uselang": "en",
        "limit": max_results,
    }

    try:
        response = requests.get(WIKIDATA_SEARCH_ENDPOINT, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where
        # they are not a RequestException subclass.
        print(f"Error querying Wikidata: {e}")
        return []

    hits = data.get('search') if isinstance(data, dict) else None

    results = []
    for item in hits or []:
        entity_id = item['id']
        # Resolve the entity's "instance of" type with a follow-up lookup.
        type_info = get_entity_type(entity_id)
        results.append({
            # A hit may come without a label; fall back to its ID so that
            # downstream string handling still receives text.
            "Label": item.get('label', entity_id),
            "Description": item.get('description', 'No description available'),
            "URL": f"https://www.wikidata.org/wiki/{entity_id}",
            "ID": entity_id,
            "Type": type_info,
        })

    return results


In [10]:

def get_entity_type(entity_id, timeout=10):
    """
    Return the label of an entity's type, taken from its 'instance of' (P31) claims.

    Args:
        entity_id: Wikidata entity ID to inspect.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The result of get_entity_label for the first P31 claim that carries a
        concrete entity value, or "Unknown" when the request fails, the entity
        is missing from the response, or no such claim exists.
    """
    params = {
        "action": "wbgetentities",
        "ids": entity_id,
        "sites": "wikidata",
        "props": "claims",
        "format": "json",
    }

    try:
        response = requests.get(WIKIDATA_GET_ENTITY_ENDPOINT, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where
        # they are not a RequestException subclass.
        print(f"Error fetching entity type for {entity_id}: {e}")
        return "Unknown"

    entities = data.get("entities") if isinstance(data, dict) else None
    entity = (entities or {}).get(entity_id) or {}
    # `or {}` also covers an empty claims collection serialised as a list.
    claims = entity.get("claims") or {}

    for claim in claims.get("P31") or []:
        # "novalue"/"somevalue" snaks have no "datavalue"; skip them instead
        # of raising KeyError.
        value = ((claim.get("mainsnak") or {}).get("datavalue") or {}).get("value")
        if isinstance(value, dict) and value.get("id"):
            return get_entity_label(value["id"])

    return "Unknown"


In [11]:
def compute_similarity(entity_text, candidates):
    """
    Rank candidates by TF-IDF cosine similarity between the query text and
    each candidate's label plus description.

    Args:
        entity_text: Query text compared against every candidate.
        candidates: list of dicts with "Label" and "Description" keys. Each
            dict is updated in place with a "Similarity" key.

    Returns:
        A new list with the same candidate dicts, sorted by descending
        "Similarity". Returns [] for an empty candidate list. If no text
        yields any TF-IDF term (e.g. only stop words), every candidate gets
        similarity 0.0 and the input order is kept.
    """
    # Nothing to rank; cosine_similarity rejects an empty candidate matrix.
    if not candidates:
        return []

    # Row 0 is the query; rows 1..n are the candidates, in input order.
    documents = [entity_text]
    documents.extend(
        candidate['Label'] + " " + candidate['Description'] for candidate in candidates
    )

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # Raised for an empty vocabulary (all documents reduced to nothing);
        # no candidate can be preferred, so score them all as 0.0.
        for candidate in candidates:
            candidate['Similarity'] = 0.0
        return list(candidates)

    # Similarity of the query row against every candidate row.
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

    for idx, candidate in enumerate(candidates):
        candidate['Similarity'] = float(cosine_similarities[0][idx])

    return sorted(candidates, key=lambda x: x['Similarity'], reverse=True)

In [14]:
def get_entity_label(entity_id, timeout=10):
    """
    Fetch the English label of a Wikidata entity.

    Args:
        entity_id: Wikidata entity ID whose label is requested.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The English label string, or "Unknown" when the request fails, the
        entity is missing from the response, or it has no English label.
    """
    params = {
        "action": "wbgetentities",
        "ids": entity_id,
        "format": "json",
        "props": "labels",
        "languages": "en"
    }

    try:
        response = requests.get(WIKIDATA_GET_ENTITY_ENDPOINT, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where
        # they are not a RequestException subclass.
        print(f"Error fetching label for entity type {entity_id}: {e}")
        return "Unknown"

    entities = data.get("entities") if isinstance(data, dict) else None
    entity = (entities or {}).get(entity_id) or {}
    # Entities without an English label have no "en" key; `or {}` also covers
    # an empty labels collection serialised as a list.
    labels = entity.get("labels") or {}
    return (labels.get("en") or {}).get("value", "Unknown")


In [22]:
def get_parent_types(wikidata_id, timeout=10):
    """
    Fetch the labels of an entity's direct 'instance of' (P31) values.

    Only the entity's own P31 claims are read; the class hierarchy above those
    values is not traversed.

    Args:
        wikidata_id: Wikidata entity ID to inspect.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list of labels, one per P31 claim that carries a concrete entity value,
        in claim order. Each label comes from get_entity_label. Returns [] when
        the request fails, the entity is missing from the response, or it has
        no usable P31 claim.
    """
    params = {
        "action": "wbgetentities",
        "ids": wikidata_id,
        "props": "claims",
        "format": "json"
    }

    try:
        # Entity fetch, so use the entity endpoint constant like the other
        # wbgetentities callers in this notebook.
        response = requests.get(WIKIDATA_GET_ENTITY_ENDPOINT, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where
        # they are not a RequestException subclass.
        print(f"Error querying Wikidata for parent types of {wikidata_id}: {e}")
        return []

    entities = data.get("entities") if isinstance(data, dict) else None
    entity = (entities or {}).get(wikidata_id) or {}
    # `or {}` also covers an empty claims collection serialised as a list.
    claims = entity.get("claims") or {}

    parent_types = []
    for claim in claims.get("P31") or []:
        # "novalue"/"somevalue" snaks have no "datavalue"; skip them instead
        # of raising KeyError.
        value = ((claim.get("mainsnak") or {}).get("datavalue") or {}).get("value")
        if isinstance(value, dict) and value.get("id"):
            # Resolve the ID to a human-readable label.
            parent_types.append(get_entity_label(value["id"]))

    return parent_types


In [23]:
# Look up every NER span on Wikidata and print each candidate that comes back.
for span in ner_spans:
    entity_text = span["text"]
    print(f"Disambiguating entity: {entity_text}")
    results = search_wikidata(entity_text)

    if not results:
        print(f"No matches found for '{entity_text}'\n")
    else:
        for result in results:
            # Candidate summary, one field per line.
            print(
                f"Best match for '{entity_text}':",
                f"Label: {result['Label']}",
                f"Description: {result['Description']}",
                f"URL: {result['URL']}",
                f"Type: {result['Type']}",
                "Parent types (hierarchy from detailed to general):",
                sep="\n",
            )

            # Labels returned by get_parent_types for this candidate, printed in order.
            wikidata_id = result['ID']
            for parent_label in get_parent_types(wikidata_id):
                print(f"- {parent_label}")
            print("\n")
    print("----------\n")


Disambiguating entity: Notre Dame
Best match for 'Notre Dame':
Label: Mary
Description: mother of Jesus
URL: https://www.wikidata.org/wiki/Q345
Type: human biblical figure
Parent types (hierarchy from detailed to general):
- human biblical figure
- human


Best match for 'Notre Dame':
Label: University of Notre Dame
Description: Catholic university located in South Bend, Indiana, United States
URL: https://www.wikidata.org/wiki/Q178848
Type: private university
Parent types (hierarchy from detailed to general):
- private university
- private not-for-profit educational institution
- Catholic university
- academic publisher
- open-access publisher


Best match for 'Notre Dame':
Label: Notre-Dame de Paris
Description: cathedral in Paris
URL: https://www.wikidata.org/wiki/Q2981
Type: Catholic cathedral
Parent types (hierarchy from detailed to general):
- Catholic cathedral
- minor basilica
- parish church
- tourist attraction


----------

Disambiguating entity: Paris
Best match for 'Paris'