In [38]:
# Imports
import requests

In [39]:
# Api notebook

API_URL = "https://www.wikidata.org/w/api.php"


def get_entity_data(entity_id, languages='en') -> dict:
    """collect relevant information on an entity such as sitelinks, statements and description

    Args:
        entity_id (string): id of an entity
        languages (str, optional)

    Returns:
        dict: returns a detailed dictionairy of a given entity
    """

    response = requests.get(API_URL, params= { "action": "wbgetentities", "ids": entity_id, "format": "json", "languages": languages })
    
    if response.status_code == 200:
        data = response.json()

        if "entities" in data and entity_id in data["entities"]:
            entity = data.get('entities').get(entity_id)

            # Get label and description of entity
            label = entity['labels']['en']['value']
            description = entity['descriptions']['en']['value']


            # Get the number of linked sites
            num_sitelinks =  len(entity['sitelinks'].keys())

            # Wikepedia url

            # Get the sites linked to this entity
            wikipedia_url = entity['sitelinks'].get('enwiki')

            if wikipedia_url:
                url = 'https://en.wikipedia.org/wiki/' + wikipedia_url['title'].replace(' ', '_')
            else:
                url = ''

            # Claims
            
            claims_data = {}

            # Extract property values and labels
            for key, claim_list in entity['claims'].items():
                values = []

                for claim in claim_list:
                    val = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
                    values.append(val)

                claims_data[key] = values


            return {'label': label, 'description': description, 'claims': claims_data, 'sitelinks': num_sitelinks, 'url': url}
        
    return {}



def get_entities(entity, language="en", limit=5) -> list:
    """get a list of candidates for a specific entity

    Args:
        entity (string): name of the entity you want to search
        language (str, optional): Defaults to "en".
        limit (int, optional): Defaults to 5.

    Returns:
        list: list of IDs found for this entity
    """

    response = requests.get(API_URL, params={ "action": "wbsearchentities", "search": entity, "language": language, "format": "json", "limit": limit, })

    if response.status_code == 200:
        data = response.json()
        
        candidates = []

        for entity in data.get("search", []):
            candidates.append(entity["id"])

        return candidates
    
    return {}

In [40]:
def decypher_label(entity_id):
    response = requests.get(API_URL, params= { "action": "wbgetentities", "ids": entity_id, "format": "json", "languages": 'en' })

    if response.status_code == 200:
        data = response.json()
        
        if "entities" in data and entity_id in data["entities"]:
            entity = data.get('entities').get(entity_id)

            return entity['labels']['en']['value']
        
    return ''

def get_statements(entity):

    claims = {}

    for property_id, property_values in entity['claims'].items():
        
        property_label = decypher_label(property_id)

        values = []
        
        for value in property_values:
            
            # If the value is an entity, resolve its label
            if isinstance(value, dict) and "id" in value:
                value_label = decypher_label(value["id"])
                values.append(value_label)
            else:
                values.append(value)
        
        # Get human-readable label for property ID
        claims[property_label] = values

        break
    
    return claims


In [41]:
query = 'homer'

candidates = get_entities(query)

for candidate in candidates[2:]:
    y = get_entity_data(candidate)
    print(get_statements(y))
    break

{'occupation': ['poet', 'author', 'writer']}


In [None]:
import requests
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Wikidata API endpoint
WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"

def search_wikidata(entity):
    """Search for an entity in Wikidata and return its Wikidata ID."""
    params = {
        "action": "wbsearchentities",
        "search": entity,
        "language": "en",
        "format": "json",
    }
    response = requests.get(WIKIDATA_API_URL, params=params)
    if response.status_code == 200:
        data = response.json()
        if "search" in data and len(data["search"]) > 0:
            return data["search"][0]["id"]  # Return the first result's ID
    return None

def get_relation_from_wikidata(subject_id, relation_label):
    """Retrieve objects linked to a subject by a relation in Wikidata."""
    params = {
        "action": "wbgetentities",
        "ids": subject_id,
        "format": "json",
    }
    response = requests.get(WIKIDATA_API_URL, params=params)
    if response.status_code == 200:
        data = response.json()
        if "entities" in data and subject_id in data["entities"]:
            claims = data["entities"][subject_id]["claims"]
            results = []
            for property_id, statements in claims.items():
                # Match the relation by label
                if relation_label.lower() in get_label(property_id).lower():
                    for statement in statements:
                        mainsnak = statement.get("mainsnak", {})
                        datavalue = mainsnak.get("datavalue", {})
                        if datavalue.get("type") == "wikibase-entityid":
                            object_id = datavalue["value"]["id"]
                            results.append(get_label(object_id))
            return results
    return []

def get_label(entity_id):
    """Fetch the label of a Wikidata entity."""
    params = {
        "action": "wbgetentities",
        "ids": entity_id,
        "format": "json",
        "languages": "en",
    }
    response = requests.get(WIKIDATA_API_URL, params=params)
    if response.status_code == 200:
        data = response.json()
        if "entities" in data and entity_id in data["entities"]:
            return data["entities"][entity_id].get("labels", {}).get("en", {}).get("value", entity_id)
    return entity_id  # Fallback to entity ID if label not found

def process_question(question):
    """Extract entities, relations, and verify with Wikidata."""
    doc = nlp(question)
    entities = [ent.text for ent in doc.ents]
    relations = []
    
    for token in doc:
        if token.dep_ in ("ROOT", "attr", "prep") and token.head.pos_ in ("VERB", "AUX"):
            # Extract relations
            subject = [child for child in token.head.lefts if child.dep_ == "nsubj"]
            object_ = [child for child in token.head.rights if child.dep_ in ("pobj", "dobj")]
            if subject and object_:
                relations.append((subject[0].text, token.text, object_[0].text))

    print([ent.text for ent in doc.ents])
    # # Link entities to Wikidata and verify relations
    # for subject, relation, object_ in relations:
    #     subject_id = search_wikidata(subject)
    #     if subject_id:
    #         objects_from_kg = get_relation_from_wikidata(subject_id, relation)
    #         if object_ in objects_from_kg:
    #             print(f"Relation Verified: {subject} {relation} {object_}")
    #         else:
    #             print(f"Relation Not Verified: {subject} {relation} {object_}")
    #     else:
    #         print(f"Entity not found in Wikidata: {subject}")

# Example questions
questions = [
    "Is it correct that Berlin is the capital of Germany?",
    "Does Japan have the highest life expectancy in the world?",
]

# Process each question
for question in questions:
    print(f"Processing Question: {question}")
    process_question(question)
    print()

Processing Question: Is it correct that Berlin is the capital of Germany?
[]

Processing Question: Does Japan have the highest life expectancy in the world?
[('Japan', 'have', 'expectancy')]



In [43]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

def extract_entities_and_relations(sentence):
    """Extract entities and relations using dependency parsing."""
    doc = nlp(sentence)
    entities = []
    relations = []

    # Extract entities using NER
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))

    # Dependency-based relation extraction
    for token in doc:
        # Identify root verbs and their subjects/objects
        if token.pos_ in ("VERB", "AUX"):
            subject = [child for child in token.children if child.dep_ in ("nsubj", "nsubjpass")]
            objects = [child for child in token.children if child.dep_ in ("dobj", "pobj", "attr", "acomp")]

            if subject and objects:
                for subj in subject:
                    for obj in objects:
                        relations.append((subj.text, token.text, obj.text))

        # Extract prepositional relations (e.g., "capital of Germany")
        if token.dep_ == "prep" and token.head.pos_ in ("NOUN", "PROPN"):
            obj = [child for child in token.children if child.dep_ == "pobj"]
            if obj:
                relations.append((token.head.text, token.text, obj[0].text))

    return entities, relations

# Example questions
questions = [
    "Is it correct that Berlin is the capital of Germany?",
    "Does Japan have the highest life expectancy in the world?",
    "True or False: Brazil’s official language is Portuguese.",
    "Would it be accurate to say the Eiffel Tower is in Paris?",
    "Does Australia function as both a country and a continent?",
]

# Process each question
for question in questions:
    print(f"Question: {question}")
    entities, relations = extract_entities_and_relations(question)
    print(f"Entities: {entities}")
    print(f"Relations: {relations}")
    print()

Question: Is it correct that Berlin is the capital of Germany?
Entities: [('Berlin', 'GPE'), ('Germany', 'GPE')]
Relations: [('it', 'Is', 'correct'), ('Berlin', 'is', 'capital'), ('capital', 'of', 'Germany')]

Question: Does Japan have the highest life expectancy in the world?
Entities: [('Japan', 'GPE')]
Relations: [('Japan', 'have', 'expectancy'), ('expectancy', 'in', 'world')]

Question: True or False: Brazil’s official language is Portuguese.
Entities: [('Brazil', 'GPE'), ('Portuguese', 'NORP')]
Relations: [('language', 'is', 'Portuguese')]

Question: Would it be accurate to say the Eiffel Tower is in Paris?
Entities: [('the Eiffel Tower', 'LOC'), ('Paris', 'GPE')]
Relations: [('it', 'be', 'accurate')]

Question: Does Australia function as both a country and a continent?
Entities: [('Australia', 'GPE')]
Relations: []



In [44]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# List of questions
questions = [
    "Is it correct that Berlin is the capital of Germany?",
    "Does Japan have the highest life expectancy in the world?",
    "True or False: Brazil's official language is Portuguese.",
    "Would it be accurate to say the Eiffel Tower is in Paris?",
    "Does Australia function as both a country and a continent?"
]


def perform_ner(text):
    """
    Perform Named Entity Recognition on the input text
    
    Args:
        text (str): Input text to analyze
    
    Returns:
        list: List of extracted named entities with their types
    """
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

def extract_relations(text, entities):
    """
    Extract potential relations between entities
    
    Args:
        text (str): Input text
        entities (list): List of named entities
    
    Returns:
        list: List of potential relations
    """
    relations = []
    
    # Simple relation extraction based on dependencies
    doc = nlp(text)
    for token in doc:
        if token.dep_ in ['nsubj', 'attr', 'ROOT']:
            for ent in entities:
                # if ent[0] in token.subtree.text:
                    
                    print(ent, token.lemma_, token.head.text)
    
    return relations

# Process each question
results = []

for question in questions:
    # Perform NER
    entities = perform_ner(question)
    
    # Extract relations
    relations = extract_relations(question, entities)
    


# Display results
for result in results:
    print("\nQuestion:", result['question'])
    print("Entities:", result['entities'])
    print("Relations:", result['relations'])

('Berlin', 'GPE') be Is
('Germany', 'GPE') be Is
('Berlin', 'GPE') it Is
('Germany', 'GPE') it Is
('Berlin', 'GPE') Berlin is
('Germany', 'GPE') Berlin is
('Berlin', 'GPE') capital is
('Germany', 'GPE') capital is
('Japan', 'GPE') Japan have
('Japan', 'GPE') have have
('Brazil', 'GPE') language is
('Portuguese', 'NORP') language is
('Brazil', 'GPE') be is
('Portuguese', 'NORP') be is
('the Eiffel Tower', 'LOC') it be
('Paris', 'GPE') it be
('the Eiffel Tower', 'LOC') be be
('Paris', 'GPE') be be
('the Eiffel Tower', 'LOC') Tower is
('Paris', 'GPE') Tower is
('Australia', 'GPE') Australia function
('Australia', 'GPE') function function


In [47]:
import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

# Merge noun phrases and entities for easier analysis
nlp.add_pipe("merge_entities")
nlp.add_pipe("merge_noun_chunks")

TEXTS = [
    "Is it correct that Berlin is the capital of Germany?",
    "Does Japan have the highest life expectancy in the world?",
    "True or False: Brazil's official language is Portuguese.",
    "Would it be accurate to say the Eiffel Tower is in Paris?",
    "Does Australia function as both a country and a continent?"
]

# Process the texts
for doc in nlp.pipe(TEXTS):
    for token in doc:
        # Checking for subjects and objects for general relations
        if token.dep_ in ("nsubj", "nsubjpass"):  # Subject
            # Look for the verb (head of subject)
            verb = token.head
            # Find the object (direct or prepositional)
            objects = [child for child in verb.children if child.dep_ in ("dobj", "pobj", "attr")]

            # If we found an object, print the subject, verb, and object
            if objects:
                for obj in objects:
                    print(f"{token.text} --> {verb.text} --> {obj.text}")

        # Check for prepositional phrases (e.g., 'capital of Germany')
        if token.dep_ == "prep":
            # Get the head of the preposition (the noun that the prep modifies)
            head = token.head
            # Get the object of the preposition
            pobj = [child for child in token.children if child.dep_ == "pobj"]
            if pobj:
                for obj in pobj:
                    print(f"{head.text} --> {token.text} --> {obj.text}")

        # Check for entities of type MONEY or other types that are relevant
        if token.ent_type_ in ("MONEY", "GPE", "ORG", "LOC"):
            # Direct object or subject relations
            if token.dep_ in ("attr", "dobj", "pobj"):
                subj = [w for w in token.head.lefts if w.dep_ == "nsubj"]
                if subj:
                    print(subj[0], "-->", token)

Berlin --> is --> the capital
the capital --> of --> Germany
Japan --> have --> the highest life expectancy
the highest life expectancy --> in --> the world
is --> in --> Paris
function --> as --> both a country
