# Prove per extended type

In [1]:
! pip install SPARQLWrapper



In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
import time
import bz2
import json
from collections import Counter
from tqdm import tqdm


In [7]:


def retrieve_superclasses(entity_id):
    """
    Retrieve all superclasses of a given Wikidata entity ID.

    Args:
        entity_id (str): The ID of the entity (e.g., "Q207784").

    Returns:
        dict: A dictionary where keys are superclass IDs, and values are their labels.
    """
    # Define the SPARQL endpoint and query
    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT ?superclass ?superclassLabel WHERE {{
      wd:{entity_id} (wdt:P279)* ?superclass.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """

    # Function to query the SPARQL endpoint with retries
    def query_wikidata(sparql_client, query, retries=3, delay=5):
        for attempt in range(retries):
            try:
                sparql_client.setQuery(query)
                sparql_client.setReturnFormat(JSON)
                results = sparql_client.query().convert()
                return results
            except Exception as e:
                if "429" in str(e):  # Handle Too Many Requests error
                    print(f"Rate limit hit. Retrying in {delay} seconds... (Attempt {attempt + 1}/{retries})")
                    time.sleep(delay)
                else:
                    print(f"An error occurred: {e}")
                    break
        return None

    # Set up the SPARQL client
    sparql = SPARQLWrapper(endpoint_url)

    # Execute the query with retries
    results = query_wikidata(sparql, query)

    # Process results and return as a dictionary
    if results:
        superclass_dict = {}
        for result in results["results"]["bindings"]:
            superclass_id = result["superclass"]["value"].split("/")[-1]  # Extract entity ID from the URI
            label = result["superclassLabel"]["value"]
            superclass_dict[superclass_id] = label
        return superclass_dict
    else:
        print("Failed to retrieve data after multiple attempts.")
        return {}


In [13]:
wikidata_dump_path = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/latest-all.json.bz2'
SIZE_PROC = 1000
chunk_size = 1000  # Number of rows per chunk


def process_entity(item):
    try:
        entity = item['id']
        labels = item.get("labels", {})
        english_label = labels.get("en", {}).get("value", "")
        description = item.get('descriptions', {}).get('en', {})
        NERtype = None

        if item.get("type") == "item" and "claims" in item:
            p31_claims = item["claims"].get("P279", [])

            types_list = []

            for claim in p31_claims:
                mainsnak = claim.get("mainsnak", {})
                datavalue = mainsnak.get("datavalue", {})
                numeric_id = datavalue.get("value", {}).get("numeric-id")
                types_list.append("Q"+str(numeric_id))

            return [entity, english_label, types_list]
        

    except json.decoder.JSONDecodeError:
        pass

# Initial setup for data processing
counter = 0

# Process data and print relevant details
try:
    with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
        pbar = tqdm(total=SIZE_PROC)
        
        for line in f:
            try:
                item = json.loads(line[:-2])
                print(item)
                entity, english_label, types_list = process_entity(item)

                print(entity)
                print(types_list)
                for el in types_list:
                    superclasses = retrieve_superclasses(el)  # Replace with your entity ID
                    
                    print(f"[{el}] - Number of Superclasses: {len(superclasses)}")

                print("_______________________")

            
            except json.decoder.JSONDecodeError:
                continue

            if counter == SIZE_PROC:
                break
            counter += 1
        pbar.close()

except Exception as e:
    print(f"An error occurred: {e}")

  0%|          | 0/1000 [00:49<?, ?it/s]


{'type': 'item', 'id': 'Q31', 'labels': {'el': {'language': 'el', 'value': 'Βέλγιο'}, 'ay': {'language': 'ay', 'value': 'Bilkiya'}, 'pnb': {'language': 'pnb', 'value': 'بیلجیئم'}, 'na': {'language': 'na', 'value': 'Berdjiyum'}, 'mk': {'language': 'mk', 'value': 'Белгија'}, 'bn': {'language': 'bn', 'value': 'বেলজিয়াম'}, 'bpy': {'language': 'bpy', 'value': 'বেলজিয়াম'}, 'lt': {'language': 'lt', 'value': 'Belgija'}, 'jam': {'language': 'jam', 'value': 'Beljiom'}, 'sk': {'language': 'sk', 'value': 'Belgicko'}, 'so': {'language': 'so', 'value': 'Beljim'}, 'tl': {'language': 'tl', 'value': 'Belgium'}, 'uk': {'language': 'uk', 'value': 'Бельгія'}, 'tt': {'language': 'tt', 'value': 'Бельгия'}, 'sc': {'language': 'sc', 'value': 'Bèlgiu'}, 'bxr': {'language': 'bxr', 'value': 'Бельги'}, 'ff': {'language': 'ff', 'value': 'Beljik'}, 'za': {'language': 'za', 'value': 'Bijliswz'}, 'yo': {'language': 'yo', 'value': 'Bẹ́ljíọ̀m'}, 'ext': {'language': 'ext', 'value': 'Bélgica'}, 'zea': {'language': 'zea

KeyboardInterrupt: 

In [None]:
# Example usage
superclasses = retrieve_superclasses("Q1320047")  # Replace with your entity ID
print("Number of Superclasses:", len(superclasses))
print("Superclass Dictionary:")
print(superclasses)


# Esperimenti fatti con Riccardo

In [None]:
import bz2
import json
from tqdm import tqdm
import traceback
import os
from pymongo import MongoClient
from pymongo import *
from pymongo import errors
import configparser
from json.decoder import JSONDecodeError
from requests import get

In [None]:
# MongoDB connection setup
MONGO_ENDPOINT, MONGO_ENDPOINT_PORT = os.environ["MONGO_ENDPOINT"].split(":")
MONGO_ENDPOINT_PORT = int(MONGO_ENDPOINT_PORT)
MONGO_ENDPOINT_USERNAME = os.environ["MONGO_INITDB_ROOT_USERNAME"]
MONGO_ENDPOINT_PASSWORD = os.environ["MONGO_INITDB_ROOT_PASSWORD"]
DB_NAME = f"wikidata"

client = MongoClient(MONGO_ENDPOINT, MONGO_ENDPOINT_PORT, username=MONGO_ENDPOINT_USERNAME, password=MONGO_ENDPOINT_PASSWORD)
print(client)

log_c = client.wikidata.log
items_c = client[DB_NAME].items
objects_c = client[DB_NAME].objects
literals_c = client[DB_NAME].literals
types_c = client[DB_NAME].types

c_ref = {
    "items": items_c,
    "objects":objects_c, 
    "literals":literals_c, 
    "types":types_c
}

def flush_buffer(buffer):
    for key in buffer:
        if len(buffer[key]) > 0:
            c_ref[key].insert_many(buffer[key])
            buffer[key] = []


In [None]:
def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.
    --------------------------------------------
    For example, if you have an item with types A, B, and C, and you specify a forward property that applies to type B, the item will 
    be included in the result because it has type B, even if it also has types A and C
    --------------------------------------------  
    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
    :return: iterable[int]: List with ids of WikiData items in the tree
    """

    query = '''PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>'''
    if forward_properties:
        query +='''SELECT ?WD_id WHERE {
                  ?tree0 (wdt:P%s)* ?WD_id .
                  BIND (wd:%s AS ?tree0)
                  }'''%( ','.join(map(str, forward_properties)),','.join(map(str, root_items)))
    elif backward_properties:
        query+='''SELECT ?WD_id WHERE {
                    ?WD_id (wdt:P%s)* wd:Q%s .
                    }'''%(','.join(map(str, backward_properties)), ','.join(map(str, root_items)))
    #print(query)

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()
    
    ids = []
    for item in data['results']['bindings']:
        this_id=item["WD_id"]["value"].split("/")[-1].lstrip("Q")
        #print(item)
        try:
            this_id = int(this_id)
            ids.append(this_id)
            #print(this_id)
        except ValueError:
            #print("exception")
            continue
    return ids

In [None]:
# example with "capital city"

list = get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
for el in list:
    data = {
        "json": [
            "Q"+(str(el))
        ]
    }
    response = requests.post(url, headers=headers, json=data)

    try:
        result = response.json()
        label = result["Q"+(str(el))]['labels']['en']
        print(label)  # Print the label or ID with indentation
    except:
        pass

In [None]:
# Function to fetch the necessary subclass sets with individual try-except blocks
def fetch_wikidata_subclasses():
    try:
        organization_subclass = get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        organization_subclass = []
    
    try:
        country_subclass = get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        country_subclass = []
    
    try:
        city_subclass = get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        city_subclass = []
    
    try:
        capitals_subclass = get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        capitals_subclass = []
    
    try:
        admTerr_subclass = get_wikidata_item_tree_item_idsSPARQL([15916867], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        admTerr_subclass = []
    
    try:
        family_subclass = get_wikidata_item_tree_item_idsSPARQL([17350442], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        family_subclass = []
    
    try:
        sportLeague_subclass = get_wikidata_item_tree_item_idsSPARQL([623109], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        sportLeague_subclass = []
    
    try:
        venue_subclass = get_wikidata_item_tree_item_idsSPARQL([8436], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        venue_subclass = []
    
    # Removing overlaps for organization_subclass
    organization_subclass = list(set(organization_subclass) - set(country_subclass) - set(city_subclass) - set(capitals_subclass) - set(admTerr_subclass) - set(family_subclass) - set(sportLeague_subclass) - set(venue_subclass))
    
    try:
        geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        geolocation_subclass = []
    
    try:
        food_subclass = get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        food_subclass = []
    
    try:
        edInst_subclass = get_wikidata_item_tree_item_idsSPARQL([2385804], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        edInst_subclass = []
    
    try:
        govAgency_subclass = get_wikidata_item_tree_item_idsSPARQL([327333], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        govAgency_subclass = []
    
    try:
        intOrg_subclass = get_wikidata_item_tree_item_idsSPARQL([484652], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        intOrg_subclass = []
    
    try:
        timeZone_subclass = get_wikidata_item_tree_item_idsSPARQL([12143], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        timeZone_subclass = []

    # Removing overlaps for geolocation_subclass
    geolocation_subclass = list(set(geolocation_subclass) - set(food_subclass) - set(edInst_subclass) - set(govAgency_subclass) - set(intOrg_subclass) - set(timeZone_subclass))
    
    return organization_subclass, geolocation_subclass


In [None]:

with open("./organization_subclass.txt", "w") as file:
    for item in organization_subclass:
        file.write(f"{item}\n")  # Write each item on a new line

In [None]:
from collections import Counter


wikidata_dump_path = './data/latest-all.json.bz2'
SIZE_PROC = 1000
chunk_size = 1000  # Number of rows per chunk

organization_subclass, geolocation_subclass = fetch_wikidata_subclasses()

def process_entity(item):
    try:
        entity = item['id']
        labels = item.get("labels", {})
        english_label = labels.get("en", {}).get("value", "")
        description = item.get('descriptions', {}).get('en', {})
        NERtype = None

        if item.get("type") == "item" and "claims" in item:
            p31_claims = item["claims"].get("P31", [])

            # Initialize a counter to track occurrences of NER types
            ner_counter = Counter()

            for claim in p31_claims:
                mainsnak = claim.get("mainsnak", {})
                datavalue = mainsnak.get("datavalue", {})
                numeric_id = datavalue.get("value", {}).get("numeric-id")

                # Classify NER types
                if numeric_id == 5:
                    ner_counter['PERS'] += 1
                elif numeric_id in geolocation_subclass or any(k.lower() in description.get('value', '').lower().split() for k in ["district", "city", "country", "capital", "state"]):
                    ner_counter['LOC'] += 1
                elif numeric_id in organization_subclass:
                    ner_counter['ORG'] += 1
                else:
                    ner_counter['OTHERS'] += 1
                    
            # Get the most common NER type
            if ner_counter:
                NERtype, _ = ner_counter.most_common(1)[0]  # Get the most common type

            # Print label, ID, type, and NER classification
            print(f"{english_label} - {entity}: (NER type: {NERtype})")

    except json.decoder.JSONDecodeError:
        pass

# Initial setup for data processing
counter = 0

# Process data and print relevant details
try:
    with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
        pbar = tqdm(total=SIZE_PROC)
        
        for line in f:
            try:
                item = json.loads(line[:-2])
                process_entity(item)
            except json.decoder.JSONDecodeError:
                continue

            if counter == SIZE_PROC:
                break
            counter += 1
        pbar.close()

except Exception as e:
    print(f"An error occurred: {e}")

# Test query chiusura transitiva

In [None]:
! pip install SPARQLWrapper

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_subclasses(Qid):
    # Initialize the SPARQL endpoint (Wikidata in this case)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    
    # Define the SPARQL query with the provided QID
    query = f"""
    SELECT DISTINCT ?item ?desc WHERE {{
      wd:{Qid} wdt:P279* ?item.
      ?item rdfs:label ?desc FILTER (lang(?desc) = "en").
    }}
    """
    
    # Set the query and the return format (JSON)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    # Execute the query and get results
    results = sparql.query().convert()
    
    # Parse the results
    subclasses = []
    for result in results["results"]["bindings"]:
        item = result["item"]["value"]
        desc = result["desc"]["value"]
        subclasses.append({"item": item, "description": desc})
    
    return subclasses

# Example usage
Qid = "Q64027599" 
subclasses = get_subclasses(Qid)

for subclass in subclasses:
    print(f"Item: {subclass['item']}, Description: {subclass['description']}")


In [None]:
query_data = {
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "name": {
                            "query": "Belgium",
                            "boost": 2.0
                        }
                    }
                }
            ],
            "should": [
                {"term": {"type": "realm"}},
                {"term": {"type": "soverign state"}},
                {"term": {"type": "country"}}
            ],
            "minimum_should_match": 1
        }
    }
}

In [None]:
import requests
import json

# Define the URL
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Define the query data (decoded for readability)
query_data = {
  "query": {
    "bool": {
      "must": [
        {
          "query_string": {
            "default_field":"types",  "query": "Q6881511"
          }
        },
        {
          "match": {
            "name": {"query":"sinopec","boost":2.0}
          }
        }
      ]
    }
  }
}

# Define the parameters and token
params = {
    'name': 'sinopec',
    #'query': json.dumps(query_data),  # JSON encoded query data
    'token': 'lamapi_demo_2023'
}

# Send the GET request
response = requests.get(url, params=params, headers={'accept': 'application/json'})

# Print the response
if response.status_code == 200:
    res = response.json()
    for el in res:
        print(f"{el['name']} ({el['id']}) with type:")
        for type in el['types']:
            print(f"                {type['name']}")  # Assuming the response is JSON formatted
else:
    print(f"Request failed with status code {response.status_code}")
