In [11]:
import bz2
import json
from tqdm import tqdm
import traceback
import pandas as pd
import os
import sys
from pymongo import MongoClient
from json.decoder import JSONDecodeError
from requests import get

In [12]:
# MongoDB connection setup
# MONGO_ENDPOINT is read as "host:port". Splitting on the LAST colon keeps the
# unpacking to exactly two parts even when the host part itself contains colons.
MONGO_ENDPOINT, MONGO_ENDPOINT_PORT = os.environ["MONGO_ENDPOINT"].rsplit(":", 1)
MONGO_ENDPOINT_PORT = int(MONGO_ENDPOINT_PORT)
MONGO_ENDPOINT_USERNAME = os.environ["MONGO_INITDB_ROOT_USERNAME"]
MONGO_ENDPOINT_PASSWORD = os.environ["MONGO_INITDB_ROOT_PASSWORD"]
DB_NAME = "wikidata"

client = MongoClient(
    MONGO_ENDPOINT,
    MONGO_ENDPOINT_PORT,
    username=MONGO_ENDPOINT_USERNAME,
    password=MONGO_ENDPOINT_PASSWORD,
)
print(client)

# Every collection handle is resolved through DB_NAME so that renaming the
# database only requires changing one constant.
log_c = client[DB_NAME].log
items_c = client[DB_NAME].items
objects_c = client[DB_NAME].objects
literals_c = client[DB_NAME].literals
types_c = client[DB_NAME].types

# Lookup used by flush_buffer(): buffer key -> target collection
c_ref = {
    "items": items_c,
    "objects": objects_c,
    "literals": literals_c,
    "types": types_c
}

MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True)


In [13]:
def flush_buffer(buffer):
    """Insert every non-empty pending batch into its collection, then reset it.

    :param buffer: dict mapping a key of the module-level ``c_ref`` lookup to the
        list of documents waiting to be inserted. Flushed entries are rebound to
        a new empty list; empty entries are left untouched.
    """
    for collection_name, pending_docs in buffer.items():
        if len(pending_docs) == 0:
            continue
        c_ref[collection_name].insert_many(pending_docs)
        # Rebind instead of clearing in place, so the list that was just handed
        # to insert_many is not mutated afterwards.
        buffer[collection_name] = []

def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.

    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
        Only consulted when ``forward_properties`` is empty or None.
    :return: list[int]: numeric ids of the Q-items in the tree (non-item results are dropped)
    :raises ValueError: if neither ``forward_properties`` nor ``backward_properties`` is given
    :raises json.JSONDecodeError: if the query service answers with a body that is not JSON
    """
    root_items = list(root_items)

    # Several properties are combined with "|" (SPARQL property-path
    # alternation); a comma-separated list is not valid inside a path.
    if forward_properties:
        path = '|'.join('wdt:P' + str(prop) for prop in forward_properties)
        triple = '?root (%s)* ?WD_id .' % path
    elif backward_properties:
        path = '|'.join('wdt:P' + str(prop) for prop in backward_properties)
        triple = '?WD_id (%s)* ?root .' % path
    else:
        raise ValueError("forward_properties or backward_properties must be given")

    if not root_items:
        # Nothing to start from, so no need to hit the network.
        return []

    # VALUES accepts one or many roots; every root carries the "Q" prefix.
    roots = ' '.join('wd:Q' + str(root) for root in root_items)
    query = '\n'.join([
        'PREFIX wikibase: <http://wikiba.se/ontology#>',
        'PREFIX wd: <http://www.wikidata.org/entity/>',
        'PREFIX wdt: <http://www.wikidata.org/prop/direct/>',
        'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>',
        'SELECT ?WD_id WHERE {',
        '  VALUES ?root { %s }' % roots,
        '  ' + triple,
        '}',
    ])

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()

    ids = []
    for binding in data['results']['bindings']:
        # The value is an entity URI; its last path segment is e.g. "Q42".
        local_name = binding["WD_id"]["value"].split("/")[-1].lstrip("Q")
        try:
            ids.append(int(local_name))
        except ValueError:
            # Anything that is not a plain Q-number (e.g. a property) is skipped.
            continue
    return ids

In [14]:
json_file_path = "./data/def_mapping.json"

# Load the mapping whose values are compared against entity ids further down.
# Each failure is reported and then re-raised: swallowing it would leave
# `mapping` undefined and only surface later as an unrelated NameError.
try:
    # Open the JSON file for reading (JSON text is UTF-8, so do not depend on
    # the platform default encoding)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        mapping = json.load(json_file)

except FileNotFoundError:
    print(f"Error: File '{json_file_path}' not found.")
    raise
except json.JSONDecodeError as e:
    print(f"Error decoding JSON data: {e}")
    raise
except Exception as e:
    print(f"Error loading data from JSON file: {e}")
    raise

In [15]:
# Running totals shared with update_average_size()
total_size_processed = 0
num_entities_processed = 0


def update_average_size(new_size):
    """Add one observation to the running totals and return the updated mean.

    :param new_size: size of the entry that was just processed
    :return: total of all sizes seen so far divided by the number of calls
    """
    global total_size_processed, num_entities_processed
    total_size_processed = total_size_processed + new_size
    num_entities_processed = num_entities_processed + 1
    running_mean = total_size_processed / num_entities_processed
    return running_mean



In [None]:
################################
###  WITH MULTIPLE CLUSTERING
################################

# Input dump and CSV output locations
wikidata_dump_path = './data/latest-all.json.bz2'
csv_output_path = './data/multiple_cluster.csv'

# First guess for the progress-bar total: size of the compressed dump on disk
# divided by an assumed average entry size.
initial_estimated_average_size = 800
compressed_file_size = os.stat(wikidata_dump_path).st_size
initial_total_lines_estimate = compressed_file_size / initial_estimated_average_size

# Wikidata snak datatype -> literal bucket name
DATATYPES_MAPPINGS = {
    'external-id': 'STRING',
    'quantity': 'NUMBER',
    'globe-coordinate': 'STRING',
    'string': 'STRING',
    'monolingualtext': 'STRING',
    'commonsMedia': 'STRING',
    'time': 'DATETIME',
    'url': 'STRING',
    'geo-shape': 'GEOSHAPE',
    'math': 'MATH',
    'musical-notation': 'MUSICAL_NOTATION',
    'tabular-data': 'TABULAR_DATA'
}
# Distinct bucket names. Sorted because the iteration order of a set of strings
# changes between interpreter runs (hash randomization), which made the order
# of this list non-reproducible.
DATATYPES = sorted(set(DATATYPES_MAPPINGS.values()))

def check_skip(obj, datatype):
    """Tell whether a claim (or bare snak) should be ignored.

    :param obj: claim dict holding a "mainsnak", or the snak dict itself
    :param datatype: datatype string of the snak
    :return: True when the snak has no "datavalue" or the datatype is one of
        the lexeme-related kinds, otherwise False
    """
    snak = obj.get("mainsnak", obj)
    lexeme_kinds = frozenset({"wikibase-lexeme", "wikibase-form", "wikibase-sense"})
    return ("datavalue" not in snak) or (datatype in lexeme_kinds)

def get_value(obj, datatype):
    """Extract the literal value carried by a claim (or bare snak).

    :param obj: claim dict holding a "mainsnak", or the snak dict itself
    :param datatype: datatype string of the snak
    :return: "latitude,longitude" for globe coordinates; the "amount", "text"
        or "time" field for quantity, monolingualtext and time values; the raw
        datavalue value for every other datatype
    """
    snak = obj.get("mainsnak", obj)
    if datatype == "globe-coordinate":
        coordinates = snak["datavalue"]["value"]
        return "{},{}".format(coordinates["latitude"], coordinates["longitude"])

    # Datatypes whose value is a dict from which a single field is wanted
    field_by_datatype = {
        "quantity": "amount",
        "monolingualtext": "text",
        "time": "time",
    }
    field = field_by_datatype.get(datatype)
    raw_value = snak["datavalue"]["value"]
    return raw_value if field is None else raw_value[field]

def _fetch_subclass_ids(root_id):
    """Return the numeric Q-ids reachable from ``root_id`` by following P279 backwards.

    :param root_id: numeric id of the root item
    :return: set[int]; empty (with a printed warning) when the query service
        answers with a body that cannot be decoded as JSON
    """
    try:
        return set(get_wikidata_item_tree_item_idsSPARQL([root_id], backward_properties=[279]))
    except json.decoder.JSONDecodeError:
        print(f"Warning: could not decode the subclass query result for Q{root_id}; using an empty set")
        return set()


# Every *_subclass name is a set, so the membership tests in the processing
# loop are constant-time and a failed query can never leave a name undefined.
organization_subclass = _fetch_subclass_ids(43229)
country_subclass = _fetch_subclass_ids(6256)
city_subclass = _fetch_subclass_ids(515)
capitals_subclass = _fetch_subclass_ids(5119)
admTerr_subclass = _fetch_subclass_ids(15916867)
family_subclass = _fetch_subclass_ids(17350442)
sportLeague_subclass = _fetch_subclass_ids(623109)
venue_subclass = _fetch_subclass_ids(8436)

# Ids that also appear under country / city / capital are not kept as organizations.
organization_subclass = organization_subclass - country_subclass - city_subclass - capitals_subclass

geolocation_subclass = _fetch_subclass_ids(2221906)
food_subclass = _fetch_subclass_ids(2095)
edInst_subclass = _fetch_subclass_ids(2385804)
govAgency_subclass = _fetch_subclass_ids(327333)
intOrg_subclass = _fetch_subclass_ids(484652)
timeZone_subclass = _fetch_subclass_ids(12143)

# Ids that also appear under the food root are not kept as geolocations.
geolocation_subclass = geolocation_subclass - food_subclass



# Rows are written to the CSV once this many have been collected
chunk_size = 1000

# State initialised before the processing loop
counter = 0
buffer_size = 0
data = []

# Helper that appends one chunk of rows to the CSV output
def append_to_csv(data_chunk, file_path, header=False):
    """Append a chunk of (label, description, id, NER type) rows to a CSV file.

    :param data_chunk: sequence of 4-item rows
    :param file_path: CSV file to append to (created if missing)
    :param header: whether to write the column names before the rows
    """
    column_names = ["English Label", "Description", "ID", "NER Type"]
    chunk_frame = pd.DataFrame(data_chunk, columns=column_names)
    chunk_frame.to_csv(file_path, index=False, header=header, mode='a')

# Stream the dump once, give every mapped entity a NER type and append the rows
# to the CSV in chunks of `chunk_size`.
try:
    # Loop-invariant lookups, built once instead of once per dump line.
    # Entity ids are strings, so only string values of `mapping` can ever match;
    # filtering on str keeps the test equivalent and guarantees hashability.
    mapped_entity_ids = {value for value in mapping.values() if isinstance(value, str)}
    organization_ids = set(organization_subclass)
    geolocation_ids = set(geolocation_subclass)
    location_keywords = {"district", "city", "country", "capital", "state"}

    # Processing the data
    with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
        # Context manager: the bar is closed even if the loop raises.
        with tqdm(total=initial_total_lines_estimate) as pbar:
            chunk_data = []
            header_written = False

            for i, line in enumerate(f):
                # One entity per line, followed by "," on every line except the
                # last one. Strip the newline and the optional comma rather than
                # a fixed two characters, which cut the closing brace of the
                # final entity.
                try:
                    item = json.loads(line.rstrip().rstrip(','))
                except json.decoder.JSONDecodeError:
                    # Lines that are not a JSON document (e.g. the array
                    # brackets) are skipped and do not count towards the limit.
                    continue

                entity = item['id']

                if entity in mapped_entity_ids:
                    english_label = item.get("labels", {}).get("en", {}).get("value", "")
                    description = item.get('descriptions', {}).get('en', {})
                    description_text = description.get('value', '').lower()
                    # True when the English description contains one of the
                    # location keywords as a whole whitespace-separated word.
                    has_location_keyword = not location_keywords.isdisjoint(description_text.split())

                    line_size = len(line)
                    current_average_size = update_average_size(line_size)
                    pbar.total = round(compressed_file_size / current_average_size)
                    pbar.update(1)

                    NERtype = None
                    if item.get("type") == "item" and "claims" in item:
                        # NOTE(review): with several P31 claims the LAST one
                        # decides the label (e.g. "PERS" can be overwritten by
                        # "OTHERS"); confirm that this is intended.
                        for claim in item["claims"].get("P31", []):
                            mainsnak = claim.get("mainsnak", {})
                            datavalue = mainsnak.get("datavalue", {})
                            numeric_id = datavalue.get("value", {}).get("numeric-id")

                            if numeric_id == 5:
                                NERtype = "PERS"
                            elif (numeric_id in organization_ids and numeric_id in geolocation_ids
                                    and not has_location_keyword):
                                NERtype = "LOC/ORG"
                            elif numeric_id in geolocation_ids or has_location_keyword:
                                NERtype = "LOC"
                            elif numeric_id in organization_ids:
                                NERtype = "ORG"
                            else:
                                NERtype = "OTHERS"

                        chunk_data.append((english_label, description_text, entity, NERtype))

                    # Save the chunk if it reaches the chunk size
                    if len(chunk_data) >= chunk_size:
                        append_to_csv(chunk_data, csv_output_path, header=not header_written)
                        chunk_data = []
                        header_written = True

                # Stop once 80000 parsed lines have been counted (the line that
                # hits the limit is still processed above).
                if counter == 80000:
                    break
                counter += 1

            # Save any remaining data in the buffer
            if chunk_data:
                append_to_csv(chunk_data, csv_output_path, header=not header_written)

except Exception as e:
    print(f"An error occurred: {e}")
    # Keep the stack trace: the message alone does not say where it failed.
    traceback.print_exc()



  0%|          | 1000/1261723 [1:28:29<1859:32:48,  5.31s/it]

  0%|          | 1/134136 [00:05<217:58:35,  5.85s/it][A
  0%|          | 2/235911 [00:06<165:27:58,  2.53s/it][A
  0%|          | 3/254513 [00:06<102:58:51,  1.46s/it][A
  0%|          | 4/328766 [00:06<85:50:46,  1.06it/s] [A
  0%|          | 5/332984 [00:06<60:30:56,  1.53it/s][A
  0%|          | 6/372565 [00:06<49:27:51,  2.09it/s][A
  0%|          | 7/377671 [00:06<40:41:16,  2.58it/s][A
  0%|          | 8/335146 [00:07<32:35:34,  2.86it/s][A
  0%|          | 9/352793 [00:07<28:26:43,  3.45it/s][A
  0%|          | 10/371709 [00:07<37:44:30,  2.74it/s][A
  0%|          | 11/404996 [00:08<42:07:47,  2.67it/s][A
  0%|          | 12/411981 [00:08<43:12:57,  2.65it/s][A
  0%|          | 13/334713 [00:09<40:41:56,  2.28it/s][A
  0%|          | 14/300824 [00:09<30:08:32,  2.77it/s][A
  0%|          | 15/317981 [00:09<25:54:51,  3.41it/s][A
  0%|          | 16/334244 [00:09<24:18:44,  3.82it/s][A
  0%|        

In [8]:
################################
###  old version WITH MULTIPLE CLUSTERING
################################

# Location of the compressed dump
wikidata_dump_path = './data/latest-all.json.bz2'

BATCH_SIZE = 1000 # Number of entities to insert in a single batch

# First guess for the progress-bar total: size of the compressed dump on disk
# divided by an assumed average entry size.
initial_estimated_average_size = 800
compressed_file_size = os.stat(wikidata_dump_path).st_size
initial_total_lines_estimate = compressed_file_size / initial_estimated_average_size

# Wikidata snak datatype -> literal bucket name
DATATYPES_MAPPINGS = {
    'external-id': 'STRING',
    'quantity': 'NUMBER',
    'globe-coordinate': 'STRING',
    'string': 'STRING',
    'monolingualtext': 'STRING',
    'commonsMedia': 'STRING',
    'time': 'DATETIME',
    'url': 'STRING',
    'geo-shape': 'GEOSHAPE',
    'math': 'MATH',
    'musical-notation': 'MUSICAL_NOTATION',
    'tabular-data': 'TABULAR_DATA'
}
# Distinct bucket names. Sorted because the iteration order of a set of strings
# changes between interpreter runs (hash randomization); the order of this list
# decides the key order of every per-entity `literals` dict built from it.
DATATYPES = sorted(set(DATATYPES_MAPPINGS.values()))

# Pending documents per target collection
buffer = {
    "items": [],
    "objects": [],
    "literals": [],
    "types": []
}

def check_skip(obj, datatype):
    """Decide whether a claim (or bare snak) is left out of the export.

    :param obj: claim dict holding a "mainsnak", or the snak dict itself
    :param datatype: datatype string of the snak
    :return: True if the snak carries no "datavalue" or has a lexeme, form or
        sense datatype; False otherwise
    """
    snak = obj.get("mainsnak", obj)
    if "datavalue" in snak:
        return datatype in {"wikibase-lexeme", "wikibase-form", "wikibase-sense"}
    return True



def get_value(obj, datatype):
    """Return the literal value stored in a claim (or bare snak).

    :param obj: claim dict holding a "mainsnak", or the snak dict itself
    :param datatype: datatype string of the snak
    :return: "latitude,longitude" string for globe coordinates, the single
        relevant field for quantity / monolingualtext / time values, and the
        unmodified datavalue value for anything else
    """
    snak = obj.get("mainsnak", obj)

    if datatype == "globe-coordinate":
        lat = snak["datavalue"]["value"]["latitude"]
        lon = snak["datavalue"]["value"]["longitude"]
        return f"{lat},{lon}"

    # Datatypes whose value is a dict from which one field is taken
    sub_fields = {
        "quantity": "amount",
        "monolingualtext": "text",
        "time": "time",
    }
    if datatype not in sub_fields:
        return snak["datavalue"]["value"]
    return snak["datavalue"]["value"][sub_fields[datatype]]

def _fetch_subclass_ids(root_id):
    """Return the numeric Q-ids reachable from ``root_id`` by following P279 backwards.

    :param root_id: numeric id of the root item
    :return: set[int]; empty (with a printed warning) when the query service
        answers with a body that cannot be decoded as JSON
    """
    try:
        return set(get_wikidata_item_tree_item_idsSPARQL([root_id], backward_properties=[279]))
    except json.decoder.JSONDecodeError:
        print(f"Warning: could not decode the subclass query result for Q{root_id}; using an empty set")
        return set()


# Every *_subclass name is a set, so the membership tests in the processing
# loop are constant-time and a failed query can never leave a name undefined.
organization_subclass = _fetch_subclass_ids(43229)
country_subclass = _fetch_subclass_ids(6256)
city_subclass = _fetch_subclass_ids(515)
capitals_subclass = _fetch_subclass_ids(5119)
oecd_country_subclass = _fetch_subclass_ids(113489728)
admTerr_subclass = _fetch_subclass_ids(15916867)
family_subclass = _fetch_subclass_ids(17350442)
sportLeague_subclass = _fetch_subclass_ids(623109)
venue_subclass = _fetch_subclass_ids(8436)

# Ids that also appear under country / city / capital are not kept as organizations.
organization_subclass = organization_subclass - country_subclass - city_subclass - capitals_subclass

geolocation_subclass = _fetch_subclass_ids(2221906)
food_subclass = _fetch_subclass_ids(2095)
edInst_subclass = _fetch_subclass_ids(2385804)
govAgency_subclass = _fetch_subclass_ids(327333)
intOrg_subclass = _fetch_subclass_ids(484652)
timeZone_subclass = _fetch_subclass_ids(12143)

# Ids that also appear under the food root are not kept as geolocations.
geolocation_subclass = geolocation_subclass - food_subclass

# Number of parsed dump lines seen so far
counter = 0

# Per-collection results, keyed by the line index of the entity in the dump
final_items, final_objects, final_literals, final_types = {}, {}, {}, {}

# (label, description, id, NER type) rows
data = []

# Loop to process the JSON lines

# Loop-invariant lookups, built once instead of once per dump line.
# Entity ids are strings, so only string values of `mapping` can ever match;
# filtering on str keeps the test equivalent and guarantees hashability.
mapped_entity_ids = {value for value in mapping.values() if isinstance(value, str)}
organization_ids = set(organization_subclass)
geolocation_ids = set(geolocation_subclass)
location_keywords = {"district", "city", "country", "capital", "state"}

with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
    # Context manager: the bar is closed even when the run raises or is interrupted.
    with tqdm(total=initial_total_lines_estimate) as pbar:
        for i, line in enumerate(f):
            # One entity per line, followed by "," on every line except the
            # last one. Strip the newline and the optional comma rather than a
            # fixed two characters, which cut the closing brace of the final entity.
            try:
                item = json.loads(line.rstrip().rstrip(','))
            except json.decoder.JSONDecodeError:
                # Lines that are not a JSON document (e.g. the array brackets)
                # are skipped and do not count towards the limit.
                continue

            entity = item['id']

            if entity in mapped_entity_ids:
                labels = item.get("labels", {})
                english_label = labels.get("en", {}).get("value", "")
                aliases = item.get("aliases", {})
                description = item.get('descriptions', {}).get('en', {})
                description_text = description.get('value', '').lower()
                has_location_keyword = not location_keywords.isdisjoint(description_text.split())
                sitelinks = item.get("sitelinks", {})
                popularity = len(sitelinks) if len(sitelinks) > 0 else 1
                # Entities without any claim would otherwise raise KeyError.
                predicates = item.get("claims", {})

                all_labels = {lang: labels[lang]["value"] for lang in labels}
                all_aliases = {lang: list(set([alias["value"] for alias in aliases[lang]])) for lang in aliases}

                category = "entity"
                if "P279" in predicates:
                    category = "type"
                if entity[0] == "P":
                    category = "predicate"

                line_size = len(line)
                current_average_size = update_average_size(line_size)
                pbar.total = round(compressed_file_size / current_average_size)
                pbar.update(1)

                NERtype = None
                if item.get("type") == "item" and "claims" in item:
                    # NOTE(review): with several P31 claims the LAST one decides
                    # the label; confirm that this is intended.
                    for claim in predicates.get("P31", []):
                        mainsnak = claim.get("mainsnak", {})
                        datavalue = mainsnak.get("datavalue", {})
                        numeric_id = datavalue.get("value", {}).get("numeric-id")

                        if numeric_id == 5:
                            NERtype = "PERS"
                        elif (numeric_id in organization_ids and numeric_id in geolocation_ids
                                and not has_location_keyword):
                            NERtype = "LOC/ORG"
                        elif numeric_id in geolocation_ids or has_location_keyword:
                            NERtype = "LOC"
                        elif numeric_id in organization_ids:
                            NERtype = "ORG"
                        else:
                            NERtype = "OTHERS"

                data.append((english_label, description_text, entity, NERtype))

                lang = labels.get("en", {}).get("language", "")
                tmp = {"WD_id": item['id'], "WP_id": labels.get("en", {}).get("value", "")}

                url_dict = {
                    "wikidata": "http://www.wikidata.org/wiki/" + tmp["WD_id"],
                    "wikipedia": "http://" + lang + ".wikipedia.org/wiki/" + tmp["WP_id"].replace(" ", "_"),
                    "dbpedia": "http://dbpedia.org/resource/" + tmp["WP_id"].capitalize().replace(" ", "_")
                }

                # Filled in by the claims loop below; `join` holds references to
                # these same objects, so it sees the final content.
                objects = {}
                literals = {datatype: {} for datatype in DATATYPES}
                types = {"P31": []}

                join = {
                    "items": {
                        "id_entity": i,
                        "entity": entity,
                        "description": description,
                        "labels": all_labels,
                        "aliases": all_aliases,
                        "types": types,
                        "popularity": popularity,
                        "kind": category,
                        "NERtype": NERtype,
                        "URLs": url_dict
                    },
                    "objects": {
                        "id_entity": i,
                        "entity": entity,
                        "objects": objects
                    },
                    "literals": {
                        "id_entity": i,
                        "entity": entity,
                        "literals": literals
                    },
                    "types": {
                        "id_entity": i,
                        "entity": entity,
                        "types": types
                    },
                }

                for predicate in predicates:
                    for obj in predicates[predicate]:
                        datatype = obj["mainsnak"]["datatype"]

                        if check_skip(obj, datatype):
                            continue

                        if datatype == "wikibase-item" or datatype == "wikibase-property":
                            value = obj["mainsnak"]["datavalue"]["value"]["id"]

                            # P106 values are stored under the "P31" key as well
                            if predicate == "P31" or predicate == "P106":
                                types["P31"].append(value)

                            objects.setdefault(value, []).append(predicate)
                        else:
                            literal_type = DATATYPES_MAPPINGS.get(datatype)
                            if literal_type is None:
                                # Datatype missing from DATATYPES_MAPPINGS: skip
                                # the claim instead of aborting with KeyError.
                                continue
                            value = get_value(obj, datatype)
                            literals[literal_type].setdefault(predicate, []).append(value)

                # Store the processed item in the corresponding dictionary
                final_items[i] = join["items"]
                final_objects[i] = join["objects"]
                final_literals[i] = join["literals"]
                final_types[i] = join["types"]

            # Stop once 30000 parsed lines have been counted (the line that
            # hits the limit is still processed above).
            if counter == 30000:
                break
            counter += 1

# After processing, final_items, final_objects, final_literals, and final_types contain the data


  0%|          | 138/1197615 [00:55<132:46:56,  2.51it/s]
  0%|          | 1723/1273810 [08:40<87:30:50,  4.04it/s] 

KeyboardInterrupt: 

In [None]:
# Number of (label, description, id, NER type) rows currently held in `data`
len(data)

In [9]:
import pandas as pd

# Tabular view of the collected rows
df = pd.DataFrame(
    data,
    columns=["English Label", "Description", "ID", "NER Type"],
)
# First rows labelled as both location and organization
df.loc[df['NER Type'] == "LOC/ORG"].head()

Unnamed: 0,English Label,Description,ID,NER Type
30,Gmina Kurów,polish rural gmina in lublin voivodeship,Q433,LOC/ORG
32,Rhône-Alpes,former administrative region of france,Q463,LOC/ORG
34,Museum of Fine Arts of Lyon,"art museum in lyon, france",Q511,LOC/ORG
53,South Holland,province of the netherlands,Q694,LOC/ORG
83,Reggiolo,"town in the province of reggio emilia, emilia-...",Q952,LOC/ORG


In [None]:
from collections import Counter
#### Occurrences of each NER type label in df (run WITH SPARQL filtering)
Counter(df['NER Type'])

In [10]:
# Write df to CSV without the index column. This is the same path the
# multiple-clustering cell appends to through csv_output_path.
df.to_csv('./data/multiple_cluster.csv', index=False)


In [None]:
from collections import Counter
#### Occurrences of each NER type label in df (run WITHOUT SPARQL filtering)
Counter(df['NER Type'])

In [None]:
################################
###  WITH SINGLE CLUSTERING
################################

# Location of the compressed dump
wikidata_dump_path = './my-data/latest-all.json.bz2'

BATCH_SIZE = 100 # Number of entities to insert in a single batch

# First guess for the progress-bar total: size of the compressed dump on disk
# divided by an assumed average entry size.
initial_estimated_average_size = 800
compressed_file_size = os.stat(wikidata_dump_path).st_size
initial_total_lines_estimate = compressed_file_size / initial_estimated_average_size

# Wikidata snak datatype -> literal bucket name
DATATYPES_MAPPINGS = {
    'external-id': 'STRING',
    'quantity': 'NUMBER',
    'globe-coordinate': 'STRING',
    'string': 'STRING',
    'monolingualtext': 'STRING',
    'commonsMedia': 'STRING',
    'time': 'DATETIME',
    'url': 'STRING',
    'geo-shape': 'GEOSHAPE',
    'math': 'MATH',
    'musical-notation': 'MUSICAL_NOTATION',
    'tabular-data': 'TABULAR_DATA'
}
# Distinct bucket names. Sorted because the iteration order of a set of strings
# changes between interpreter runs (hash randomization), which made the order
# of this list non-reproducible.
DATATYPES = sorted(set(DATATYPES_MAPPINGS.values()))

# Pending documents per target collection
buffer = {
    "items": [],
    "objects": [],
    "literals": [],
    "types": []
}

def check_skip(obj, datatype):
    """Report whether a claim (or bare snak) must be skipped.

    :param obj: claim dict holding a "mainsnak", or the snak dict itself
    :param datatype: datatype string of the snak
    :return: True for snaks without a "datavalue" and for lexeme, form and
        sense datatypes; False for everything else
    """
    snak = obj.get("mainsnak", obj)
    skipped_datatypes = {"wikibase-lexeme", "wikibase-form", "wikibase-sense"}
    return True if "datavalue" not in snak else datatype in skipped_datatypes


def get_value(obj, datatype):
    """Extract the literal value of a statement.

    :param obj: a claim dict (its ``"mainsnak"`` entry is read) or a bare snak
        dict (read directly when there is no ``"mainsnak"`` key).
    :param datatype: the datatype string of the snak.
    :return: ``"<latitude>,<longitude>"`` for ``globe-coordinate``; the
        ``amount`` / ``text`` / ``time`` field for ``quantity`` /
        ``monolingualtext`` / ``time``; the raw ``datavalue.value`` for every
        other datatype.
    :raises KeyError: if the snak has no ``datavalue``/``value`` entry or the
        expected field is missing.
    """
    payload = obj.get("mainsnak", obj)["datavalue"]["value"]

    if datatype == "globe-coordinate":
        return f"{payload['latitude']},{payload['longitude']}"

    # Datatypes whose value is a dict from which a single field is kept.
    field_by_datatype = {
        "quantity": "amount",
        "monolingualtext": "text",
        "time": "time",
    }
    field = field_by_datatype.get(datatype)
    return payload if field is None else payload[field]

# No `global` declaration is needed here: this is module (cell) level code, and
# declaring a name global after it has been assigned in the same cell is a
# SyntaxError ("name ... is assigned to before global declaration").


def _fetch_subclass_ids(root_id):
    """Return, as a set, the item ids found by walking P279 backwards from ``root_id``.

    Thin wrapper around ``get_wikidata_item_tree_item_idsSPARQL`` called with
    ``backward_properties=[279]``.

    :param root_id: numeric id of the root class (e.g. ``43229`` for Q43229).
    :return: set of the ids returned by the SPARQL helper; an empty set (after
        printing a warning) when the response cannot be decoded as JSON, so
        that every ``*_subclass`` name is always defined.
    """
    try:
        return set(get_wikidata_item_tree_item_idsSPARQL([root_id], backward_properties=[279]))
    except json.decoder.JSONDecodeError:
        print(f"WARNING: could not decode the SPARQL response for Q{root_id}; using an empty set")
        return set()


# --- ORGANIZATION: subclasses of organization (Q43229) minus the classes below ---
organization_subclass = _fetch_subclass_ids(43229)
country_subclass = _fetch_subclass_ids(6256)
city_subclass = _fetch_subclass_ids(515)
capitals_subclass = _fetch_subclass_ids(5119)
admTerr_subclass = _fetch_subclass_ids(15916867)
# Ids aligned with the variable names: family is Q8436 and venue is Q17350442.
# Both are excluded below, so the resulting organization set is unaffected.
family_subclass = _fetch_subclass_ids(8436)
sportLeague_subclass = _fetch_subclass_ids(623109)
venue_subclass = _fetch_subclass_ids(17350442)

for _excluded in (
    country_subclass,
    city_subclass,
    capitals_subclass,
    admTerr_subclass,
    family_subclass,
    sportLeague_subclass,
    venue_subclass,
):
    organization_subclass -= _excluded

# --- LOCATION: subclasses of geographic location (Q2221906) minus the classes below ---
geolocation_subclass = _fetch_subclass_ids(2221906)
food_subclass = _fetch_subclass_ids(2095)
edInst_subclass = _fetch_subclass_ids(2385804)
govAgency_subclass = _fetch_subclass_ids(327333)
intOrg_subclass = _fetch_subclass_ids(484652)
timeZone_subclass = _fetch_subclass_ids(12143)

for _excluded in (
    food_subclass,
    edInst_subclass,
    govAgency_subclass,
    intOrg_subclass,
    timeZone_subclass,
):
    geolocation_subclass -= _excluded

# Both results are kept as sets (not lists) so that the per-claim
# `numeric_id in ..._subclass` tests in the dump loop are constant-time.

# Stream the Wikidata dump line by line, build the items / objects / literals /
# types documents for every entity listed in `mapping`, and insert them in
# batches of BATCH_SIZE through flush_buffer.
with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
    count = 1000

    # NOTE(review): these four lists are created here but nothing in this cell
    # appends to them, so cells that read them afterwards see empty lists --
    # confirm whether they were meant to collect entities per NER type.
    ORG = []
    PERS = []
    LOC = []
    OTHERS = []

    # Built once instead of once per dump line; a set also makes every lookup
    # constant-time.  Assumes the values of `mapping` are hashable entity ids
    # (they are compared with item['id'] below) -- TODO confirm.
    wanted_entities = set(mapping.values())
    location_keywords = ("district", "city", "country", "capital")

    # The context manager closes the progress bar even if the loop raises.
    with tqdm(total=initial_total_lines_estimate) as pbar:
        for i, line in enumerate(f):
            try:
                # Each entity sits on its own line followed by ",\n", except the
                # last one, which has no trailing comma.  Stripping only what is
                # really there keeps that last entity from being truncated.  The
                # "[" and "]" bracket lines still fail to parse and are skipped.
                item = json.loads(line.rstrip().rstrip(","))
            except json.decoder.JSONDecodeError:
                continue

            entity = item['id']
            labels = item.get("labels", {})
            english_label = labels.get("en", {}).get("value", "")
            aliases = item.get("aliases", {})
            description = item.get('descriptions', {}).get('en', {})
            category = "entity"
            sitelinks = item.get("sitelinks", {})
            popularity = max(len(sitelinks), 1)
            # A missing "claims" key is treated as "no statements" instead of
            # aborting the whole import with a KeyError.
            claims = item.get("claims", {})

            if entity not in wanted_entities:
                continue

            all_labels = {lang: labels[lang]["value"] for lang in labels}
            # Aliases are de-duplicated per language (order is not preserved).
            all_aliases = {
                lang: list({alias["value"] for alias in aliases[lang]})
                for lang in aliases
            }

            # Anything that is a subclass of something is a type; properties
            # (ids starting with "P") are predicates and take precedence.
            if "P279" in claims:
                category = "type"
            if entity[0] == "P":
                category = "predicate"

            # NOTE(review): the progress bar only advances for entities found in
            # `mapping`, while its total estimates all dump lines -- confirm
            # whether it should advance once per line instead.
            line_size = len(line)
            current_average_size = update_average_size(line_size)
            pbar.total = round(compressed_file_size / current_average_size)
            pbar.update(1)

            ###############################################################
            # ORGANIZATION EXTRACTION
            # All items with the root class Organization (Q43229) excluding country (Q6256), city (Q515), capitals (Q5119),
            # administrative territorial entity of a single country (Q15916867), venue (Q17350442), sports league (Q623109)
            # and family (Q8436)

            # LOCATION EXTRACTION
            # All items with the root class Geographic Location (Q2221906) excluding: food (Q2095), educational institution (Q2385804),
            # government agency (Q327333), international organization (Q484652) and time zone (Q12143)

            # PERSON EXTRACTION
            # All items with the statement is instance of (P31) human (Q5) are classified as person.

            NERtype = None

            if item.get("type") == "item" and "claims" in item:
                # The description keyword test does not depend on the claim, so
                # it is evaluated once per entity.
                description_text = description.get('value', '').lower()
                looks_like_location = any(k in description_text for k in location_keywords)

                # Entities without any P31 claim stay "OTHERS".
                NERtype = "OTHERS"
                # NOTE(review): every P31 claim overwrites the previous verdict,
                # so the LAST claim decides the type -- confirm this is intended.
                for claim in claims.get("P31", []):
                    mainsnak = claim.get("mainsnak", {})
                    datavalue = mainsnak.get("datavalue", {})
                    numeric_id = datavalue.get("value", {}).get("numeric-id")

                    if numeric_id == 5:
                        NERtype = "PERS"
                    elif numeric_id in geolocation_subclass or looks_like_location:
                        NERtype = "LOC"
                    elif numeric_id in organization_subclass:
                        NERtype = "ORG"
                    else:
                        NERtype = "OTHERS"

            ################################################################
            # URL EXTRACTION
            # Only string concatenation happens here, so there is nothing for a
            # JSONDecodeError handler to catch.
            # NOTE(review): the English label is used as the Wikipedia/DBpedia
            # page title, and an entity without an English label yields an empty
            # language prefix -- confirm this approximation is acceptable.
            lang = labels.get("en", {}).get("language", "")
            wikidata_id = item['id']
            page_title = labels.get("en", {}).get("value", "").replace(" ", "_")

            url_dict = {
                "wikidata": "http://www.wikidata.org/wiki/" + wikidata_id,
                "wikipedia": "http://" + lang + ".wikipedia.org/wiki/" + page_title,
                # Upper-case only the first character: str.capitalize() would
                # also lower-case the rest ("South Holland" -> "South holland").
                "dbpedia": "http://dbpedia.org/resource/" + page_title[:1].upper() + page_title[1:],
            }

            ################################################################

            objects = {}
            literals = {datatype: {} for datatype in DATATYPES}
            types = {"P31": []}
            join = {
                "items": {
                    "id_entity": i,
                    "entity": entity,
                    "description": description,
                    "labels": all_labels,
                    "aliases": all_aliases,
                    "types": types,
                    "popularity": popularity,
                    "kind": category,   # kind (entity, type or predicate, disambiguation or category)
                    ######################
                    # new updates
                    "NERtype": NERtype, # (ORG, LOC, PERS or OTHERS; None for non-item entities)
                    "URLs" : url_dict
                    ######################
                },
                "objects": {
                    "id_entity": i,
                    "entity": entity,
                    "objects": objects
                },
                "literals": {
                    "id_entity": i,
                    "entity": entity,
                    "literals": literals
                },
                "types": {
                    "id_entity": i,
                    "entity": entity,
                    "types": types
                },
            }

            # Fill `objects`, `literals` and `types` in place; `join` already
            # references these same dict objects.
            for predicate, statements in claims.items():
                for obj in statements:
                    datatype = obj["mainsnak"]["datatype"]

                    if check_skip(obj, datatype):
                        continue

                    if datatype == "wikibase-item" or datatype == "wikibase-property":
                        value = obj["mainsnak"]["datavalue"]["value"]["id"]

                        # Both "instance of" (P31) and "occupation" (P106)
                        # targets are recorded under the "P31" key.
                        if predicate == "P31" or predicate == "P106":
                            types["P31"].append(value)

                        objects.setdefault(value, []).append(predicate)
                    else:
                        value = get_value(obj, datatype)
                        lit = literals[DATATYPES_MAPPINGS[datatype]]
                        lit.setdefault(predicate, []).append(value)

            for key in buffer:
                buffer[key].append(join[key])

            if len(buffer["items"]) >= BATCH_SIZE:
                flush_buffer(buffer)

    # Insert the last, partially filled batch; without this the final
    # (< BATCH_SIZE) documents would never reach the database.
    flush_buffer(buffer)

In [None]:
json_file_path = "./yago_wiki_classification.json"

# Bundle the four NER buckets under their tag names.
data = dict(ORG=ORG, LOC=LOC, PERS=PERS, OTHERS=OTHERS)

# Dump the bundle as indented JSON; a failure is reported on stdout rather than raised.
try:
    with open(json_file_path, mode='w') as sink:
        json.dump(data, sink, indent=4)
    print(f"Data saved successfully to {json_file_path}")
except Exception as e:
    print(f"Error saving data to JSON file: {e}")

In [None]:
# Imported in this cell so that it runs on a fresh kernel; without it `parser`
# is only defined if a later cell of the notebook has already been executed.
from carbontracker import parser

# Parse every carbontracker log found in the current directory and print
# selected fields of the first one.
logs = parser.parse_all_logs(log_dir="./")

if not logs:
    # Avoid an IndexError on logs[0] when no log file has been written yet.
    print("No carbontracker logs found in ./")
else:
    first_log = logs[0]

    print(f"Output file name: {first_log['output_filename']}")
    print(f"Standard file name: {first_log['standard_filename']}")
    print(f"Stopped early: {first_log['early_stop']}")
    print(f"Measured consumption: {first_log['actual']}")
    print(f"Predicted consumption: {first_log['pred']}")
    print(f"Measured GPU devices: {first_log['components']['gpu']['devices']}")

In [None]:
# Size of each NER bucket.
total_length_PERS, total_length_ORG = len(PERS), len(ORG)
total_length_LOC, total_length_OTHERS = len(LOC), len(OTHERS)

# Report the individual sizes.
print("Total lengths:")
print(f"Length of PERS: {total_length_PERS}")
print(f"Length of ORG: {total_length_ORG}")
print(f"Length of LOC: {total_length_LOC}")
print(f"Length of OTHERS: {total_length_OTHERS}")

# Combined number of entries over the four buckets.
total_length = sum(
    (total_length_PERS, total_length_ORG, total_length_LOC, total_length_OTHERS)
)

# Report the combined size.
print(f"Total length: {total_length}")

In [None]:
# Report every entry of OTHERS that also appears in another bucket.
# Membership is tested against sets so the check is linear instead of
# quadratic; the messages now name the pair of buckets actually compared.
# NOTE(review): the value printed after "Entity ID" is the element's position in
# the other list (list.index), not the element itself -- confirm that is wanted.
_pers_lookup = set(PERS)
_loc_lookup = set(LOC)
_org_lookup = set(ORG)

for el in OTHERS:
    if el in _pers_lookup:
        print(f"OTHERS and PERS --> Entity ID: {PERS.index(el)}")
    if el in _loc_lookup:
        print(f"OTHERS and LOC --> Entity ID: {LOC.index(el)}")
    if el in _org_lookup:
        print(f"OTHERS and ORG --> Entity ID: {ORG.index(el)}")

In [None]:
# Convert lists to sets for fast membership tests
ORG_set = set(ORG)
PERS_set = set(PERS)
LOC_set = set(LOC)
OTHERS_set = set(OTHERS)

_all_sets = (ORG_set, PERS_set, LOC_set, OTHERS_set)

# For every distinct element, count in how many of the four sets it appears
# and build a histogram of that multiplicity (1 = unique to one set,
# 4 = present in all of them).  The loop variable is deliberately not called
# `item`, so it does not clobber the `item` used by the dump-processing cell.
overlap_histogram = {1: 0, 2: 0, 3: 0, 4: 0}
for candidate in ORG_set.union(PERS_set, LOC_set, OTHERS_set):
    num_overlaps = sum(candidate in bucket for bucket in _all_sets)
    overlap_histogram[num_overlaps] += 1

# The original variable names are kept so any cell that reads them still works.
# Despite their names they do NOT count per-bucket overlaps: each one holds the
# number of elements contained in exactly 1, 2, 3 or 4 sets respectively.
ORG_counter = overlap_histogram[1]
PERS_counter = overlap_histogram[2]
LOC_counter = overlap_histogram[3]
OTHERS_counter = overlap_histogram[4]

# Print the histogram with labels that describe what was actually counted
print("Number of items by how many sets contain them:")
print(f"In exactly 1 set: {ORG_counter}")
print(f"In exactly 2 sets: {PERS_counter}")
print(f"In exactly 3 sets: {LOC_counter}")
print(f"In all 4 sets: {OTHERS_counter}")

## URL Construction

In [None]:
#! /usr/bin/env python3
# This Python file uses the following encoding: utf-8

__author__ = 'jgeiss'


#############################################################################
# authors: Johanna Geiß, Heidelberg University, Germany                     #
# email: geiss@informatik.uni-heidelberg.de                                 #
# Copyright (c) 2017 Database Research Group,                               #
#               Institute of Computer Science,                              #
#               University of Heidelberg                                    #
#   Licensed under the Apache License, Version 2.0 (the "License");         #
#   you may not use this file except in compliance with the License.        #
#   You may obtain a copy of the License at                                 #
#                                                                           #
#   http://www.apache.org/licenses/LICENSE-2.0                              #
#                                                                           #
#   Unless required by applicable law or agreed to in writing, software     #
#   distributed under the License is distributed on an "AS IS" BASIS,       #
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.#
#   See the License for the specific language governing permissions and     #
#   limitations under the License.                                          #
#############################################################################
# last updated 21.3.2017 by Johanna Geiß

from pymongo import *
from pymongo import errors
import configparser



wikidata_dump_path = './my-data/latest-all.json.bz2'

# Print the Wikidata, Wikipedia and DBpedia URLs built for the first lines of
# the dump (a quick sanity check of the URL scheme, not a full pass).
with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
    # Number of dump lines to inspect; also used as the progress-bar total so
    # the bar and the stop condition agree.
    sample_size = 10000

    for i, line in tqdm(enumerate(f), total=sample_size):
        if i >= sample_size:
            break
        try:
            # Parse JSON data from each line (drops the trailing ",\n")
            data = json.loads(line[:-2])
        except json.decoder.JSONDecodeError:
            # e.g. the opening "[" line of the dump
            continue

        labels = data.get("labels", {})
        lang = labels.get("en", {}).get("language", "")
        entry = {}
        entry["WD_id"] = data['id']
        # NOTE(review): the English label is used as the page title -- confirm
        # this approximation is acceptable.
        entry["WP_id"] = labels.get("en", {}).get("value", "")

        page_title = entry["WP_id"].replace(" ", "_")
        entry["WD_id_URL"] = "http://www.wikidata.org/wiki/" + entry["WD_id"]
        entry["WP_id_URL"] = "http://" + lang + ".wikipedia.org/wiki/" + page_title
        # Upper-case only the first character: str.capitalize() would also
        # lower-case the rest of the title ("South Holland" -> "South holland").
        entry["dbpedia_URL"] = "http://dbpedia.org/resource/" + page_title[:1].upper() + page_title[1:]

        print("------------------")
        print(entry["WD_id_URL"])
        print(entry["WP_id_URL"])
        print(entry["dbpedia_URL"])
        print("------------------")







In [None]:
from carbontracker import parser

# Parse every carbontracker log found in the current directory.
logs = parser.parse_all_logs(log_dir="./")
# Dumps the full parsed structure; output may be long.
print(logs)
# Only the first parsed log is reported below (raises IndexError if none was found).
first_log = logs[0]

# Selected fields of that log, read by key from the parsed dict.
print(f"Output file name: {first_log['output_filename']}")
print(f"Standard file name: {first_log['standard_filename']}")
print(f"Stopped early: {first_log['early_stop']}")
print(f"Measured consumption: {first_log['actual']}")
print(f"Predicted consumption: {first_log['pred']}")
# NOTE(review): assumes the log has a 'gpu' component entry -- confirm for CPU-only runs.
print(f"Measured GPU devices: {first_log['components']['gpu']['devices']}")