<div class="alert alert-success">
Jump to the final section for the generation of gazetteers from prefetched Wikidata queries
</div>

In [None]:
import time
import json
import pickle 

import requests
import pandas as pd

from tqdm.notebook import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON

# Get all subclasses labels

In [None]:
# User-Agent string handed to SPARQLWrapper; it is left empty here.
# NOTE(review): Wikimedia's User-Agent policy asks clients to identify themselves —
# consider filling this in with a contact address before running the queries.
agent = ""
# Client for the Wikidata SPARQL endpoint, used by the paginated queries further down.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=agent)

# Gazetteer category name -> Wikidata QID of the root class queried for that category.
classes = {
    "Organization": "Q43229", #includes companies
    "Name": "Q82799",
    "Artist": "Q483501",
    "Geolocation": "Q2221906",
    "City": "Q515",
    "Capital": "Q5119",
    "Town": "Q3957",
    "Demonym": "Q217438",
    "Product": "Q2424752",
    "Brand": "Q431289",
    "Georegion": "Q82794",
    "Country": "Q6256",
    "Given name": "Q202444",
    "Family name": "Q101352"
}

In [None]:
# Wikidata Query Service endpoint, queried directly over HTTP with `requests`.
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
# Category label -> raw SPARQL JSON response listing that category's subclasses.
subclasses = {}

for label, wikidata_code in tqdm(classes.items()):
    print(label, wikidata_code)
    # Direct subclasses (P279) and subclasses of subclasses (P279/P279) of the root class.
    query = "SELECT distinct ?class ?classLabel " + \
            "WHERE { ?class wdt:P279|wdt:P279/wdt:P279 wd:" + wikidata_code + ". " + \
            "        SERVICE wikibase:label { bd:serviceParam wikibase:language 'en' } }"
    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, params={'query': query, 'format': 'json'}, timeout=120)
        # Surface HTTP errors (rate limiting, server errors) instead of an opaque JSON decoding failure.
        response.raise_for_status()
        data = response.json()
        subclasses[label] = data
    except Exception as e:
        # Best effort: report and move on; the label is then simply absent from `subclasses`.
        print(f'{label}: {e}')

In [None]:
# Reload the raw subclass responses from the file written by the pickle.dump cell further down.
# NOTE(review): unpickling can execute arbitrary code — only load files you produced yourself.
with open('subclasses.pickle', 'rb') as f:
    subclasses = pickle.load(f)

In [None]:
# Show which categories currently have subclass data.
subclasses.keys()

In [None]:
# Wikidata Query Service endpoint, queried directly over HTTP with `requests`.
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'

# Fetch subclasses for the two name categories only; other entries of `subclasses` are left untouched.
for label, wikidata_code in tqdm(classes.items()):
    if label not in ["Family name", "Given name"]:
        continue
    print(label, wikidata_code)
    # Direct subclasses (P279) and subclasses of subclasses (P279/P279) of the root class.
    query = "SELECT distinct ?class ?classLabel " + \
            "WHERE { ?class wdt:P279|wdt:P279/wdt:P279 wd:" + wikidata_code + ". " + \
            "        SERVICE wikibase:label { bd:serviceParam wikibase:language 'en' } }"
    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, params={'query': query, 'format': 'json'}, timeout=120)
        # Surface HTTP errors (rate limiting, server errors) instead of an opaque JSON decoding failure.
        response.raise_for_status()
        data = response.json()
        subclasses[label] = data
    except Exception as e:
        # Best effort: report and move on; any previous value for this label is kept.
        print(f'{label}: {e}')

In [None]:
# subclasses = pickle.load(open('subclasses.pickle', 'rb'))

In [None]:
# Report, per category, how many subclass bindings the query returned.
for category, payload in subclasses.items():
    print(category)
    n_bindings = len(payload['results']['bindings'])
    print(n_bindings, 'subclasses')

In [None]:
# Peek at the first binding returned for 'Organization' to see the response structure.
print(subclasses['Organization']['results']['bindings'][0])

In [None]:
subclasses_data = []
counter = 0
for class_label in subclasses:
    for result in subclasses[class_label]['results']['bindings']:
        subclass, subclass_label = result['class']['value'], result['classLabel']['value']
        if subclass_label.split('Q')[-1].isnumeric():
            counter += 1
            continue
        
        subclasses_data.append({ # 'class_uri': subclass,
                                'class_qid': subclass.split('/')[-1],
                                'class_label': class_label,
                                'subclass_label': subclass_label.lower()})

counter

In [None]:
# Cache the raw responses; the pickle.load cell near the top of the notebook reads this file.
with open('subclasses.pickle', 'wb') as f:
    pickle.dump(subclasses, f)
# Tabular form of the flattened subclasses, read back in the "Combine Everything Together" section.
subclasses_df = pd.DataFrame(subclasses_data)
subclasses_df.to_csv('csv/subclasses_df.csv', index=False)

In [None]:
# Preview the first rows of the subclass table.
subclasses_df.head()

In [None]:
# NOTE(review): `unused_classes` is not assigned in any cell above this one, so on a fresh
# kernel this raises NameError — confirm where it is meant to come from.
pd.DataFrame(unused_classes).head()

In [None]:
# NOTE(review): `unused_classes` is not assigned in any cell above this one, so on a fresh
# kernel this raises NameError — confirm where it is meant to come from.
pd.DataFrame(unused_classes).to_csv('csv/unused_subclasses.csv', index=False)

# Count the number of instances from every class

In [None]:
# Wikidata Query Service endpoint, queried directly over HTTP with `requests`.
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
# Category label -> list of SPARQL bindings, one per matching entity.
instances = {}

for label, wikidata_code in classes.items():
    print(label, wikidata_code)
    # Entities that are an instance (P31) of the root class or of any transitive subclass (P279*).
    query = "SELECT ?entity " + \
            "WHERE { ?entity wdt:P31/wdt:P279* wd:" + wikidata_code + ". " + \
            "        SERVICE wikibase:label { bd:serviceParam wikibase:language 'en' } }"
    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, params={'query': query, 'format': 'json'}, timeout=120)
        # Surface HTTP errors (rate limiting, server errors) instead of an opaque JSON decoding failure.
        response.raise_for_status()
        data = response.json()
        instances[label] = data['results']['bindings']
    except Exception as e:
        # Best effort: report and move on; the label is then simply absent from `instances`.
        print(f'{label}: {e}')

In [None]:
# Report the number of instances fetched per category.
for label in classes:
    print(label)
    if label in instances:
        print(len(instances[label]), 'results')
    else:
        # The fetch cell swallows request errors, so a category can be missing here.
        print('no results (query failed)')

# Get the classes for each entity

In [None]:
# NOTE(review): this cell sits before the cell that populates `results_classes`; on a fresh
# kernel the name may not exist yet — confirm the intended execution order.
results_classes.keys()

In [None]:
# From https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Cats
# Keep results that are already in the kernel (only 'Family name' is re-fetched below);
# start from an empty dict on a fresh kernel instead of failing with NameError.
results_classes = globals().get('results_classes', {})
limit = 10000        # rows per page
max_retries = 10     # consecutive failures tolerated on one page before giving up on the label

for label, wikidata_code in tqdm(classes.items()):
    if label != 'Family name':
        continue
    print(label, wikidata_code)
    results_classes[label] = []
    offset = 0       # index of the page being fetched; the SPARQL OFFSET is offset * limit
    retries = 0
    while True:
        # Entities that are instances of the root class (up to two subclass hops away), each with
        # the ';'-joined list of classes it is a direct instance of.
        # NOTE(review): there is no ORDER BY, so paging relies on the endpoint returning rows in a
        # stable order between requests — confirm this is acceptable.
        query = """
                    SELECT distinct  ?entity (GROUP_CONCAT(?class ; SEPARATOR = ';') as ?classes)
                    WHERE 
                    {{
                         ?entity wdt:P31|wdt:P31/wdt:P279|wdt:P31/wdt:P279/wdt:P279 wd:{};
                                 wdt:P31 ?class.
                    }}
                    GROUP BY ?entity
                    OFFSET {}
                    LIMIT {}
                    """.format(wikidata_code, offset*limit, limit)
        if offset == 0 and retries == 0:
            print(query)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        try:
            # Run the query exactly once per page (it used to be executed twice).
            result = sparql.query().convert()['results']['bindings']
        except Exception as e:
            retries += 1
            reason = e
            if str(e).startswith('EndPointInternalError'):
                reason = 'Wikidata TimeoutException'
                time.sleep(30)
            print(f'Exception at label {label} (offset {offset}) : {reason}')
            if retries >= max_retries:
                print(f'Giving up on label {label} after {retries} failed attempts (offset {offset})')
                break
            # Retry the same page rather than silently skipping it.
            continue

        retries = 0
        offset += 1
        if len(result) > 0:
            results_classes[label].append(result)
            print(f'{len(result)} of results returned at offset {offset}')
        else:
            print(f'No more results returned (offset {offset})')
            break

In [None]:
# Peek at the second binding of the first page stored for 'Name'.
# NOTE(review): the fetch loop above only (re)fetches 'Family name', so 'Name' must already be
# present in `results_classes` from an earlier run — confirm.
print(results_classes['Name'][0][1])

In [None]:
# Flatten every fetched page into one record per (category, entity) row.
entity_classes_data = []

for category, pages in results_classes.items():
    for page in pages:
        for binding in page:
            entity_uri = binding['entity']['value']
            class_uris = binding['classes']['value'].split(';')
            # Keep only the trailing QID of each URI.
            class_qids = [uri.split('/')[-1] for uri in class_uris]
            entity_classes_data.append({
                'entity_qid': entity_uri.split('/')[-1],
                'entity_label': category,   # holds the category name, not the entity's own label
                'entity_classes': ';'.join(class_qids),
            })

In [None]:
# pickle.dump(results_classes, open('results_classes.pickle', 'wb'))

In [None]:
# Tabular form of the flattened entity/class records; the bare name displays the frame.
df_entity_classes = pd.DataFrame(entity_classes_data)
df_entity_classes

In [None]:
# Show which categories have entity/class results.
results_classes.keys()

In [None]:
# Save for the "Combine Everything Together" section, which reads this file back.
# NOTE(review): assumes the csv/ directory already exists — confirm.
df_entity_classes.to_csv('csv/entity_classes.csv', index=False)

# Get the labels for each entity

In [None]:
%%time
# From https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Cats
labels_results = {}
limit = 20000

for label, wikidata_code in tqdm(classes.items()):
    print(label, wikidata_code)
    labels_results[label] = []
    offset = 0
    while True:
        try:
             #  ?entity ?entityLabel ?entityAltLabel ?entityDescription
            query = """
                    SELECT DISTINCT ?entity ?entityLabel ?entityAltLabel ?entityDescription
                    WHERE
                    {{
                      {{
                        SELECT DISTINCT ?entity WHERE {{
                          ?entity wdt:P31|wdt:P31/wdt:P279|wdt:P31/wdt:P279/wdt:P279 wd:{};
                                  wdt:P31 ?class
                        }}
                        OFFSET {}
                        LIMIT {}
                      }}
                      SERVICE wikibase:label {{ bd:serviceParam wikibase:language 'en' }}
                    }}
                    """.format(wikidata_code, offset*limit, limit)
            if offset == 0:
                print(query)
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)
            offset += 1
            if len(sparql.query().convert()['results']['bindings']) > 0:
                result = sparql.query().convert()['results']['bindings']
                labels_results[label].append(result)
                print(f'{len(labels_results[label][-1])} of results returned at offset {offset}')
            else:
                print(f'No more results returned (offset {offset})')
                break
        except Exception as e:
            print(f'Exception at label {label} (offset {offset}) : {e}')

In [None]:
# Re-fetch the labels for 'Family name' only, using smaller pages than the full run above.
limit = 10000        # rows per page
max_retries = 10     # consecutive failures tolerated on one page before giving up on the label

for label, wikidata_code in tqdm(classes.items()):
    if label != 'Family name':
        continue
    print(label, wikidata_code)
    labels_results[label] = []
    offset = 0       # index of the page being fetched; the SPARQL OFFSET is offset * limit
    retries = 0
    while True:
        # The inner SELECT pages through the entities; the label service then attaches the
        # English label, alternative labels and description to each of them.
        query = """
                    SELECT DISTINCT ?entity ?entityLabel ?entityAltLabel ?entityDescription
                    WHERE
                    {{
                      {{
                        SELECT DISTINCT ?entity WHERE {{
                          ?entity wdt:P31|wdt:P31/wdt:P279|wdt:P31/wdt:P279/wdt:P279 wd:{};
                                  wdt:P31 ?class
                        }}
                        OFFSET {}
                        LIMIT {}
                      }}
                      SERVICE wikibase:label {{ bd:serviceParam wikibase:language 'en' }}
                    }}
                    """.format(wikidata_code, offset*limit, limit)
        if offset == 0 and retries == 0:
            print(query)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        try:
            # Run the query exactly once per page (it used to be executed twice).
            result = sparql.query().convert()['results']['bindings']
        except Exception as e:
            retries += 1
            print(f'Exception at label {label} (offset {offset}) : {e}')
            if retries >= max_retries:
                # A persistent error used to advance the offset forever; stop instead.
                print(f'Giving up on label {label} after {retries} failed attempts (offset {offset})')
                break
            # Retry the same page rather than silently skipping it.
            continue

        retries = 0
        offset += 1
        if len(result) > 0:
            labels_results[label].append(result)
            print(f'{len(result)} of results returned at offset {offset}')
        else:
            print(f'No more results returned (offset {offset})')
            break

In [None]:
# Show which categories have label results.
labels_results.keys()

In [None]:
# Number of fetched pages per category.
for category, pages in labels_results.items():
    print(category, len(pages))

In [None]:
# Cache the raw label responses; the context manager makes sure the file is flushed and closed.
with open('gazetteers_v1.2_family_names_added_results.pickle', 'wb') as f:
    pickle.dump(labels_results, f)
# labels_results = pickle.load(open('gazetteers_v1.0_results.pickle', 'rb'))

In [None]:
# Peek at the second binding of the first page fetched for 'Name'.
labels_results['Name'][0][1]

In [None]:
# Flatten every fetched page into one record per (category, entity) row.
entity_labels_data = []

# Output column -> SPARQL variable it is read from; a variable missing from a binding yields ''.
optional_fields = (
    ('entity_label', 'entityLabel'),
    ('entity_alt_label', 'entityAltLabel'),
    ('entity_description', 'entityDescription'),
)

for category in tqdm(labels_results):
    for page in labels_results[category]:
        for binding in page:
            record = {
                'entity_qid': binding['entity']['value'].split('/')[-1],
                'entity_class': category,
            }
            for column, variable in optional_fields:
                record[column] = binding[variable]['value'] if variable in binding else ''
            entity_labels_data.append(record)

In [None]:
# Tabular form of the flattened entity/label records; the bare name displays the frame.
df_entity_labels = pd.DataFrame(entity_labels_data)
df_entity_labels

In [None]:
# Save for the "Combine Everything Together" section, which reads this file back.
# NOTE(review): assumes the csv/ directory already exists — confirm.
df_entity_labels.to_csv('csv/entity_labels.csv', index=False)

# Combine Everything Together 

In [None]:
# Reload the prefetched tables. `keep_default_na=False` keeps every cell as the string that was
# written: by default pandas turns empty cells and labels such as "None", "NA" or "null" into
# float NaN, which breaks the string operations (`.split`, `.lower`) applied to labels below.
subclasses_df = pd.read_csv('csv/subclasses_df.csv', keep_default_na=False)
df_entity_labels = pd.read_csv('csv/entity_labels.csv', keep_default_na=False)
df_entity_classes = pd.read_csv('csv/entity_classes.csv', keep_default_na=False)

<div class="alert alert-warning">
df_entity_labels contains more entries because, for Geolocation and Georegion, it only goes 1 link deeper
</div>

In [None]:
# Row count and a short preview of the entity-label table.
print(len(df_entity_labels))
df_entity_labels.head(3)

In [None]:
# Row count and a short preview of the entity-class table.
print(len(df_entity_classes))
df_entity_classes.head(3)

In [None]:
# Row count and a short preview of the subclass table.
print(len(subclasses_df))
subclasses_df.head(3)

In [None]:
# Subclass label -> lower-cased top-level category; if a label occurs in several rows the last one wins.
subclass2class = dict(zip(subclasses_df.subclass_label, subclasses_df.class_label.str.lower()))
# The context manager makes sure the pickle is flushed and the handle closed.
with open('subclass2class.pickle', 'wb') as f:
    pickle.dump(subclass2class, f)

In [None]:
# Lookups keyed by entity QID. A QID that appears in several rows keeps the value of its last row.
qid2class = {
    qid: entity_class
    for qid, entity_class in zip(df_entity_labels['entity_qid'], df_entity_labels['entity_class'])
}
qid2label = {
    qid: entity_label
    for qid, entity_label in zip(df_entity_labels['entity_qid'], df_entity_labels['entity_label'])
}
qid2altlabel = {
    qid: alt_label
    for qid, alt_label in zip(df_entity_labels['entity_qid'], df_entity_labels['entity_alt_label'])
}

In [None]:
# Lookups keyed by subclass QID. Note that this rebinds `subclass2class`: it now maps
# QID -> category instead of subclass label -> category.
subclass2class = {
    qid: category
    for qid, category in zip(subclasses_df['class_qid'], subclasses_df['class_label'])
}
subclass2label = {
    qid: subclass_name
    for qid, subclass_name in zip(subclasses_df['class_qid'], subclasses_df['subclass_label'])
}

In [None]:
# Subclass label -> set of QIDs of the entities listing that subclass in `entity_classes`.
entity_per_class = {}
# Class QIDs met on entities that have no entry in `subclass2label`.
not_found = []

# Iterate the three columns directly. The per-row class list is deliberately NOT bound to
# `classes`, so the global `classes` dict of root QIDs stays intact for re-running earlier cells.
rows = zip(
    df_entity_classes['entity_classes'],
    df_entity_classes['entity_label'],
    df_entity_classes['entity_qid'],
)
for entity_classes, label, qid in tqdm(rows, total=len(df_entity_classes)):
    for cls in entity_classes.split(';'):
        if cls not in subclass2label:
            # Unknown subclass: fall back to the entity's top-level category.
            not_found.append(cls)
            cls_label = label.lower()
        else:
            cls_label = subclass2label[cls].lower()

        entity_per_class.setdefault(cls_label, set()).add(qid)

In [None]:
# Check whether 'male given name' occurs among the subclass labels.
any(subclasses_df.subclass_label == 'male given name')

In [None]:
# Keys of entity_per_class that contain the substring 'name'.
[l for l in entity_per_class.keys() if 'name' in l.lower()]

In [None]:
# Subclass label -> set of lower-cased, single-token entity labels belonging to it.
subclass2labels = {}

for subclass in entity_per_class:
    subclass2labels[subclass] = set()
    for entity in entity_per_class[subclass]:
        entity_label = qid2label.get(entity)
        # Labels read back from CSV can be float NaN; those, and empty labels, cannot be
        # gazetteer entries and previously crashed on `.split`.
        if not isinstance(entity_label, str) or not entity_label:
            continue
        # Keep only labels without a space, i.e. single-token entries.
        if len(entity_label.split(' ')) == 1:
            subclass2labels[subclass].add(entity_label.lower())

In [None]:
# Spot check: list every subclass whose label set contains 'abdul'.
for subclass_name, label_set in subclass2labels.items():
    if 'abdul' not in label_set:
        continue
    print(subclass_name)

In [None]:
# The 30 subclasses with the most labels, as (label count, subclass) pairs, largest first.
[(len(es), l) for l, es in sorted(subclass2labels.items(), key=lambda x: -len(x[1]))[:30]]

In [None]:
# Invert subclass2labels: label -> list of subclasses that contain it.
label2subclasses = {}

for subclass_name, label_set in subclass2labels.items():
    for entry in label_set:
        label2subclasses.setdefault(entry, []).append(subclass_name)

In [None]:
# Spot check: subclasses under which the label 'morocco' was collected.
label2subclasses['morocco']

In [None]:
# Persist both directions of the gazetteer mapping; context managers make sure each file is
# flushed and closed.
with open('subclass2labels.pickle', 'wb') as f:
    pickle.dump(subclass2labels, f)
with open('label2subclasses.pickle', 'wb') as f:
    pickle.dump(label2subclasses, f)

# Pick the final categories

In [None]:
# The 300 subclasses with the most labels, as (label count, subclass) pairs, largest first.
[(len(es), l) for l, es in sorted(subclass2labels.items(), key=lambda x: -len(x[1]))[:300]]