# Wikidata - External Reference Counts

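Count the references to external resources made by Wikidata entities that are either properties or items taking part in a subclass hierarchy (i.e., items that have a superclass or subclass via P279). Items that are only instances are excluded, to avoid the very large number of exact match references they would contribute.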

Results are shown in cell 3 for IRIs with more than 50 references.

In [1]:
import csv
import io
import time

from SPARQLWrapper import SPARQLWrapper, CSV

# Query endpoint: QLever's Wikidata mirror.
# The official WDQS endpoint ("https://query.wikidata.org/sparql") is not used due to time outs.
sparql = SPARQLWrapper("https://qlever.cs.uni-freiburg.de/api/wikidata")

# Shared by both queries: keep only targets that are external (non-Wikidata) references
_external_only_filter = 'FILTER (!CONTAINS(str(?ext),"http://www.wikidata.org")) }'

# Property-level references: external super-property (wdt:P2235),
# external sub-property (wdt:P2236), and equivalent property (wdt:P1628)
query_prop_refs = " ".join((
    "PREFIX wdt: <http://www.wikidata.org/prop/direct/>",
    "PREFIX wikibase: <http://wikiba.se/ontology#>",
    "SELECT DISTINCT ?prop ?extPred ?ext WHERE {",
    "VALUES ?extPred {wdt:P2235 wdt:P2236 wdt:P1628}",
    "?prop ?extPred ?ext .",
    _external_only_filter,
))

# Class-level references: equivalent class (P1709), exact match (P2888),
# or narrower external class (P3950).
# The item must also appear on either side of a subclass-of (P279) statement.
query_class_refs = " ".join((
    "PREFIX wdt: <http://www.wikidata.org/prop/direct/>",
    "SELECT DISTINCT ?item ?extPred ?ext WHERE {",
    "VALUES ?extPred {wdt:P1709 wdt:P2888 wdt:P3950}",
    "?item ?extPred ?ext .",
    "{ {?item wdt:P279 ?x} UNION {?y wdt:P279 ?item} }",
    _external_only_filter,
))


# Run query
def run_query(query_str: str, max_retries: int = 5):
    """Run a SPARQL query against the configured endpoint and return CSV text.

    Args:
        query_str: Complete SPARQL query text.
        max_retries: How many times to re-issue the query when the response
            carries a ``retry-after`` header before giving up.

    Returns:
        The CSV response decoded to ``str``, or ``None`` when the endpoint
        answered with a non-200 status or kept requesting retries.
    """
    sparql.setQuery(query_str)
    sparql.setReturnFormat(CSV)
    # Bounded loop instead of recursion, so a server that keeps sending
    # retry-after cannot exhaust the call stack.
    for _ in range(max_retries + 1):
        results = sparql.query()
        if str(results.response.status) != "200":
            print('error', results.response.status, query_str)
            return None
        info = results.info()
        if "retry-after" not in info:
            return results.convert().decode()    # convert() yields bytes for CSV; decode to str
        # If using WDQS, avoid hitting query limits: wait as instructed, then retry
        time.sleep(int(info["retry-after"]))
    print('error', 'retry limit reached', query_str)
    return None

In [2]:
# Tally external references per namespace (host name, plus the first path
# segment for purl.org IRIs) across both queries.
queries = (query_prop_refs, query_class_refs)
ext_iris = dict()

for query in queries:
    csv_results = run_query(query)
    if csv_results is None:
        print("Failed to process query: " + query)
        continue
    # Parse with csv.reader rather than str.split: it copes with quoted
    # fields (IRIs containing commas or quotes) and with CRLF line endings.
    for row in csv.reader(io.StringIO(csv_results)):
        if len(row) < 3:
            continue                      # Blank or malformed row
        external = row[2]
        if not external.startswith("http"):
            continue                      # Skips the header row and non-IRI values
        # Everything after the scheme separator, split into host and path segments
        iri_segments = external.partition("//")[2].split("/")
        final_iri = iri_segments[0]
        if not final_iri:
            continue                      # No host part present
        # purl.org hosts many vocabularies, so keep the first path segment too
        if "purl.org" in final_iri and len(iri_segments) > 1:
            final_iri = final_iri + "/" + iri_segments[1]
        ext_iris[final_iri] = ext_iris.get(final_iri, 0) + 1

In [3]:
# Order the (namespace, count) pairs from most to least referenced; result is a list of tuples
sorted_references = sorted(ext_iris.items(), key=lambda entry: entry[1], reverse=True)
# Report only namespaces with more than 50 references
frequent_references = [(name, total) for name, total in sorted_references if total > 50]
for name, total in frequent_references:
    print(name, total)

identifiers.org 199777
purl.obolibrary.org 57677
www.orpha.net 8573
publications.europa.eu 5845
www.rhea-db.org 4392
schema.org 791
wordnet-rdf.princeton.edu 543
www.tcdb.org 494
dbpedia.org 424
purl.uniprot.org 406
www.ncbi.nlm.nih.gov 198
www.w3.org 161
www.lexinfo.net 141
id.loc.gov 125
www.uniprot.org 95
pcp-on-web.de 92
purl.org/ontology 79
purl.org/dc 78
purl.org/spar 71
cv.iptc.org 64
d-nb.info 56
purl.org/coar 56
