# Download InTaVia Wikidata CH object metadata

In [None]:
%pip install SPARQLWrapper
%pip install pandas
%pip install numpy
%pip install fsspec
%pip install rdfpandas

In [2]:
import sys
import os
from SPARQLWrapper import SPARQLWrapper, JSON, XML, TURTLE, N3, RDF, RDFXML, CSV, TSV, JSONLD, DIGEST
import pandas as pd
import time
from random import randint
import numpy as np
import requests
from rdflib import Graph
import urllib
import fsspec
from rdfpandas.graph import to_graph, to_dataframe
from rdflib import Graph, URIRef, Literal, BNode, Namespace
from rdflib.namespace import NamespaceManager,CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, VOID, XMLNS, XSD

In [3]:
# Mount Google Drive
# NOTE: `google.colab` is imported here, so this cell is written to run inside Google Colab.
# `files` is used further down to download each saved Turtle file.
from google.colab import drive, files
drive.mount('/content/drive')

# Specify file location (Google Colab and local)
# Base folder on the mounted Drive. Every CSV/Turtle path below is built by plain
# string concatenation onto this value, so the trailing slash is required.
filelocation_google = '/content/drive/MyDrive/Colab Notebooks/InTaVia/Public/'

Mounted at /content/drive


In [4]:
# Module-level client for the Wikidata Query Service.
# NOTE(review): the functions below each build their own SPARQLWrapper instance,
# so nothing visible in this notebook reads this object.
sparql = SPARQLWrapper('https://query.wikidata.org/sparql')
# Define Prefix for SPARQL query
# This block is prepended verbatim to every query string built below
# (`query = prefix + query_content`).
prefix = """
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX schema: <http://schema.org/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX dbr: <http://dbpedia.org/resource/>
    PREFIX dbp: <http://dbpedia.org/property/>
    PREFIX dbc: <http://dbpedia.org/resource/Category:>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbt: <http://dbpedia.org/resource/Template:>
    PREFIX dbyago: <http://dbpedia.org/class/yago/>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX dul: <http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX gnd: <http://d-nb.info/gnd/>
    PREFIX gold: <http://purl.org/linguistics/gold/>
    PREFIX prov: <http://www.w3.org/ns/prov#>
    PREFIX umbelrc: <http://umbel.org/umbel/rc/>
    PREFIX viaf: <http://viaf.org/viaf/>
    PREFIX ore: <http://www.openarchives.org/ore/terms/>
    PREFIX edm: <http://www.europeana.eu/schemas/edm/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdata: <http://www.wikidata.org/wiki/Special:EntityData/>
    PREFIX wdno: <http://www.wikidata.org/prop/novalue/>
    PREFIX wdref: <http://www.wikidata.org/reference/>
    PREFIX wds: <http://www.wikidata.org/entity/statement/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wdtn: <http://www.wikidata.org/prop/direct-normalized/>
    PREFIX wdv: <http://www.wikidata.org/value/>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX eventKG-s: <http://eventKG.l3s.uni-hannover.de/schema/>
    PREFIX eventKG-e: <http://eventKG.l3s.uni-hannover.de/resource/>
"""

## 1. SPARQL queries to build the list of Wikidata objects created by the Wikidata persons found in the InTaVia data (merge_at_nl_fi.csv)

### Count the number of Wikidata objects

In [None]:
def wikidata_object_list():
  """Print how many distinct items and distinct creators Wikidata holds.

  Runs one aggregate SPARQL query against the Wikidata Query Service that
  matches every `?item` reaching a `?creator` through zero or more
  `wdt:P1647` links followed by one `wdt:P170` (creator) link, and prints
  the two distinct counts.

  NOTE: a later cell defines a one-argument function with this same name;
  once that cell has run, it replaces this one in the kernel.

  Returns:
    None. The counts are printed, not returned.

  Raises:
    RuntimeError: if the endpoint answers without any result row.
  """
  endpoint_url = 'https://query.wikidata.org/sparql'
  # Plain (non-f) string: nothing is interpolated here, so the SPARQL braces
  # need no escaping. The text sent to the endpoint is unchanged.
  query_content = """
    SELECT
      (count(distinct ?item) AS ?ItemCount)
      (count(distinct ?creator) AS ?CreatorCount)
    WHERE
    {
      ?item wdt:P1647*/wdt:P170 ?creator .
      #?item wdt:P31 ?itemtype .
      #?item wdt:P1476 ?title .
      #?item wdt:P571 ?inceptiondate .
      #SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
    }
  """
  query = prefix + query_content
  # Same user agent as the per-creator query function in this notebook.
  # TODO adjust user agent; see https://w.wiki/CX6
  user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
  sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
  sparql.setQuery(query)
  sparql.setReturnFormat(JSON)
  results = sparql.query().convert()

  bindings = results["results"]["bindings"]
  if not bindings:
    # Previously this case surfaced as an UnboundLocalError on the print below.
    raise RuntimeError('The count query returned no result row')
  # The query has aggregates and no GROUP BY, so one row is expected.
  counts = bindings[0]
  print('Number of Items: ' + counts["ItemCount"]["value"])
  print('Number of Creators: ' + counts["CreatorCount"]["value"])
wikidata_object_list()


{'head': {'vars': ['ItemCount', 'CreatorCount']}, 'results': {'bindings': [{'ItemCount': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '961191'}, 'CreatorCount': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '208437'}}]}}
Number of Items: 961191
Number of Creators: 208437


### Define a function (with testing URIs)

In [6]:
# Testing URIs
#creator_ref = 'http://www.wikidata.org/entity/Q34661'
#creator_ref = 'http://www.wikidata.org/entity/Q736847'
creator_ref = 'http://www.wikidata.org/entity/Q254'
#creator_ref = 'http://www.wikidata.org/entity/Q1000385'
#creator_ref = 'http://www.wikidata.org/entity/Q433652'

def wikidata_object_list(creator_ref):
  """List the Wikidata items whose creator is `creator_ref`.

  Queries the Wikidata Query Service for every distinct `?item` that reaches
  the given entity through zero or more `wdt:P1647` links followed by one
  `wdt:P170` (creator) link.

  Args:
    creator_ref: full entity URI of the person, e.g.
      'http://www.wikidata.org/entity/Q254'. It is wrapped in `<...>` and
      inserted into the query as an IRI.

  Returns:
    A DataFrame with columns 'item' (item URI) and 'person' (the
    `creator_ref` that was passed in), one row per item. It has zero rows
    but the same two columns when the person has no items.
  """
  endpoint_url = 'https://query.wikidata.org/sparql'
  creator_iriref = '<' + str(creator_ref) + '>'
  query_content=f"""
    SELECT DISTINCT ?item 
    #?title ?itemtype ?itemtypeLabel ?inceptiondate
    WHERE
    {{
      ?item wdt:P1647*/wdt:P170 {creator_iriref} .
      #?item wdt:P31 ?itemtype .
      #?item wdt:P1476 ?title .
      #?item wdt:P571 ?inceptiondate .
      #SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }}
    }}
  """
  query = prefix + query_content
  user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
  # TODO adjust user agent; see https://w.wiki/CX6
  sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
  sparql.setQuery(query)
  sparql.setReturnFormat(JSON)
  results = sparql.query().convert()
  # Raw response is echoed for inspection (this is the dict shown in the cell output).
  print(results)
  # Collect all rows first and build the frame once: DataFrame.append was
  # removed in pandas 2.0, and appending row by row copies the frame each time.
  records = [
    {'item': binding["item"]["value"], 'person': creator_ref}
    for binding in results["results"]["bindings"]
  ]
  # Passing `columns` keeps both columns present even when there are no rows.
  return pd.DataFrame(records, columns=['item', 'person'])

df = wikidata_object_list(creator_ref)
df

{'head': {'vars': ['item']}, 'results': {'bindings': [{'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q3225216'}}, {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q50391437'}}]}}


Unnamed: 0,item,person
0,http://www.wikidata.org/entity/Q3225216,http://www.wikidata.org/entity/Q254
1,http://www.wikidata.org/entity/Q50391437,http://www.wikidata.org/entity/Q254


### Batch processing for the function (Caution: long processing time)

In [None]:
# Get Wikidata URIs from InTaVia data providers
# Testing URIs
#list_Wikidata_URI = ['http://www.wikidata.org/entity/Q34661', 'http://www.wikidata.org/entity/Q736847', 'http://www.wikidata.org/entity/Q1000385', 'http://www.wikidata.org/entity/Q433652']
Wikidata_URI_list_df = pd.read_csv(filelocation_google + 'PersonURIs_CSV/merge_at_nl_fi.csv')
list_Wikidata_URI = Wikidata_URI_list_df['wikidataURI'].to_list()

# Consecutive chunks of 4000 person URIs, named after their start offset.
# NOTE(review): the loop below iterates over the full list, not over one of
# these chunks, while the output file is suffixed "_0" - confirm which input
# is intended before a long run.
list_Wikidata_URI_0 = list_Wikidata_URI[0:4000]
list_Wikidata_URI_4000 = list_Wikidata_URI[4000:8000]
list_Wikidata_URI_8000 = list_Wikidata_URI[8000:12000]
# Fixed: this slice started at 1200 instead of 12000, so the chunk
# overlapped the three previous ones.
list_Wikidata_URI_12000 = list_Wikidata_URI[12000:16000]
list_Wikidata_URI_16000 = list_Wikidata_URI[16000:20000]
list_Wikidata_URI_20000 = list_Wikidata_URI[20000:24000]
list_Wikidata_URI_24000 = list_Wikidata_URI[24000:28000]
list_Wikidata_URI_28000 = list_Wikidata_URI[28000:32000]
list_Wikidata_URI_32000 = list_Wikidata_URI[32000:36000]
list_Wikidata_URI_36000 = list_Wikidata_URI[36000:40000]
list_Wikidata_URI_40000 = list_Wikidata_URI[40000:44000]
list_Wikidata_URI_44000 = list_Wikidata_URI[44000:48000]
list_Wikidata_URI_48000 = list_Wikidata_URI[48000:52000]
list_Wikidata_URI_52000 = list_Wikidata_URI[52000:56000]
list_Wikidata_URI_56000 = list_Wikidata_URI[56000:56300]

# Gather the per-person frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop copies all accumulated rows on every iteration.
frames = []
# Iterate over URI list
for person_uri in list_Wikidata_URI:
    print(str(person_uri) + ' is started')
    df_new = wikidata_object_list(person_uri)
    # Persons without any item contribute no rows, so their empty frame is skipped.
    if not df_new.empty:
        frames.append(df_new)
    print(str(person_uri) + ' completed')
    #Time interval for a next query/iteration
    time.sleep(randint(1,3))

if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the expected two-column layout.
    df = pd.DataFrame(columns=['item', 'person'])
# to_csv returns None when given a path, so its result is not kept.
df.to_csv(filelocation_google + 'WikidataObjectURIlist/URI_list_df_WikidataObject_0.csv')

http://www.wikidata.org/entity/Q1000005 is started
{'head': {'vars': ['item']}, 'results': {'bindings': []}}
http://www.wikidata.org/entity/Q1000005 completed
http://www.wikidata.org/entity/Q100035749 is started
{'head': {'vars': ['item']}, 'results': {'bindings': []}}
http://www.wikidata.org/entity/Q100035749 completed
http://www.wikidata.org/entity/Q1000385 is started
{'head': {'vars': ['item']}, 'results': {'bindings': []}}
http://www.wikidata.org/entity/Q1000385 completed
http://www.wikidata.org/entity/Q1000555 is started
{'head': {'vars': ['item']}, 'results': {'bindings': []}}
http://www.wikidata.org/entity/Q1000555 completed
http://www.wikidata.org/entity/Q1000902 is started
{'head': {'vars': ['item']}, 'results': {'bindings': []}}
http://www.wikidata.org/entity/Q1000902 completed
http://www.wikidata.org/entity/Q1000935 is started
{'head': {'vars': ['item']}, 'results': {'bindings': []}}
http://www.wikidata.org/entity/Q1000935 completed
http://www.wikidata.org/entity/Q1000981 is

## 2. HTTP requests to fetch the metadata of every Wikidata object in the Wikidata object list (URI_list_df_WikidataObject.csv)

### Define a function (with testing URIs)

In [None]:
# Testing URI
wikidataURI = 'http://www.wikidata.org/entity/Q698487'

def fetch_wikidata_metadata(wikidataURI):
  headers = {
    'Accept': 'text/turtle',
    'Content-type': 'text/turtle'
  }
  r = requests.get(wikidataURI, headers=headers)
  try:
    print(r.raise_for_status())
    return(r.text)
  except requests.exceptions.HTTPError as e: 
    print(e)
    return

turtle = fetch_wikidata_metadata(wikidataURI)
print(turtle)

### Batch processing for the function (Caution: long processing time)

In [None]:
# Object list produced in section 1: one row per (item, person) pair.
URI_list_df = pd.read_csv(filelocation_google + 'WikidataObjectURIlist/URI_list_df_WikidataObject.csv')
list_item = URI_list_df['item'].to_list()
list_person = URI_list_df['person'].to_list()

# Iterate over URI list
# Both lists come from the same frame, so zip pairs each item with its person
# without a hand-maintained index counter.
for item, person in zip(list_item, list_person):
  print('Processing ' + str(item) + ' started')
  turtle_wikidata_object = fetch_wikidata_metadata(item)
  # Check if turtle file was fetched
  if turtle_wikidata_object is not None:
    # The entity id is the last path segment of the URI.
    itemid = item.rsplit('/', 1)[1]
    personid = person.rsplit('/', 1)[1]
    filename = str(itemid) + '_by_' + str(personid)
    print(filename + ' is fetched')
    # Saving as Turtle (using itemid + personid)
    ttl_path = filelocation_google + 'WikidataObjectTurtle/' + filename + '.ttl'
    # Explicit UTF-8 so the written file does not depend on the platform's
    # default encoding.
    with open(ttl_path, 'w', encoding='utf-8') as f:
      f.write(turtle_wikidata_object)
    files.download(ttl_path)
  else:
    print('HTTP error: No Turtle can be fetched from ' + str(item))
  print('Processing ' + str(item) + ' completed')
  print('-----------------')
  #Time interval for a next query/iteration
  time.sleep(randint(1,3))

In [None]:
# Blocks this cell forever (until the kernel is interrupted).
# NOTE(review): presumably meant to keep the Colab runtime busy after the batch
# cells - confirm. Sleeping instead of spinning blocks the same way without
# pinning a CPU core at 100%.
while True:
  time.sleep(60)