# Download InTaVia Europeana CH object metadata

In [None]:
%pip install SPARQLWrapper
%pip install pandas

In [42]:
import os
import time
from pathlib import Path
from random import randint
from urllib import parse, error

import numpy as np
import pandas as pd
import requests
from SPARQLWrapper import SPARQLWrapper, JSON, XML, TURTLE, N3, RDF, RDFXML, CSV, TSV, JSONLD, DIGEST

In [None]:
# Mount Google Drive so that the files written by this notebook are stored persistently.
# NOTE: `google.colab` can only be imported inside a Google Colab runtime.
from google.colab import drive, files
drive.mount('/content/drive')

# Base folders for the output files (Google Colab and local).
# The cells of this notebook build their paths from `filelocation_google`.
filelocation_google = '/content/drive/MyDrive/Colab Notebooks/InTaVia/Public/'
filelocation_local = ''

In [36]:
# Wrapper around the Europeana SPARQL endpoint; the same object is reused by every query in this notebook.
sparql = SPARQLWrapper("http://sparql.europeana.eu/")
# Namespace declarations that are prepended to every SPARQL query.
# `dc:` is declared explicitly because the queries use dc:creator / dc:contributor;
# without the declaration they only work on endpoints that predefine that prefix.
prefix = """
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX schema: <http://schema.org/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX dbr: <http://dbpedia.org/resource/>
    PREFIX dbp: <http://dbpedia.org/property/>
    PREFIX dbc: <http://dbpedia.org/resource/Category:>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbt: <http://dbpedia.org/resource/Template:>
    PREFIX dbyago: <http://dbpedia.org/class/yago/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX dul: <http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX gnd: <http://d-nb.info/gnd/>
    PREFIX gold: <http://purl.org/linguistics/gold/>
    PREFIX prov: <http://www.w3.org/ns/prov#>
    PREFIX umbelrc: <http://umbel.org/umbel/rc/>
    PREFIX viaf: <http://viaf.org/viaf/>
    PREFIX ore: <http://www.openarchives.org/ore/terms/>
    PREFIX edm: <http://www.europeana.eu/schemas/edm/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdata: <http://www.wikidata.org/wiki/Special:EntityData/>
    PREFIX wdno: <http://www.wikidata.org/prop/novalue/>
    PREFIX wdref: <http://www.wikidata.org/reference/>
    PREFIX wds: <http://www.wikidata.org/entity/statement/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wdtn: <http://www.wikidata.org/prop/direct-normalized/>
    PREFIX wdv: <http://www.wikidata.org/value/>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX eventKG-s: <http://eventKG.l3s.uni-hannover.de/schema/>
    PREFIX eventKG-e: <http://eventKG.l3s.uni-hannover.de/resource/>
"""

## 1st Query to create a list of URIs for CH objects based on Wikidata URIs

### Check the number of URIs and providedCHO available

**Stats (2022-07-11)**

* URI (http.*): CREATOR URI 444139, CHO 2275236 CONTRIBUTOR URI 159680, CHO 600434

* DBpedia (dbpedia.org/): CREATOR URI 25424, CHO 735237 CONTRIBUTOR URI 15371, CHO 237345

* VIAF (viaf.org/): CREATOR URI 2158, CHO 5993 CONTRIBUTOR URI 2, CHO 167

* GND (d-nb.info/gnd/): CREATOR URI 183267, CHO 1152431 CONTRIBUTOR URI 76977, CHO 280193

* sameAs Wikidata (wikidata.org): CREATOR URI 25718, CHO 725359, wdURI 25717 CONTRIBUTOR URI 14938, CHO 234013, wdURI 14938

* sameAs VIAF: CREATOR URI 38987 CHO 474125, wdURI 25825 CONTRIBUTOR URI 9465 CHO 81372 wdURI 8630

* sameAs GND: CREATOR URI 2062, CHO 17619, wdURI 1606, CONTRIBUTOR URI 199, CHO 2934, wdURI 180



In [37]:
# Aggregate query: how many distinct Europeana proxies, agent URIs and
# Wikidata URIs match the pattern that is currently not commented out.
query_content = """
select 
(count(distinct ?europeana_proxy) AS ?EuropeanaProxyCount)
(count(distinct ?uri) AS ?URICount)
(count(distinct ?wduri) AS ?wdURICount)
where {
  #?europeana_proxy dc:creator ?uri .
  ?europeana_proxy dc:contributor ?uri .
  ?uri owl:sameAs ?wduri .

  #FILTER REGEX(STR(?uri), "http.*")
  #FILTER REGEX(STR(?uri), ".*dbpedia.org/.*")
  #FILTER REGEX(STR(?wduri), ".*viaf.org/.*")
  #FILTER REGEX(STR(?wduri), ".*d-nb.info/gnd/.*")
  FILTER REGEX(STR(?wduri), ".*wikidata.org.*")
  #FILTER REGEX(STR(?uri), "http(?!(://d-nb.info/gnd/.*|://dbpedia.org/resource/.*))")
}
"""

# Send the query and ask for the answer as JSON
query = prefix + query_content
sparql.setReturnFormat(JSON)
sparql.setQuery(query)
results = sparql.query().convert()

# Copy the three counts out of each result row (the values are kept as strings)
for row in results["results"]["bindings"]:
  URICount, EuropeanaProxyCount, wdURICount = (
    row[name]["value"] for name in ("URICount", "EuropeanaProxyCount", "wdURICount")
  )

print('Number of URIs for dc:creator & dc:contributor: ', URICount, sep='')
print('Number of Europeana Proxy URIs: ', EuropeanaProxyCount, sep='')
print('Number of Wikidata URIs in sameAs: ', wdURICount, sep='')

Number of URIs for dc:creator & dc:contributor: 14938
Number of Europeana Proxy URIs: 234013
Number of Wikidata URIs in sameAs: 14938


### Function to get URIs for CH objects based on Wikidata URIs

In [None]:
# Specify the OFFSET number for a single query (e.g. 0, 10000, 20000...)
x = 0

def first_query(x):
  """Fetch one page of Europeana proxy / agent URI / Wikidata URI rows.

  Args:
    x: Row offset of the page to fetch (0, 10000, 20000, ...). It is inserted
      into the OFFSET clause of the query.

  Returns:
    A pandas DataFrame with the columns 'Europeana_proxy', 'External_URI' and
    'Wikidata_URI', one row per result binding. When the endpoint returns no
    bindings the frame is empty but still has these three columns.
  """
  # Europeana only permits 10000 results per request, so the ordered
  # sub-select is paged with LIMIT 10000 and the OFFSET passed in as x.
  # NOTE(review): the sub-select is ordered by ?uri only; rows sharing the same
  # ?uri have no defined order, so page boundaries may not be stable - confirm.
  query_content = """
  select ?europeana_proxy ?uri ?wduri
  where {{
      select DISTINCT ?europeana_proxy ?uri ?wduri
    where {
      #?europeana_proxy dc:creator ?uri .
      ?europeana_proxy dc:contributor ?uri .
      ?uri owl:sameAs ?wduri .

      #FILTER REGEX(STR(?uri), "http.*")
      #FILTER REGEX(STR(?uri), ".*dbpedia.org/.*")
      #FILTER REGEX(STR(?wduri), ".*viaf.org/.*")
      #FILTER REGEX(STR(?wduri), ".*d-nb.info/gnd/.*")
      FILTER REGEX(STR(?wduri), ".*wikidata.org.*")
      #FILTER REGEX(STR(?uri), "http(?!(://d-nb.info/gnd/.*|://dbpedia.org/resource/.*))")
    }
    ORDER BY ASC(?uri)
  }}
  LIMIT 10000
  OFFSET """ + str(x) + """
  """

  sparql.setQuery(prefix + query_content)
  sparql.setReturnFormat(JSON)
  results = sparql.query().convert()

  # Collect all rows first and build the frame once: DataFrame.append() was
  # removed in pandas 2.0 and growing a frame row by row is quadratic.
  columns = ['Europeana_proxy', 'External_URI', 'Wikidata_URI']
  records = [
    {
      'Europeana_proxy': result["europeana_proxy"]["value"],
      'External_URI': result["uri"]["value"],
      'Wikidata_URI': result["wduri"]["value"],
    }
    for result in results["results"]["bindings"]
  ]
  URI_list_df = pd.DataFrame(records, columns=columns)

  # One summary line per page instead of dumping the whole frame for every row
  print(str(len(URI_list_df)) + ' rows fetched at OFFSET ' + str(x))
  return URI_list_df

# Try the function once with the offset x set at the top of this cell
URI_list_df = first_query(x)

### Generate an array for iteration of queries, based on the number of ProvidedCHO

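The paged query returns rows up to offset 760,000, while the count query suggested 725,451. The cause has not been confirmed; a likely explanation is that the count query counts distinct `?europeana_proxy` values, whereas the paged query returns distinct (`?europeana_proxy`, `?uri`, `?wduri`) combinations, so a proxy linked to several creators/contributors yields several rows.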

In [None]:
# Generate the iteration array of OFFSET values (one per page of 10000 rows),
# based on the number of Europeana proxies returned by the count query.
# numpy (np) must be imported in the import cell; this cell does not run without it.
n = int(EuropeanaProxyCount) / 10000
nr = round(n, 0)
print(nr)
# np.arange keeps an integer dtype even when the page count is 0, so the
# offsets never turn into floats such as 230000.0 inside the query text.
n2 = np.arange(int(nr)) * 10000
# The paged query returns more rows than the proxy count suggests, so further
# offsets are appended by hand.
# NOTE(review): round() can round down and these offsets are tied to one specific
# count; re-check both whenever the count query or its filter changes.
n2 = np.append(n2, [230000, 240000, 250000, 260000])
#n2 = np.append(n2, [730000, 740000, 750000, 760000])
n2

23.0


array([     0,  10000,  20000,  30000,  40000,  50000,  60000,  70000,
        80000,  90000, 100000, 110000, 120000, 130000, 140000, 150000,
       160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000,
       240000, 250000, 260000])

**Generate CSV files for CH objects/URIs (10000 per file). Caution: long processing time!**

In [None]:
# Test with 3 iterations
#n2 = np.array([740000, 750000, 760000])

# Folder that receives one CSV file per page. It is created up front so that
# to_csv() cannot fail on a missing directory after a long-running query.
uri_list_dir = filelocation_google + 'EuropeanaObjectURIlist/'
os.makedirs(uri_list_dir, exist_ok=True)

# Iterate over the expected number of pages
for n in n2:
  print('Query ' + str(n) + ' started')
  URI_list_df = first_query(n)
  # Save the URI list of this page as a CSV file named after its offset
  filename = 'URI_list_df_WikidataContributor_' + str(n) + '.csv'
  URI_list_df.to_csv(uri_list_dir + filename)

  # Random pause before the next query/iteration
  time.sleep(randint(1,2))

## 2nd Queries to fetch CH object metadata (Iterate over URI list from above)



### Function for 2nd query for 1 record (HTTP request)

In [None]:
# Sample URI for trying out the function defined in this cell. Any Europeana URI can be specified
# (i.e. http://data.europeana.eu/item/ or http://data.europeana.eu/proxy/europeana/)
#europeanaURI = 'http://data.europeana.eu/item/2023859/_http___keptar_oszk_hu_025900_025984__'
europeanaURI = 'http://data.europeana.eu/proxy/europeana/2020903/KMS1811'
#europeanaURI = 'http://data.europeana.eu/proxy/provider/2032004/20270'
#europeanaURI = 'http://data.europeana.eu/proxy/europeana/2026116/Partage_Plus_ProvidedCHO_Bildarchiv_Foto_Marburg_obj_20184057_LA_5_957_15a'

def second_query4(europeanaURI, timeout=60):
  """Fetch the Turtle representation of one Europeana resource over HTTP.

  Args:
    europeanaURI: URI to request (e.g. http://data.europeana.eu/proxy/europeana/...).
    timeout: Seconds to wait for the server before giving up. Without a timeout
      a single stalled connection would block a batch run forever.

  Returns:
    The response body as a string, or None when the request fails (HTTP error
    status, connection problem or timeout). The error is printed in that case.
  """
  # Ask the server for Turtle through content negotiation
  headers = {
    'Accept': 'text/turtle',
    'Content-type': 'text/turtle'
  }
  try:
    r = requests.get(europeanaURI, headers=headers, timeout=timeout)
    r.raise_for_status()
  except requests.exceptions.RequestException as e:
    # RequestException also covers connection errors and timeouts, so one bad
    # record is reported and skipped instead of aborting the whole batch.
    print(e)
    return None

  # For text/* responses without a charset, requests falls back to ISO-8859-1,
  # while Turtle documents are UTF-8; decode as UTF-8 unless the server says otherwise.
  if 'charset' not in r.headers.get('Content-Type', '').lower():
    r.encoding = 'utf-8'
  return r.text

# Try the function with the sample URI and show what comes back
turtle = second_query4(europeanaURI)
print(turtle)

### Create a list of files in the CSV folder

In [None]:
# Folder that holds the CSV files with the URI lists
dir_path = filelocation_google + 'EuropeanaObjectURIlist'
# Keep only the entries of that folder that are files, sorted by name
list_file = sorted(
    entry
    for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
)
print(list_file)

### 2nd Queries to fetch Europeana CH object metadata by HTTP request (Caution: long processing time!)

In [None]:
# Test with 1 iteration
#list_file = list_file[-2:]
print(list_file)

# Folder for the downloaded Turtle files. It is created up front so that
# saving cannot fail on a missing directory in the middle of a batch.
turtle_dir = filelocation_google + 'EuropeanaObjectTurtle/'
os.makedirs(turtle_dir, exist_ok=True)

for i, n in enumerate(list_file):
  print('Processing ' + str(n) + ' started as no ' + str(i))
  URI_list_df = pd.read_csv(filelocation_google + 'EuropeanaObjectURIlist/' + n)
  list_Europeana_proxy = URI_list_df['Europeana_proxy'].to_list()
  list_Wikidata_URI = URI_list_df['Wikidata_URI'].to_list()

  # Iterate over the URI list; both lists come from the same rows, so they pair up 1:1
  for item, wikidata_uri in zip(list_Europeana_proxy, list_Wikidata_URI):
    print('Processing ' + str(item) + ' started')

    # The file is named after the last path segment of the Wikidata URI.
    # NOTE(review): several Europeana proxies can share one Wikidata URI, so only
    # the first of them ends up on disk - confirm that this is intended.
    filename = wikidata_uri.rsplit('/', 1)[1]
    path_to_file = turtle_dir + str(filename) + '.ttl'

    # Check for an existing file BEFORE the HTTP request: an already saved
    # record costs neither a request nor a pause, which makes resuming an
    # interrupted batch much faster.
    if Path(path_to_file).is_file():
      print(f'The file {path_to_file} already exists, thus no need to save and download to local')
      print('Processing ' + str(item) + ' completed')
      print('-----------------')
      continue

    turtle_europeana = second_query4(item)
    # Check if turtle file was fetched
    if turtle_europeana is not None:
      print(filename + ' is fetched')
      # Saving as Turtle in GoogleDrive and local machine
      with open(path_to_file, 'w', encoding='utf-8') as f:
        f.write(turtle_europeana)
      files.download(path_to_file)
      print(f'The file {path_to_file} does not exist, thus saved and download to local')
    else:
      print('No Turtle can be fetched from ' + str(item))
    print('Processing ' + str(item) + ' completed')
    print('-----------------')
    # Random pause before the next query/iteration
    time.sleep(randint(1,3))

  print('-----------------')
  print(n + ' is finished: ' + str(len(list_Europeana_proxy)) + ' items are processed')

# Code to avoid 90 min limit of Google Colab

In [None]:
# Click this cell after starting the batch processing above.
# The loop never finishes on its own and is meant to keep the session occupied;
# stop it with "Interrupt execution". Sleeping between iterations avoids
# pinning a CPU core the way a bare `while True: pass` does.
while True:
  time.sleep(60)