# Create Sample Wikidata

* Translate Stream Updater JSON to SPARQL UPDATE (INSERT/DELETE DATA) statements
  * To be used in testing of the stress test infrastructure and process
  * Saved as sparql-update.txt
* Capture the earliest (first) triple for each deleted subject/predicate pair in a new file, deleted-triples.nt
* Reverse the INSERT/DELETE sequence to completely restore a Wikidata RDF load
* Transform the CSV data used in testing the query analysis infrastructure to N-Triples
  * 3 files were provided with entities from the human, scholarly articles, taxon, gene and film subgraphs
* Manually add the results from translating the 3 files above to the deleted-triples.nt file, to create wikidata-subset.nt

In [1]:
import json
import csv
import queue

## Process the Streaming Updater JSON

In [2]:
# Take dump of Stream Updater data and convert to SPARQL INSERT/DELETE statements
#
# Input:  wikidata_update_stream_6k_edits_20220531.ndjson (one JSON object per line)
# Output: sparql-update.txt (one "INSERT DATA { ... }" or "DELETE DATA { ... }" request per line)
#
# Both files are opened explicitly as UTF-8 so that non-ASCII labels and descriptions
# are handled the same way regardless of the platform's default locale encoding.
#
# Entry fields, in the order they are emitted for each entry, and the request they become.
# rdf_unlinked_data is deliberately ignored.
FIELD_TO_REQUEST = (
    ("rdf_added_data", "INSERT DATA { "),
    ("rdf_deleted_data", "DELETE DATA { "),
    ("rdf_linked_data", "INSERT DATA { "),
)

with open("wikidata_update_stream_6k_edits_20220531.ndjson", "r", encoding="utf-8") as update_data, \
        open('sparql-update.txt', 'w', encoding="utf-8") as sparql_update:
    for line in update_data:
        if not line.strip():   # Tolerate blank lines (e.g. a trailing one) instead of failing in json.loads
            continue
        entry = json.loads(line)
        if entry["operation"] == "reconcile":   # Reconciles are not relevant at this time
            continue
        for field, request_prefix in FIELD_TO_REQUEST:
            if field not in entry:
                continue
            # Flatten the RDF payload onto a single line so that each request is exactly one line
            payload = entry[field]["data"].replace("\n", " ").replace("\t", "").strip()
            sparql_update.write(request_prefix + payload + "}\n")
        # Note that operations with sequence_lengths different than 1 are not treated differently
        # All information in the JSON output is written as complete triples

In [3]:
# Add first occurrence of each "deleted" subj/predicate triple to an output file
# May miss a few, unique triples if the object can be multi-valued
#
# Input:  sparql-update.txt (one INSERT DATA / DELETE DATA request per line)
# Output: deleted-triples.nt (one line per distinct subject/predicate pair, first occurrence wins)
#
# Parsing notes:
# * The predicate is taken to be the term that follows the subject (or that starts a
#   predicate-list clause), so text such as " a " inside a literal is never mistaken
#   for the rdf:type shorthand.
# * Statement (" . ") and predicate-list (" ; ") separators are only honoured outside
#   double-quoted literals, and the request payload is everything between the first "{"
#   and the last "}" so braces inside literals do not truncate it.
# * The Turtle shorthand "a" is written out as the full rdf:type IRI, because N-Triples
#   has no such shorthand.
# * Object lists ("o1 , o2") are not expanded; they are written as they appear.
# * Single-quoted literals are not recognised by the separator handling.
RDF_TYPE = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"


def _split_outside_literals(text, separator):
    """Split text on separator, ignoring matches inside double-quoted literals."""
    pieces = []
    piece_start = 0
    in_literal = False
    pos = 0
    while pos < len(text):
        char = text[pos]
        if in_literal and char == "\\":
            pos += 2   # Skip the escaped character (e.g. \" inside a literal)
            continue
        if char == '"':
            in_literal = not in_literal
        elif not in_literal and text.startswith(separator, pos):
            pieces.append(text[piece_start:pos])
            pos += len(separator)
            piece_start = pos
            continue
        pos += 1
    pieces.append(text[piece_start:])
    return pieces


def _pop_term(text):
    """Return (first space-delimited term, remaining text), both stripped of outer whitespace."""
    term, _, rest = text.strip().partition(" ")
    return term, rest.strip()


def _first_deleted_triples(update_lines):
    """Map (subject, predicate) to the first triple deleted for that pair.

    update_lines is an iterable of request lines; only lines starting with
    "DELETE " are examined.  The returned dict preserves first-seen order.
    """
    first_seen = {}
    for line in update_lines:
        if not line.startswith("DELETE "):   # Only processing DELETED triples
            continue
        opening = line.find("{")
        closing = line.rfind("}")
        if opening == -1 or closing < opening:   # Not a "DELETE DATA { ... }" request
            continue
        del_data = line[opening + 1:closing]
        for statement in _split_outside_literals(del_data, " . "):
            statement = statement.strip()
            if statement.endswith(" ."):   # The last statement keeps its terminator
                statement = statement[:-2]
            subj, predicate_list = _pop_term(statement)
            for clause in _split_outside_literals(predicate_list, " ; "):
                pred, obj = _pop_term(clause)
                if not pred or not obj:   # Empty or incomplete clause: nothing to record
                    continue
                if pred == "a":
                    pred = RDF_TYPE
                # setdefault keeps the earliest triple for each subject/predicate pair
                first_seen.setdefault((subj, pred), f"{subj} {pred} {obj} .")
    return first_seen


with open('sparql-update.txt', 'r', encoding="utf-8") as sparql_update:
    triples = _first_deleted_triples(sparql_update)
with open("deleted-triples.nt", "w", encoding="utf-8") as new:
    new.writelines(f"{triple}\n" for triple in triples.values())

In [4]:
# To account for re-adding any new triples or deleting any inserted ones
# When operating with a complete dump of the Wikidata RDF
# Process the Updater's INSERTs/DELETEs in reverse order, and also reverse the requests such that INSERTs become DELETEs and vice-versa
#
# Input:  sparql-update.txt (one INSERT DATA / DELETE DATA request per line)
# Output: restore_wikidata.txt (the same requests, last first, each one inverted)
def _invert_request(line):
    """Swap the leading INSERT DATA / DELETE DATA keyword of one request line.

    Only the keyword at the start of the line is changed, so the same words
    appearing inside a literal in the payload are left alone.  Lines that start
    with neither keyword are returned unchanged.
    """
    for keyword, opposite in (("DELETE DATA", "INSERT DATA"), ("INSERT DATA", "DELETE DATA")):
        if line.startswith(keyword):
            return opposite + line[len(keyword):]
    return line


with open('sparql-update.txt', 'r', encoding="utf-8") as sparql_update:
    # A final line without a newline would otherwise be glued to the next request once reversed
    inverted_requests = [
        _invert_request(line if line.endswith("\n") else line + "\n")
        for line in sparql_update
    ]
with open("restore_wikidata.txt", "w", encoding="utf-8") as new:
    new.writelines(reversed(inverted_requests))

## Process the Sample RDF Used for Query Analysis Code Testing

In [5]:
# Convert CSV from the test infrastructure for query analyses, into triples
#
# For each <name>.csv a <name>.nt file is written with one "subject predicate object ." line
# per CSV row.  The CSV must have 'subject', 'predicate' and 'object' columns; their values
# are written verbatim, so they have to be serialized RDF terms already.
#
# The output is written as UTF-8.  The input is read as 'utf-8-sig', which accepts UTF-8
# with or without a byte-order mark (a BOM would otherwise become part of the first
# column name) -- NOTE(review): assumes the CSV files are UTF-8; confirm against their source.
for input_file in ('subgraphs', 'scholarly_articles', 'scholarly_articles_and_authors'):
    with open(f'{input_file}.csv', newline='', encoding='utf-8-sig') as csvfile, \
            open(f'{input_file}.nt', 'w', encoding='utf-8') as wikidata:
        wikidata.writelines(
            f"{row['subject']} {row['predicate']} {row['object']} .\n"
            for row in csv.DictReader(csvfile)
        )