# Create Sample Wikidata

* Translate Stream Updater JSON to SPARQL UPDATE (INSERT/DELETE DATA) statements
  * To be used in stress testing
  * Saved as sparql-update.ru
* Capture the earliest (first) triple for each deleted subject/predicate pair in a new file, deleted-triples.nt
* Create a subset of Wikidata using subgraphs_5.csv, a test set used in query analyses
  * The CSV data is used to create an initial, "small" set of triples for Wikidata compliance testing
  * Saved as query-triples.nt
* Manually add the triples from deleted-triples.nt to query-triples.nt to create wikidata-subset.nt

In [1]:
import json
import csv

In [2]:
# Take dump of Stream Updater data and convert to SPARQL INSERT/DELETE statements.
# Every NDJSON entry yields up to three single-line requests in sparql-update.ru,
# written in the order listed below. rdf_unlinked_data is ignored.
# Note that operations with sequence_lengths different than 1 are not treated differently;
# all information in the JSON output is written as complete triples.
rdf_sections = (
    ("rdf_added_data", "INSERT"),
    ("rdf_deleted_data", "DELETE"),
    ("rdf_linked_data", "INSERT"),
)

# The encoding is given explicitly: json.loads() turns \u escapes into real characters,
# so relying on the platform default encoding can fail (or mis-encode) on non-UTF-8 locales.
with open("wikidata_update_stream_6k_edits_20220531.ndjson", "r", encoding="utf-8") as update_data:
    with open('sparql-update.ru', 'w', encoding="utf-8") as sparql_update:
        for line in update_data:   # One JSON entry per line
            if not line.strip():   # Tolerate blank lines instead of failing in json.loads
                continue
            entry = json.loads(line)
            if entry["operation"] == "reconcile":   # Reconciles are not relevant at this time
                continue
            for section, verb in rdf_sections:
                if section not in entry:
                    continue
                # Flatten the RDF payload so that each request occupies exactly one line
                data = entry[section]["data"].replace("\n", " ").replace("\t", "").strip()
                sparql_update.write(f"{verb} DATA {{ {data}}}\n")

In [3]:
# Capture the earliest (first) deleted triple for each subject/predicate pair and
# save the results, one statement per line, as deleted-triples.nt.
# Relies on the layout of sparql-update.ru: one "INSERT DATA { ... }" or
# "DELETE DATA { ... }" request per line, with statements separated by " . " and
# subject and predicate both written as <IRI>s.
triples = dict()   # (subject IRI, predicate IRI) -> first deleted statement seen
with open('sparql-update.ru', 'r', encoding='utf-8') as sparql_update:
    for line in sparql_update:
        if line.startswith("INSERT "):
            continue
        # Keep everything after the FIRST "{" so a brace inside a literal cannot truncate the request
        ttl = line.partition("{")[2].strip()
        if ttl.endswith("}"):
            ttl = ttl[:-1]   # Drop only the request's closing brace; braces inside literals stay
        for statement in ttl.split(" . "):
            statement = statement.strip()
            if not statement:   # Empty request body ("DELETE DATA { }") or blank line
                continue
            spo = statement.split("<")
            if len(spo) < 3:
                # Fail with the offending text rather than a bare IndexError
                raise ValueError(f"Cannot find subject and predicate IRIs in: {statement!r}")
            subj = spo[1].split(">")[0]
            pred = spo[2].split(">")[0]
            # A tuple key avoids accidental collisions from concatenating the two IRIs
            if (subj, pred) not in triples:
                # The last statement of a request still carries its "." terminator;
                # the separator was consumed by the split for all the others
                triples[(subj, pred)] = statement if statement.endswith(".") else f"{statement} ."
with open("deleted-triples.nt", "w", encoding="utf-8") as new:
    new.writelines(f"{statement}\n" for statement in triples.values())

In [5]:
# Convert CSV from query analyses work into N-Triples (query-triples.nt).
# Each row's 'subject', 'predicate' and 'object' columns are written verbatim as one
# "subject predicate object ." line, so the CSV values must already be in N-Triples
# term syntax.
# NOTE(review): assumes the CSV export is UTF-8 - confirm. The output is written as
# UTF-8 because that is the encoding N-Triples documents are defined to use.
with open('subgraphs_5.csv', newline='', encoding='utf-8') as csvfile:
    with open('query-triples.nt', 'w', encoding='utf-8') as wikidata:
        reader = csv.DictReader(csvfile)
        wikidata.writelines(
            f"{row['subject']} {row['predicate']} {row['object']} .\n" for row in reader
        )