## Making TSVs from ~200 individuals that are both in Enslaved and Wikidata.

### Each TSV should contain the triples from Wikidata as:
Subject ->  Predicate   -> Object


In [13]:
import json
import tqdm
import os
import requests

In [14]:
# Dictionary to store found QIDs to avoid redundant API calls
foundQIDs = {}
QIDCount = {}

# Dictionary to store found Props to avoid redundant API calls
foundProps = {}
PropsCount = {}

# Stores any errors to then print out at the end
errors = []

#Base Query URL for  Wikidata
baseURL = "https://wikidata.org/w/rest.php/wikibase/v0"

In [15]:
# Queries WikidataProps.json to find the property label
def findQID(str):
    if str in foundQIDs:
        #print(f"Found {str}: {foundQIDs[str]}")
        QIDCount.update({str: QIDCount.get(str, 0) + 1})
        return foundQIDs[str]
    else:
        endURL = f"/entities/items/{str}/labels/en"
        response = requests.get(baseURL + endURL)
        if response.status_code != 200:
            errors.append(f"Error: {response.status_code} for {str}")
            return None
        else:
            label = response.text.replace("\"", "")
            foundQIDs.update({str: label})
            QIDCount.update({str: 0})
            return label

In [16]:
# Queries WikidataProps.json to find the property label
def findProperty(str):
    if str in foundProps:
#       #print(f"Found Property {str} in foundProps: {foundProps[str]}")
        PropsCount.update({str: PropsCount.get(str, 0) + 1})
        return foundProps[str]
    else:
        endURL = f"/entities/properties/{str}/labels/en"
        response = requests.get(baseURL + endURL)
        if response.status_code != 200:
            errors.append(f"findProperty Error: {response.status_code} for {str}")
            return None
        else:
            label = response.text.replace("\"", "")
            foundProps.update({str: label})
            PropsCount.update({str: 0})
 #          #print(foundProps[str])
            return label

In [17]:
# Dictionary to store the statement nodes that contain references and qualifiers
nodes = {}

# Stores any errors to then print out at the end
errors = []

# Processes statements that start with WDQ:
def processWD_Q(chunk):
    subject = findQID(chunk.split(" ")[0].split(":")[1])
    predicate = chunk.split(" ")[1]
    if predicate.startswith("p:") | predicate.startswith("wdt:p"):
        predicate = findProperty(predicate.split(":")[1])
    object = chunk.split(" ")[2]
    if object.startswith("s:"):
        statementNodes.append(chunk.split("s:")[1])
    new_row = {"subject": subject, "predicate": predicate, "object": object}
    statementList.append(new_row)
    #print(new_row)

# Processes statements that start with WDQ:
def processWDT_P(chunk):
    subject = chunk.split(" ")[0]
    subject = findProperty(subject.split(":")[1])
    predicate = chunk.split(" ")[1]
    if predicate.startswith("p:") or predicate.startswith("wdt:"):
        predicate = findProperty(predicate.split(":")[1])
    new_row = {"subject": subject, "predicate": predicate, "object": object}
    statementList.append(new_row)
    #print(new_row)

def processS(chunk):
    statementID = chunk.split(";")[0].split(" ")[0]
    statementProp = findProperty(chunk.split(";")[2].split(" ")[2].split(":")[1])
    #print(statementProp)
    #print(chunk))
    statementValue = chunk.split(";")[2].split(" ")[3]
    if statementValue.startswith("wd:"):
        statementValue = findQID(statementValue.split(":")[1])
    #print(f"{statementProp} {statementValue}")
    for statement in statementList:
        if statement["object"] == statementID:
            statement["object"] = statementValue

# Processes statements that start with S:
# def processStatementNode(nodeID):
#    statementProp = findProperty(chunk.split(";")[2].split(" ")[2].split(":")[1])
#    #print(statementProp)
#    #print(chunk))
#    statementValue = chunk.split(";")[2].split(" ")[3]
#    if statementValue.startswith("wd:"):
#        statementValue = findQID(statementValue.split(":")[1])
#    #print(f"{statementProp} {statementValue}")
#    for statement in statementList:
#        if statement["object"] == statementID:
#            statement["object"] = statementValue


# Processes references (not implemented yet)
#def processRef(chunk):
#   #print(chunk)


# Processes chunks based on their statement type
def processChunk(chunk):
    chunk = chunk.replace("\n", " ").replace("\t", " ")
    if (
        chunk.startswith(" <https://")
        | chunk.startswith(" data:")
        | chunk.startswith("@prefix")
        | chunk.startswith("xsd:")
        | chunk.startswith("ontolex:")
        | chunk.startswith("dct:")
        | chunk.startswith("rdfs:")
        | chunk.startswith("owl:")
        | chunk.startswith("skos:")
        | chunk.startswith("schema:")
        | chunk.startswith("cc:")
        | chunk.startswith("geo:")
        | chunk.startswith("prov:")
        | chunk.startswith("data")
        | chunk.startswith("<")
    ):
        pass
    else:
        chunk = chunk.lstrip()
        output.write(chunk + "\n")
        if chunk.startswith("wd:Q"):
            processWD_Q(chunk)
        if chunk.startswith("wdt:P"):
            processWDT_P(chunk)
        if chunk.startswith("s:"):
            statementNodes.append(chunk)
            processS(chunk)
        if chunk.startswith("ref:"):
            referenceNodes.append(chunk)
        if chunk.startswith("v:"):
            valueNodes.append(chunk)
        else:
            pass

ttl_files_dir = "./TTLFiles/"

# Processes each file in the directory, line by line, and writes the results to a TSV file
for filename in tqdm.tqdm((os.listdir(ttl_files_dir))):
#for filename in os.listdir(ttl_files_dir):
    statementList = []
    chunkList = []
    statementNodes = []
    referenceNodes = []
    valueNodes = []

    if filename.endswith(".ttl"):
        file_path = os.path.join(ttl_files_dir, filename)
        with open(file_path, "r") as file:
            chunk = ""
            name = filename.split(".")[0].replace(" ", "_")
            output = open(f"./ChunkOutput/v5-{name}-chunkOutput.txt", "a")

            # Iterate over each line in the file
            for line in file:
                # Append the line to the current chunk
                chunk += line

                # Check if the line ends with a '.'
                if line.strip().endswith("."):
                    # Process the chunk
                    processChunk(chunk)
                    # Reset the chunk
                    chunk = ""

        with open(f"./TSVFilesV6/{name}.tsv", "w") as tsvFile:
            name = statementList[0]["subject"]
            for statement in statementList:
                #print(statement)
                if statement["subject"] == name:
                    tsvFile.write(
                        f"{statement['subject']}\t{statement['predicate']}\t{statement['object']}\n"
                    )
    output.close()
    tsvFile.close()
    file.close()
    #for node in statementNodes:
    #   #print(node)

with open("properties.txt", "w") as propOutput:
    for prop in sorted(foundProps):
        propOutput.write(f"{prop}\t{foundProps[prop]}\t{PropsCount[prop]}\n")
    propOutput.close()
    
with open("QIDs.txt", "w") as qidOutput:
    for qid in sorted(foundQIDs):
        qidOutput.write(f"{qid}\t{foundQIDs[qid]}\t{QIDCount[qid]}\n")
    qidOutput.close()
    
with open("errors.txt", "w") as file:
    file.write(f"{filename} Errors:" + "*"*50 + "\n\n")
    for error in errors:
        file.write(error + "\n")
    file.write("\n\n")
    file.close()

 32%|███▏      | 43/133 [03:59<08:20,  5.56s/it]


IndexError: list index out of range