## Making TSVs from Enslaved TTL Files for individuals that exist on both Enslaved and Wikipedia

### Each TSV should contain the triples extracted from the Enslaved TTL file, one per line and tab-separated, as:
Subject -> Predicate -> Object


In [32]:
import json
import tqdm
import os
import requests
from icecream import ic

In [33]:
# Label cache for entity IDs (ID -> English label) so each ID is fetched at most once
foundQIDs = {}
# Per-ID counter: set to 0 on the first fetch and incremented on every cache hit
QIDCount = {}

# Label cache for property IDs, pre-seeded from a local two-column, tab-separated file
props = {}
with open("enslavedPropList.tsv", "r") as propFile:
    for rawLine in propFile:
        # Column 0 is the property ID, column 1 its label; a line with fewer
        # than two columns raises IndexError.
        fields = rawLine.strip().split("\t")
        props[fields[0]] = fields[1]
# Per-property counter of lookups answered from the cache
PropsCount = {}
ic(len(props))

# Error messages collected during the run, written out at the end
errors = []

# Base URL of the Enslaved wbgetentities endpoint; an entity or property ID is appended to it
baseURL = "https://lod.enslaved.org/w/api.php?action=wbgetentities&format=json&ids="

# Smoke test: fetch and display the English label of P31
ic(requests.get(baseURL + "P31").json()["entities"]["P31"]["labels"]["en"]["value"])

ic| len(props): 83
ic| requests.get(baseURL + "P31").json()["entities"]["P31"]["labels"]["en"]["value"]: 'hasSex'


'hasSex'

In [26]:
# Resolves an entity ID to its English label via the Enslaved API, with caching
def findQID(str):
    """Return the English label for the entity ID ``str``.

    Cached IDs are answered from ``foundQIDs`` and their ``QIDCount`` entry is
    incremented. Otherwise the ID is appended to ``baseURL`` and fetched:

    * a non-200 response appends a message to ``errors`` and returns ``None``;
    * a 200 response is parsed as JSON, the label is stored in ``foundQIDs``,
      the ``QIDCount`` entry is initialised to 0, and the label is returned.

    A 200 response whose JSON lacks ``entities[str].labels.en.value`` raises
    (e.g. ``KeyError``); that exception is not handled here.
    """
    # Cache hit: count it and skip the network round trip.
    if str in foundQIDs:
        QIDCount[str] = QIDCount.get(str, 0) + 1
        return foundQIDs[str]

    reply = requests.get(f"{baseURL}{str}")
    if reply.status_code != 200:
        errors.append(f"Error: {reply.status_code} for {str}")
        return None

    entityLabel = reply.json()["entities"][str]["labels"]["en"]["value"]
    foundQIDs[str] = entityLabel
    QIDCount[str] = 0
    return entityLabel

In [34]:
# Resolves a property ID to its English label via the Enslaved API, with caching
def findProperty(str):
    """Return the English label for the property ID ``str``.

    IDs already present in ``props`` are answered from it and their
    ``PropsCount`` entry is incremented. Otherwise the ID is appended to
    ``baseURL`` and fetched:

    * a non-200 response appends a message to ``errors`` and returns ``None``;
    * a 200 response is parsed as JSON, the label is stored in ``props``, the
      ``PropsCount`` entry is initialised to 0, and the label is returned.

    A 200 response whose JSON lacks ``entities[str].labels.en.value`` raises
    (e.g. ``KeyError``); that exception is not handled here.
    """
    # Known property: count the lookup and return the stored label.
    if str in props:
        PropsCount[str] = PropsCount.get(str, 0) + 1
        return props[str]

    reply = requests.get(f"{baseURL}{str}")
    if reply.status_code != 200:
        errors.append(f"findProperty Error: {reply.status_code} for {str}")
        return None

    propertyLabel = reply.json()["entities"][str]["labels"]["en"]["value"]
    props[str] = propertyLabel
    PropsCount[str] = 0
    return propertyLabel

In [39]:

# Dictionary to store the statement nodes that contain references and qualifiers
# NOTE(review): nothing visible in this file writes to or reads from `nodes` — confirm it is still needed.
nodes = {}

# Stores any errors to then print out at the end
# NOTE: this rebinds `errors` to a fresh empty list, so messages appended to
# the previous list before this cell runs are no longer reachable through it.
errors = []

# Processes chunks whose subject is an entity (processChunk routes "ed:Q..." chunks here)
def processED_Q(chunk):
    """Turn a space-separated entity chunk into a row appended to ``statementList``.

    The first three space-separated tokens are read as subject, predicate and
    object:

    * the subject's part after the first ":" is resolved with ``findQID``;
    * a predicate starting with "ep:" or "edt:p" has its part after the first
      ":" resolved with ``findProperty``; any other predicate is kept as-is;
    * the object token is stored unchanged.

    Raises ``IndexError`` when the chunk has fewer than three tokens or the
    subject contains no ":".
    """
    tokens = chunk.split(" ")
    subject = findQID(tokens[0].split(":")[1])

    predicate = tokens[1]
    # NOTE(review): "edt:p" is lowercase, so it does not match "edt:P..."
    # predicates; processEDT_P tests "edt:" instead — confirm which is intended.
    if predicate.startswith(("ep:", "edt:p")):
        predicate = findProperty(predicate.split(":")[1])

    value = tokens[2]
    # NOTE(review): processChunk recognises statement nodes by "eds:", while
    # this checks "wds:" — confirm the prefix.
    if value.startswith("wds:"):
        statementNodes.append(chunk.split("wds:")[1])

    statementList.append(
        {"subject": subject, "predicate": predicate, "object": value}
    )

# Processes chunks whose subject is a property (processChunk routes "edt:P..." chunks here)
def processEDT_P(chunk):
    """Turn a space-separated property chunk into a row appended to ``statementList``.

    The first three space-separated tokens are read as subject, predicate and
    object:

    * the subject's part after the first ":" is resolved with ``findProperty``;
    * a predicate starting with "ep:" or "edt:" has its part after the first
      ":" resolved with ``findProperty``; any other predicate is kept as-is;
    * the object is the third token, or ``None`` when the chunk has only two.

    Raises ``IndexError`` when the chunk has fewer than two tokens or the
    subject contains no ":".
    """
    tokens = chunk.split(" ")
    subject = findProperty(tokens[0].split(":")[1])

    predicate = tokens[1]
    if predicate.startswith(("ep:", "edt:")):
        predicate = findProperty(predicate.split(":")[1])

    # The object used to be left unassigned, so the row silently stored the
    # builtin `object` class. Read it from the chunk, and guard the index so
    # that two-token chunks, which were accepted before, still are.
    value = tokens[2] if len(tokens) > 2 else None

    statementList.append(
        {"subject": subject, "predicate": predicate, "object": value}
    )

def processS(chunk):
    """Replace references to a statement node with that node's value.

    The chunk is split on ";". The statement ID is the first space-separated
    token of the first segment. The third segment is split on spaces: its
    token at index 2 is the property and its token at index 3 is the value.
    A value starting with "ed:" is resolved to a label with ``findQID``.

    Every row in ``statementList`` whose "object" equals the statement ID has
    that object overwritten in place with the value.

    Raises ``IndexError`` when the chunk does not have that shape.
    """
    segments = chunk.split(";")
    statementID = segments[0].split(" ")[0]

    valueTokens = segments[2].split(" ")
    # The label is not used below; the call is kept because findProperty
    # updates its cache and counters as a side effect.
    findProperty(valueTokens[2].split(":")[1])

    statementValue = valueTokens[3]
    if statementValue.startswith("ed:"):
        statementValue = findQID(statementValue.split(":")[1])

    for row in statementList:
        if row["object"] != statementID:
            continue
        row["object"] = statementValue

# Processes statements that start with S:
# def processStatementNode(nodeID):
#    statementProp = findProperty(chunk.split(";")[2].split(" ")[2].split(":")[1])
#    #print(statementProp)
#    #print(chunk))
#    statementValue = chunk.split(";")[2].split(" ")[3]
#    if statementValue.startswith("ed:"):
#        statementValue = findQID(statementValue.split(":")[1])
#    #print(f"{statementProp} {statementValue}")
#    for statement in statementList:
#        if statement["object"] == statementID:
#            statement["object"] = statementValue


# Processes references (not implemented yet)
#def processRef(chunk):
#   #print(chunk)


# Dispatches one chunk to the handler that matches its leading prefix
def processChunk(chunk):
    """Normalise a chunk, log it, and route it by prefix.

    Newlines and tabs are replaced with spaces. A chunk that then starts with
    one of the skipped prefixes is ignored. This test runs before leading
    whitespace is removed, so only the two entries that begin with a space
    can match a chunk that has leading whitespace.

    Any other chunk is left-stripped, written to ``output`` followed by a
    newline, and then handled according to its prefix:

    * "ed:Q"   -> ``processED_Q``
    * "edt:P"  -> ``processEDT_P``
    * "eds:"   -> appended to ``statementNodes``, then ``processS``
    * "edref:" -> appended to ``referenceNodes``
    * "edv:"   -> appended to ``valueNodes``

    Chunks with any other prefix are only written to ``output``.
    """
    skippedPrefixes = (
        " <https://",
        " edata:",
        "@prefix",
        "xsd:",
        "ontolex:",
        "dct:",
        "rdfs:",
        "owl:",
        "skos:",
        "schema:",
        "cc:",
        "geo:",
        "prov:",
        "data",
        "<",
    )

    chunk = chunk.replace("\n", " ").replace("\t", " ")
    if chunk.startswith(skippedPrefixes):
        return

    chunk = chunk.lstrip()
    output.write(chunk + "\n")

    # The prefixes are mutually exclusive, so at most one branch runs.
    if chunk.startswith("ed:Q"):
        processED_Q(chunk)
    elif chunk.startswith("edt:P"):
        processEDT_P(chunk)
    elif chunk.startswith("eds:"):
        statementNodes.append(chunk)
        processS(chunk)
    elif chunk.startswith("edref:"):
        referenceNodes.append(chunk)
    elif chunk.startswith("edv:"):
        valueNodes.append(chunk)

ttl_files_dir = "../EnslavedTTL/"

# Processes each .ttl file in the directory chunk by chunk and writes the
# triples of its first subject to a TSV file.
#
# processChunk and the process* helpers read and mutate the module-level names
# assigned in this loop (output, statementList, statementNodes, referenceNodes,
# valueNodes), so they are deliberately kept as globals and reset per file.
for filename in tqdm.tqdm(os.listdir(ttl_files_dir)):
    statementList = []
    chunkList = []
    statementNodes = []
    referenceNodes = []
    valueNodes = []

    if not filename.endswith(".ttl"):
        continue

    try:
        file_path = os.path.join(ttl_files_dir, filename)
        # splitext removes only the extension, so a name that itself contains
        # dots is no longer truncated at the first one.
        name = os.path.splitext(filename)[0].replace(" ", "_")

        # Both handles are managed by `with`, so they are closed on success as
        # well as on failure.
        # NOTE(review): the chunk log is opened in append mode, so re-running
        # adds to an existing log — confirm that is intended.
        with open(file_path, "r", encoding="utf-8") as file:
            with open(
                f"../EnslavedTSV/Chunks/{name}-chunks.txt", "a", encoding="utf-8"
            ) as output:
                chunk = ""
                for line in file:
                    # Accumulate lines until one ends with '.', then process
                    # the accumulated text as one chunk.
                    chunk += line
                    if line.strip().endswith("."):
                        processChunk(chunk)
                        chunk = ""

        # Without any parsed statement there is no subject to filter on;
        # record that explicitly instead of failing with an IndexError after
        # an empty TSV has already been created.
        if not statementList:
            message = f"Error with {filename}: no statements were parsed"
            ic(message)
            errors.append(message)
            continue

        # Only rows about the first statement's subject are exported.
        mainSubject = statementList[0]["subject"]
        with open(f"../EnslavedTSV/{name}.tsv", "w", encoding="utf-8") as tsvFile:
            for statement in statementList:
                if statement["subject"] == mainSubject:
                    tsvFile.write(
                        f"{statement['subject']}\t{statement['predicate']}\t{statement['object']}\n"
                    )
    except Exception as e:
        # Per-file boundary: log the failure and carry on with the next file.
        ic(f"Error with {filename}: {e}")
        errors.append(f"Error with {filename}: {e}")

# Disabled: PropsCount only has entries for properties that findProperty was
# asked for, so PropsCount[prop] raises KeyError for any other key of props.
#with open("properties.txt", "w") as propOutput:
#    for prop in sorted(props):
#        propOutput.write(f"{prop}\t{props[prop]}\t{PropsCount[prop]}\n")
#    propOutput.close()

# Dump every resolved entity ID with its label and cache-hit count.
with open("QIDs.txt", "w", encoding="utf-8") as qidOutput:
    for qid in sorted(foundQIDs):
        qidOutput.write(f"{qid}\t{foundQIDs[qid]}\t{QIDCount[qid]}\n")

# Dump all collected errors. The header names the input directory rather than
# the loop variable `filename`, which only held the last file listed and was
# undefined when the directory was empty.
with open("errors.txt", "w", encoding="utf-8") as file:
    file.write(f"{ttl_files_dir} Errors:" + "*" * 50 + "\n\n")
    for error in errors:
        file.write(error + "\n")
    file.write("\n\n")

  0%|          | 0/52 [00:00<?, ?it/s]

ic| f"Error with {filename}: {e}": 'Error with Prince_Estabrook.ttl: list index out of range'
100%|██████████| 52/52 [00:20<00:00,  2.54it/s]
