## Making TSVs from ~200 individuals that are both in Enslaved and Wikidata.

### Each TSV should contain the triples from Wikidata as:
Subject ->  Predicate   -> Object


In [None]:
# Install pandas and rdfpandas
%pip install pandas
%pip install rdfpandas

In [None]:
# Imports used by the TTL-to-TSV conversion below
import json
from wikidata.client import Client
import tqdm
import os

# Dictionary to store found QIDs to avoid redundant API calls
foundQIDs = {}

# Stores any errors to then print out at the end
errors = []


# Queries WikidataProps.json to find the property label
def findProperty(str):
    """Return the label stored for a Wikidata property ID.

    Args:
        str: Key to look up in WikidataProps.json (callers pass the part of a
            predicate after the prefix, e.g. the ``P31`` of ``wdt:P31``).

    Returns:
        The value stored under that key, or None when the key is absent.

    Raises:
        OSError: If WikidataProps.json cannot be opened on the first lookup.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    return _loadProperties().get(str)


# Holds the parsed contents of WikidataProps.json after the first lookup.
_propertyCache = {}


def _loadProperties():
    """Parse WikidataProps.json once and reuse the result on later calls."""
    if "properties" not in _propertyCache:
        # Explicit UTF-8: the platform default encoding may not be able to
        # decode a JSON file that contains non-ASCII text.
        with open("WikidataProps.json", encoding="utf-8") as file:
            _propertyCache["properties"] = json.load(file)
    return _propertyCache["properties"]


# Queries foundQIDs, if not found, queries the Wikidata API and stores the result in foundQIDs
def findQID(str):
    """Return the label of a Wikidata entity, caching results in foundQIDs.

    Args:
        str: Entity ID passed to the Wikidata client (e.g. ``Q42``).

    Returns:
        The ``label`` attribute of the fetched entity, or the previously
        cached value for this ID.
    """
    if str not in foundQIDs:
        # Cache miss: fetch the entity and remember its label for later calls.
        label = Client().get(str, load=True).label
        foundQIDs[str] = label
        return label

    print(f"Found {str}: {foundQIDs[str]}")
    return foundQIDs[str]


# Processes statements whose subject starts with wd:Q or wdt:P
def processWDQ(chunk):
    """Append the leading triple of a chunk to the global statementList.

    The chunk is split on single spaces and its first three tokens are taken
    as subject, predicate and object. The subject ID is resolved to a label
    with findQID. A predicate prefixed with ``p:`` or ``wdt:`` is resolved
    with findProperty. The object token is stored unchanged.

    Args:
        chunk: One statement with newlines and tabs already replaced by
            spaces and leading whitespace removed.

    Raises:
        IndexError: If the chunk has fewer than three space-separated tokens
            or the subject contains no ``:``.
    """
    tokens = chunk.split(" ")
    subjectToken, predicate, objectToken = tokens[0], tokens[1], tokens[2]

    subject = findQID(subjectToken.split(":")[1])

    if predicate.startswith(("p:", "wdt:")):
        label = findProperty(predicate.split(":")[1])
        # Keep the raw predicate when the property is unknown, so the output
        # shows the property ID instead of the text "None".
        if label is not None:
            predicate = label

    statementList.append(
        {"subject": subject, "predicate": predicate, "object": objectToken}
    )


# Processes statements that start with S:
def processS(chunk):
    """Replace statement-node references in statementList with their value.

    The chunk is split on ``;``. The first token of the first segment is the
    statement ID, and the fourth space-separated token of the third segment
    is taken as the statement's value. A value prefixed with ``wd:`` is
    resolved to a label with findQID. Every row in the global statementList
    whose object equals the statement ID then gets the value as its object.

    Args:
        chunk: One statement with newlines and tabs already replaced by
            spaces and leading whitespace removed.

    Raises:
        IndexError: If the chunk has fewer than three ``;`` segments or the
            third segment has fewer than four space-separated tokens.
    """
    segments = chunk.split(";")
    statementID = segments[0].split(" ")[0]

    # NOTE(review): only a single token is taken, so a literal value that
    # contains spaces is cut at its first space - confirm whether such
    # values occur in the input files.
    statementValue = segments[2].split(" ")[3]
    if statementValue.startswith("wd:"):
        statementValue = findQID(statementValue.split(":")[1])

    for statement in statementList:
        if statement["object"] == statementID:
            statement["object"] = statementValue


# Processes references (not implemented yet)
def processRef(chunk):
    """Print the given reference chunk; no further processing is done yet."""
    print(chunk)


# Processes chunks based on their statement type
def processChunk(chunk):
    """Dispatch one Turtle statement to the handler for its subject prefix.

    Newlines and tabs are first replaced by spaces. Chunks that begin with
    one of the ignored prefixes are dropped. The rest are left-stripped and
    routed: ``wd:Q`` and ``wdt:P`` go to processWDQ, ``s:`` goes to processS.
    Anything else is ignored.

    Args:
        chunk: The raw text of one statement, possibly spanning several lines.
    """
    # The ignore list is matched before leading whitespace is stripped, which
    # is why some entries start with a space.
    ignoredPrefixes = (
        " <https://",
        " data:",
        "@prefix",
        "xsd:",
        "ontolex:",
        "dct:",
        "rdfs:",
        "owl:",
        "skos:",
        "schema:",
        "cc:",
        "geo:",
        "prov:",
        "data",
        "<",
    )

    chunk = chunk.replace("\n", " ").replace("\t", " ")
    if chunk.startswith(ignoredPrefixes):
        return

    chunk = chunk.lstrip()
    if chunk.startswith(("wd:Q", "wdt:P")):
        processWDQ(chunk)
    elif chunk.startswith("s:"):
        processS(chunk)
    # Reference chunks ("ref:") are not handled yet; see processRef.

ttl_files_dir = "./TTLFiles/"
tsv_files_dir = "./TSVFiles/"

# Create the output folder up front; otherwise a missing folder makes every
# file fail when its TSV is opened for writing.
os.makedirs(tsv_files_dir, exist_ok=True)

# Processes each file in the directory, statement by statement, and writes the results to a TSV file
for filename in tqdm.tqdm(os.listdir(ttl_files_dir)):
    # processWDQ and processS read and modify this module-level list, so it
    # is reset for every file.
    statementList = []
    if not filename.endswith(".ttl"):
        continue

    try:
        file_path = os.path.join(ttl_files_dir, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            chunk = ""
            for line in file:
                chunk += line
                # A line ending with '.' closes the current statement.
                if line.strip().endswith("."):
                    processChunk(chunk)
                    chunk = ""

        # Record a clear error instead of creating an empty TSV and failing
        # on statementList[0].
        if not statementList:
            errors.append(f"{filename} no statements found")
            continue

        # splitext keeps dots inside the base name ("J. Doe.ttl" -> "J. Doe").
        name = os.path.splitext(filename)[0].replace(" ", "_")
        # Only rows about the first subject found in the file are written.
        main_subject = statementList[0]["subject"]
        tsv_path = os.path.join(tsv_files_dir, f"{name}.tsv")
        with open(tsv_path, "w", encoding="utf-8") as file:
            file.writelines(
                f"{statement['subject']}\t{statement['predicate']}\t{statement['object']}\n"
                for statement in statementList
                if statement["subject"] == main_subject
            )
    except Exception as e:
        # Best effort: remember the failure and carry on with the next file.
        errors.append(f"{filename} {e}")

# Prints errors
for error in errors:
    print(error)