This file contains code used to transform our knowledge graph into a form ready to train TransE in OpenKE.

In [3]:
'''
    Utils
'''
def stripURI(x):
    '''
        Return the part of x that follows its last "/".
        If x contains no "/", x is returned whole.
    '''
    _, _, tail = x.rpartition("/")
    return tail

def replaceUgly(df, ugly_tokens):
    '''
        Placeholder that is not implemented yet.
        Both arguments are ignored and None is returned.
    '''
    return None

def isNumber(x):
    '''
        Return True if float(x) succeeds, False otherwise.

        x :: any object; strings such as "3.5" count as numbers
    '''
    try:
        float(x)
    except (TypeError, ValueError):
        # ValueError: a string that does not parse as a number.
        # TypeError: an object float() cannot convert at all (None, list, ...).
        return False
    return True

In [11]:
'''
    Load in the knowledge graph
    create: entity2id, relation2id, train2id
'''

'''
    Create the RDF Triples w/o time or reification
'''
import rdflib
import xlrd
import pandas
from rdflib import URIRef, Literal, BNode,Namespace

# Raw triples read from the spreadsheet, stored column-wise
# (similar format to how the pandas data frame was made).
data = {
    "subject": [],
    "predicate": [],
    "object": [],
}
SS_name = "VMars Triples.xlsx"
# NOTE(review): xlrd 2.x reads only .xls files; opening this .xlsx presumably
# needs an older xlrd (< 2.0) -- confirm against the environment used.
wb = xlrd.open_workbook(SS_name)

# Every sheet is read the same way: columns 0, 1, 2 are subject, predicate, object.
for sheet_index in range(wb.nsheets):
    sheet = wb.sheet_by_index(sheet_index)
    for i in range(1, sheet.nrows):  # skip the first header line
        s = sheet.cell_value(i, 0)
        p = sheet.cell_value(i, 1)
        o = sheet.cell_value(i, 2)

        # If any cell is empty then there is no triple. The object is compared
        # against the empty value explicitly so that a numeric object of 0 is
        # kept (a plain truthiness test would drop it).
        if not s or not p or o is None or o == "":
            continue

        data["subject"].append(s)
        data["predicate"].append(p)
        data["object"].append(o)

    
# Cleaned copy of `data`, with the same three columns.
data_processed = {
    'subject': [], 'predicate': [], 'object': []
}
# Substrings to replace inside string cells: spaces become underscores,
# double quotes are dropped.
ugly_token = {
    ' ': '_',
    '"': '',
}

# replace all ugly tokens and copy to new data structure
for x in data:  # x is 'subject', 'predicate' or 'object'
    for item in data[x]:
        if not isinstance(item, str):
            # Non-string cells (e.g. numbers) are copied through unchanged.
            data_processed[x].append(item)
            continue
        # Stripping once up front is enough: none of the replacements below
        # can put whitespace back at either end of the token.
        new_token = item.strip()
        for k, replacement in ugly_token.items():
            new_token = new_token.replace(k, replacement)

        data_processed[x].append(new_token)

# Every subject, predicate and non-numeric object becomes a URI in this namespace.
n = Namespace("http://UCLA_REU_2020.org/Veronica_Mars/")

g = rdflib.Graph()
# Dictionary mapping each predicate to the number of times they have been used.
# Used for testing at some point; it is not populated in this cell.
pred_counts = {}

# Walk the three cleaned columns in lockstep, one (subject, predicate, object)
# row at a time.
for subj_name, pred_name, o_data in zip(
    data_processed['subject'],
    data_processed['predicate'],
    data_processed['object'],
):
    s = n[subj_name]
    p = n[pred_name]
    # Numeric objects are stored as literals; everything else is an entity URI.
    if isinstance(o_data, (int, float)):
        o_node = Literal(o_data)
    else:
        o_node = n[o_data]
    g.add((s, p, o_node))

In [7]:
# List of all the characters in the show. Characters are selected from this list for selecting testing data.
# Each name is its own element; a missing comma between two adjacent string
# literals would silently concatenate them into a single entry.
# NOTE(review): the name `hars` looks like a truncated `chars` -- confirm before renaming.
hars = [
    "Veronica Mars",
    "Weevil Navarro",
    "Wallace Fennel",
    "Celeste Kane",
    "Duncan Kane",
    "Logan Echolls",
    "Lilly Kane",
    "Jake Kane",
    "Keith Mars",
    "Lianne Mars",
    "Troy Vandegraff",
    "Caitlin Ford",
    "Chardo Navarro",
    "Leticia Navarro",
    "Luke Haldeman",
    "Aaron Echolls",
    "Meg Manning",
    "Dick Casablancas",
    "Casey's grandmother",
    "Connor Larkin",
    "Sean Friedrich",
    "Bone Hamilton",
    "Sabrina Fuller",
    "Hamilton Cho",
    "Jim Cho",
    "Mandy",
    "Hans",
]

In [12]:
'''
    Some triples are held out so that we can test how TransE performs at predicting them.
    Currently:
    Remove some proportion of the perpetrators
    Remove some proportion of the has_financial_status triples
'''
from random import randint


def remove_triples(g, all_trips, p):
    '''
        Removes a random proportion of a passed list of triples from an rdflib graph.
        The removed triples are output to stdout (each term reduced by stripURI)
        and the same graph object is returned.

        g :: rdflib graph of the veronica mars knowledge graph; only g.remove(triple) is called on it
        all_trips :: a list of all the triples which could be selected for removal.
                     NOTE: the chosen triples are popped from this list, so the
                     caller's list is modified in place.
        p :: proportion of all_trips to remove, 0 <= p <= 1;
             int(p * len(all_trips)) triples are removed

        Raises ValueError if p is outside [0, 1]. The check happens before
        all_trips or g is touched.
    '''
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1, got {}".format(p))

    numToRemove = int(p * len(all_trips))
    # Popping each pick guarantees that no triple is chosen twice.
    toRemove = [
        all_trips.pop(randint(0, len(all_trips) - 1))
        for _ in range(numToRemove)
    ]
    # output and remove the chosen triples
    for t in toRemove:
        print("{} {} {}".format(stripURI(t[0]), stripURI(t[1]), stripURI(t[2])))
        g.remove(t)
    return g

# Candidate triples for hold-out. Both lists are collected before anything is removed.
perps = [*g.triples((n['Perpetrator'], n['described_as'], None))]  # 19 perpetrators
fin_stat = [*g.triples((None, n['has_financial_status'], None))]
# Remove half of each candidate list from the graph, perpetrators first.
for held_out in (perps, fin_stat):
    g = remove_triples(g, held_out, 0.5)

30
Perpetrator described_as Aaron_Echolls
Perpetrator described_as Hans
Perpetrator described_as Sean_Friedrich
Perpetrator described_as Chardo_Navarro
Perpetrator described_as Kimmy
Perpetrator described_as Madison_Sinclair
Perpetrator described_as Catherina_Lenova
Perpetrator described_as Grant_Winters
Perpetrator described_as Pete
Sabrina_Fuller has_financial_status upper_class
Lilly_Kane has_financial_status upper_class
Mandy has_financial_status lower_class
Casey's_grandmother has_financial_status upper_class
Jim_Cho has_financial_status lower_class
Chardo_Navarro has_financial_status lower_class
Duncan_Kane has_financial_status upper_class
Mr_Gant has_financial_status upper_class
Lianne_Mars has_financial_status lower_class
Weevil_Navarro has_financial_status lower_class
Troy_Vandegraff has_financial_status upper_class
Mrs_Gant has_financial_status upper_class
Logan_Echolls has_financial_status upper_class
Celeste_Kane has_financial_status upper_class
Jake_Kane has_financial_stat

In [13]:
'''
    Generates the files required for OpenKE training. refer to OpenKE for the exact format
    entity2id, relation2id, train2id
'''

# Map each entity and predicate to a unique id. The entity mapping and predicate mapping are disjoint for OpenKE.
entity_id = {}
relation_id = {}

# Names are de-duplicated AFTER stripURI, because two different nodes can
# share the same last path segment; this keeps the count on the first line
# equal to the number of distinct names. Sorting makes the id assignment
# reproducible instead of depending on set iteration order.
all_entities = sorted({stripURI(x) for x in set(g.subjects()).union(g.objects())})
all_lines = [str(len(all_entities))]  # lines to be written to entity2id.txt; the first line is the count
for i, entity in enumerate(all_entities):
    entity_id[entity] = i
    all_lines.append("{} {}".format(entity, i))

all_relations = sorted({stripURI(x) for x in g.predicates()})
all_rel_lines = [str(len(all_relations))]  # lines to be written to relation2id.txt; the first line is the count
for i, rel in enumerate(all_relations):
    relation_id[rel] = i
    all_rel_lines.append("{} {}".format(rel, i))

# Collect every triple in the graph as stripped (subject, predicate, object) names,
# then render one line per triple in the order:
# s_id o_id p_id
# (the object id comes before the predicate id). The first line is the number of triples.
all_triples = [
    (stripURI(subj), stripURI(pred), stripURI(obj))
    for subj, pred, obj in g.triples((None, None, None))
]
all_triple_lines = [
    " ".join(map(str, (entity_id[subj], entity_id[obj], relation_id[pred])))
    for subj, pred, obj in all_triples
]
all_triple_lines = [str(len(all_triple_lines))] + all_triple_lines

# Write each list of lines to its OpenKE input file, newline-separated
# (no newline after the last line).
for out_name, out_lines in (
    ("entity2id.txt", all_lines),
    ("relation2id.txt", all_rel_lines),
    ("train2id.txt", all_triple_lines),
):
    with open(out_name, "w") as f:
        f.write('\n'.join(out_lines))
        