# Random Walk + Topic Modeling (Node Only)

In [1]:
import rdflib
import xlrd
import pandas
import random
from rdflib import URIRef, Literal, BNode, Namespace

data = { # similar format for how you made your pandas data fram
    "subject": [],
    "predicate": [],
    "object" : []
}
wb = xlrd.open_workbook("ATTWN triples.xlsx")
sheet = wb.sheet_by_index(0)
for i in range(1, sheet.nrows): # skip the first header line
       
    s = sheet.cell_value(i, 0)
    p = sheet.cell_value(i, 1)
    o = sheet.cell_value(i, 2)
    
    data["subject"].append(s)
    data["predicate"].append(p)
    #print(o)
    data["object"].append(o) 
  
data_processed = {
    'subject':[],'predicate':[],'object': []
}
ugly_token = {
    ' ': '_',
    '"': '',
}
# replace all ugly tokens and copy to new data structure
for x in data.keys(): #x is subj,obj,pred
    for item in data[x]:
        if type(item)!=str:
            data_processed[x].append(item)
            continue
        new_token=item       
        for k in ugly_token:
            new_token=new_token.replace(k, ugly_token[k]) 
            
        data_processed[x].append(new_token)            

n = Namespace("http://UCLA_REU_2020.org/ATTWN/")

g = rdflib.Graph()

for i in range(len(data['subject'])):
    s = n[data_processed['subject'][i]]
    p = n[data_processed['predicate'][i]]
    o_data=data_processed['object'][i]
    if type(o_data)==float or type(o_data)==int:
        o_node=Literal(o_data)
    else:
        o_node=n[o_data]
    g.add((s, p, o_node))

#check g
#for s, p, o in g:
#  print((s, p, o))

In [2]:
import matplotlib.pyplot as plt
import networkx as nx
%matplotlib qt
#%matplotlib inline
plt.figure(figsize=(20,20))

edgelabels={}
G = nx.Graph()
#plt.clf()
for i in range(len(data['object'])):
    v1 = data['subject'][i]
    v2 = data['object'][i]
    G.add_edge(v1,v2)
    e_lbl = data['predicate'][i]
    edgelabels[(v1, v2)] = e_lbl

#print(edge_labels)

pos = nx.spring_layout(G,k=0.15,iterations=20, scale=3)
nx.draw_networkx(G, pos=pos,font_size=8,node_color='pink')
nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edgelabels, font_size=7)

plt.show()

  if cb.is_numlike(alpha):


In [3]:

def stripURI(x):
    return x.split("/")[-1]
def random_walk_nondir(g, walk_length):
    '''
        Performs random walk over rdflib graph. Does not pay attention to direction of the knowledge graph.
        Usually in KG it is directed as subj-pred-obj
        Returns 2-tuple of lists (nodes_traversed, edges_traversed)
        g::rdflib graph
        walk_length::positive_int
    '''
    subjs = list(g.subjects())
    currentNode = random.choice(subjs)
    nodes_traversed = [stripURI(currentNode)]
    edges_traversed = []
    for i in range(walk_length):
        # get the possible nodes
        preds_objs = list(g.predicate_objects(subject = currentNode)) # (edge, node)
        subj_preds = list(g.subject_predicates(object = currentNode)) # (node, edge)
        # combine the two lists
        # list of (node, edge) tuples
        nodes_edges = [(t[0], t[1]) for t in subj_preds] + [(t[1], t[0]) for t in preds_objs]
        if len(nodes_edges) == 0: # nowhere to go, end walk
            break
        node, edge = random.choice(nodes_edges)
        nodes_traversed.append(stripURI(node))
        edges_traversed.append(stripURI(edge))
        currentNode = node
    return (nodes_traversed, edges_traversed)


In [26]:
#Here we consider the case where the random walks only extract the nodes
cleaned_data = []
for i in range(1000):
    rw_nodes, rw_edges = random_walk_nondir(g, 50)
    lmao = ' '.join(word for word in rw_nodes)
    cleaned_data.append(lmao)

In [27]:
#Tf-idf with NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

count_vectorizer = TfidfVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(cleaned_data)
idx_to_word = np.array(count_vectorizer.get_feature_names())

from sklearn.decomposition import NMF
nmf = NMF(n_components=5, solver="mu")
H = nmf.fit_transform(count_data)
W = nmf.components_
 
# print the topics
 
for i, topic in enumerate(W):
 
    print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: john_macarthur,ethel_rogers,thomas_rogers,lawrence_wargrave,emily_brent,william_blore,philip_lombard,vera_claythorne,edward_armstrong,past_crime
Topic 2: boy,lieutenant,defendant,suspect,employer,patient,maid,two_children,past_crime,had_killed
Topic 3: hatchetwound,sleep,crushed,shove,hanging,gunshot,blow_to_head,injection,drowning,died_by
Topic 4: tenth_soldier,bee_sting,sixth_soldier,big_bear_hugged_one,seventh_soldier,fourth_soldier,third_soldier,fifth_soldier,eighth_soldier,poem
Topic 5: pills,patient,check,lawrence_wargrave,poison,ten_little_soldiers,overslept,choking,first_soldier,second_soldier


In [28]:
#print out the topic clustering for only-node randomwalk
%matplotlib qt
color_map = []
topic_groups = []
for i, topic in enumerate(W):
    temp = [entity for entity in idx_to_word[topic.argsort()[-10:]]]
    topic_groups.append(temp)

def formatchange(word):
    word = str(word)
    word = word.lower()
    word = word.replace(" ", "_")
    return word

color_map = []
for node in G:
    if formatchange(node) in topic_groups[0]:
        color_map.append('blue')
    elif formatchange(node) in topic_groups[1]:
        color_map.append('green')
    elif formatchange(node) in topic_groups[2]: 
        color_map.append('yellow') 
    elif formatchange(node) in topic_groups[3]: 
        color_map.append('red') 
    elif formatchange(node) in topic_groups[4]:
        color_map.append('orange')
    else: 
        color_map.append('black')
pos = nx.spring_layout(G,k=0.15,iterations=20, scale=3)
nx.draw(G, pos=pos,font_size=8, node_color=color_map, with_labels=True)
plt.show()

# Random Walk + Topic Modeling (Including Predicates)

In [None]:
import rdflib
import xlrd
import pandas
import random
from rdflib import URIRef, Literal, BNode,Namespace

data = { # similar format for how you made your pandas data fram
    "subject": [],
    "predicate": [],
    "object" : []
}
wb = xlrd.open_workbook("ATTWN triples.xlsx")
sheet = wb.sheet_by_index(0)
for i in range(1, sheet.nrows): # skip the first header line
       
    s = sheet.cell_value(i, 0)
    p = sheet.cell_value(i, 1)
    o = sheet.cell_value(i, 2)
    
    data["subject"].append(s)
    data["predicate"].append(p)
    #print(o)
    data["object"].append(o) 
  
data_processed = {
    'subject':[],'predicate':[],'object': []
}
ugly_token = {
    ' ': '_',
    '"': '',
}
# replace all ugly tokens and copy to new data structure
for x in data.keys(): #x is subj,obj,pred
    for item in data[x]:
        if type(item)!=str:
            data_processed[x].append(item)
            continue
        new_token=item       
        for k in ugly_token:
            new_token=new_token.replace(k, ugly_token[k]) 
            
        data_processed[x].append(new_token)            

n = Namespace("http://UCLA_REU_2020.org/ATTWN/")

g = rdflib.Graph()

for i in range(len(data['subject'])):
    s = n[data_processed['subject'][i]]
    p = n[data_processed['predicate'][i]]
    o_data=data_processed['object'][i]
    if type(o_data)==float or type(o_data)==int:
        o_node=Literal(o_data)
    else:
        o_node=n[o_data]
    g.add((s, p, o_node))

#check g
#for s, p, o in g:
#  print((s, p, o))

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
%matplotlib qt
#%matplotlib inline
plt.figure(figsize=(20,20))

edgelabels={}
G = nx.Graph()
#plt.clf()
for i in range(len(data['object'])):
    v1 = data['subject'][i]
    v2 = data['object'][i]
    G.add_edge(v1,v2)
    e_lbl = data['predicate'][i]
    edgelabels[(v1, v2)] = e_lbl

#print(edge_labels)

pos = nx.spring_layout(G,k=0.15,iterations=20, scale=3)
nx.draw_networkx(G, pos=pos,font_size=8,node_color='pink')
nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edgelabels, font_size=7)

plt.show()

In [None]:
def stripURI(x):
    return x.split("/")[-1]
def random_walk_nondir(g, walk_length):
    '''
        Performs random walk over rdflib graph. Does not pay attention to direction of the knowledge graph.
        Usually in KG it is directed as subj-pred-obj
        Returns 2-tuple of lists (nodes_traversed, edges_traversed)
        g::rdflib graph
        walk_length::positive_int
    '''
    subjs = list(g.subjects())
    currentNode = random.choice(subjs)
    nodes_traversed = [stripURI(currentNode)]
    edges_traversed = []
    for i in range(walk_length):
        # get the possible nodes
        preds_objs = list(g.predicate_objects(subject = currentNode)) # (edge, node)
        subj_preds = list(g.subject_predicates(object = currentNode)) # (node, edge)
        # combine the two lists
        # list of (node, edge) tuples
        nodes_edges = [(t[0], t[1]) for t in subj_preds] + [(t[1], t[0]) for t in preds_objs]
        if len(nodes_edges) == 0: # nowhere to go, end walk
            break
        node, edge = random.choice(nodes_edges)
        nodes_traversed.append(stripURI(node))
        edges_traversed.append(stripURI(edge))
        currentNode = node
    return (nodes_traversed, edges_traversed)


In [29]:
#Here we consider the case where the random walk includes the predicates as well
cleaned_data = []
for i in range(1000):
    rw_nodes, rw_edges = random_walk_nondir(g, 50)
    rw_entity = rw_nodes + rw_edges
    lmao = ' '.join(entity for entity in rw_entity)
    cleaned_data.append(lmao)

In [30]:
#Tf-idf with NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

count_vectorizer = TfidfVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(cleaned_data)
idx_to_word = np.array(count_vectorizer.get_feature_names())

from sklearn.decomposition import NMF
nmf = NMF(n_components=5, solver="mu")
H = nmf.fit_transform(count_data)
W = nmf.components_
 
# print the topics
 
for i, topic in enumerate(W):
 
    print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: has_setting,philip_lombard,edward_armstrong,vera_claythorne,heard_clue,has_firstname,has_lastname,is_character,has_predicate,found_body_of
Topic 2: hatchetwound,gunshot,injection,sleep,blow_to_head,drowning,has_predicate,died_by,has_time,by_means_of
Topic 3: first_soldier,eighth_soldier,fifth_soldier,sixth_soldier,has_title,found_clue,second_soldier,poem,character_of,death_by
Topic 4: two_children,employer,maid,patient,boy,because_of,past_crime,had_killed_,has_predicate,had_killed
Topic 5: self,preservation,alcoholism,describes,is_named,had_killed_,committed_,escaped_conviction_of,past_crime,because_of


# Semi-Supervised NMF

In [None]:
import rdflib
import xlrd
import pandas
import random
from rdflib import URIRef, Literal, BNode,Namespace

data = { # similar format for how you made your pandas data fram
    "subject": [],
    "predicate": [],
    "object" : []
}
wb = xlrd.open_workbook("ATTWN triples.xlsx")
sheet = wb.sheet_by_index(0)
for i in range(1, sheet.nrows): # skip the first header line
       
    s = sheet.cell_value(i, 0)
    p = sheet.cell_value(i, 1)
    o = sheet.cell_value(i, 2)
    
    data["subject"].append(s)
    data["predicate"].append(p)
    #print(o)
    data["object"].append(o) 
  
data_processed = {
    'subject':[],'predicate':[],'object': []
}
ugly_token = {
    ' ': '_',
    '"': '',
}
# replace all ugly tokens and copy to new data structure
for x in data.keys(): #x is subj,obj,pred
    for item in data[x]:
        if type(item)!=str:
            data_processed[x].append(item)
            continue
        new_token=item       
        for k in ugly_token:
            new_token=new_token.replace(k, ugly_token[k]) 
            
        data_processed[x].append(new_token)            

n = Namespace("http://UCLA_REU_2020.org/ATTWN/")

g = rdflib.Graph()

for i in range(len(data['subject'])):
    s = n[data_processed['subject'][i]]
    p = n[data_processed['predicate'][i]]
    o_data=data_processed['object'][i]
    if type(o_data)==float or type(o_data)==int:
        o_node=Literal(o_data)
    else:
        o_node=n[o_data]
    g.add((s, p, o_node))

#check g
#for s, p, o in g:
#  print((s, p, o))

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
%matplotlib qt
#%matplotlib inline
plt.figure(figsize=(20,20))

edgelabels={}
G = nx.Graph()
#plt.clf()
for i in range(len(data['object'])):
    v1 = data['subject'][i]
    v2 = data['object'][i]
    G.add_edge(v1,v2)
    e_lbl = data['predicate'][i]
    edgelabels[(v1, v2)] = e_lbl

#print(edge_labels)

pos = nx.spring_layout(G,k=0.15,iterations=20, scale=3)
nx.draw_networkx(G, pos=pos,font_size=8,node_color='pink')
nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edgelabels, font_size=7)

plt.show()

In [None]:
def stripURI(x):
    return x.split("/")[-1]
def random_walk_nondir(g, walk_length):
    '''
        Performs random walk over rdflib graph. Does not pay attention to direction of the knowledge graph.
        Usually in KG it is directed as subj-pred-obj
        Returns 2-tuple of lists (nodes_traversed, edges_traversed)
        g::rdflib graph
        walk_length::positive_int
    '''
    subjs = list(g.subjects())
    currentNode = random.choice(subjs)
    nodes_traversed = [stripURI(currentNode)]
    edges_traversed = []
    for i in range(walk_length):
        # get the possible nodes
        preds_objs = list(g.predicate_objects(subject = currentNode)) # (edge, node)
        subj_preds = list(g.subject_predicates(object = currentNode)) # (node, edge)
        # combine the two lists
        # list of (node, edge) tuples
        nodes_edges = [(t[0], t[1]) for t in subj_preds] + [(t[1], t[0]) for t in preds_objs]
        if len(nodes_edges) == 0: # nowhere to go, end walk
            break
        node, edge = random.choice(nodes_edges)
        nodes_traversed.append(stripURI(node))
        edges_traversed.append(stripURI(edge))
        currentNode = node
    return (nodes_traversed, edges_traversed)


In [9]:
#Here we consider the case where the random walks only extract the nodes
cleaned_data = []
for i in range(1000):
    rw_nodes, rw_edges = random_walk_nondir(g, 50)
    lmao = ' '.join(word for word in rw_nodes)
    cleaned_data.append(lmao)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
count_vectorizer = TfidfVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(cleaned_data)
idx_to_word = np.array(count_vectorizer.get_feature_names())
cnpy = count_data.todense()
cnpy = cnpy.transpose()

df = pd.read_excel('ATTWN_labeling.xlsx', header=None)  #This is the label, supervised stuffs 
labeling = df.to_numpy()
label = labeling.transpose()*cnpy

ntopics = 3 #change the variable here
nclasses = 3 #change the variable here


A = np.matrix(np.random.rand(cnpy.shape[0],ntopics))
B = np.matrix(np.random.rand(nclasses,ntopics)) 
S = np.matrix(np.random.rand(ntopics,cnpy.shape[1]))

for i in range(180):   
    A = np.multiply(A,np.divide(cnpy*(S.transpose()),A*S*(S.transpose())))
    B = np.multiply(B,np.divide(label*(S.transpose()),B*S*(S.transpose())))
    S = np.multiply(S,np.divide(((A.transpose())*cnpy + (B.transpose())*label) , ((A.transpose())*A*S + (B.transpose())*B*S)))
    
print(A.shape)
print(type(A))
A = np.array(A)
for i in range(3):
    print("Topic {}: {}".format(i+1, ",".join([str(x) for x in idx_to_word[A[:,i].argsort()[-10:]]])))

(125, 3)
<class 'numpy.matrix'>
Topic 1: ethel_rogers,thomas_rogers,philip_lombard,william_blore,lawrence_wargrave,emily_brent,vera_claythorne,edward_armstrong,had_killed,past_crime
Topic 2: red_herring_swallowed_one,first_soldier,second_soldier,third_soldier,sixth_soldier,fourth_soldier,fifth_soldier,seventh_soldier,eighth_soldier,poem
Topic 3: choking,crushed,blow_to_head,gunshot,injection,hatchetwound,hanging,sleep,drowning,died_by


# Let's try the Community Detection

In [11]:
def edge_to_remove(graph):
  G_dict = nx.edge_betweenness_centrality(graph)
  edge = ()

  # extract the edge with highest edge betweenness centrality score
  for key, value in sorted(G_dict.items(), key=lambda item: item[1], reverse = True):
      edge = key
      break

  return edge

In [12]:
def girvan_newman(graph):
    # find number of connected components
    sg = nx.connected_components(graph)
    sg_count = nx.number_connected_components(graph)

    while(sg_count < 7):
        graph.remove_edge(edge_to_remove(graph)[0], edge_to_remove(graph)[1])
        sg = nx.connected_components(graph)
        sg_count = nx.number_connected_components(graph)

    return sg

In [13]:
# find communities in the graph
c = girvan_newman(G.copy())
# find the nodes forming the communities
node_groups = []

for i in c:
  node_groups.append(list(i))

In [14]:
# plot the communities
%matplotlib qt
color_map = []
for node in G:
    if node in node_groups[0]:
        color_map.append('blue')
    elif node in node_groups[1]: 
        color_map.append('green')
    elif node in node_groups[2]: 
        color_map.append('yellow') 
    elif node in node_groups[3]: 
        color_map.append('red') 
    elif node in node_groups[4]: 
        color_map.append('pink') 
    elif node in node_groups[5]: 
        color_map.append('orange')
    else: 
        color_map.append('purple')

nx.draw(G, node_color=color_map, with_labels=True)
plt.show()

  if cb.is_numlike(alpha):


# Predicates Embedding 

In [15]:
import matplotlib.pyplot as plt
import networkx as nx
%matplotlib qt
#%matplotlib inline
plt.figure(figsize=(20,20))
%matplotlib qt
edgelabels={}
G = nx.DiGraph()
#plt.clf()
for i in range(len(data['object'])):
    v1 = data['subject'][i]
    v2 = data['object'][i]
    G.add_edge(v1,v2)
    e_lbl = data['predicate'][i]
    edgelabels[(v1, v2)] = e_lbl

#print(edge_labels)

pos = nx.spring_layout(G,k=0.15,iterations=20, scale=3)
nx.draw_networkx(G, pos=pos,font_size=8,node_color='pink')
nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edgelabels, font_size=7)

plt.show()
for s, p, o in g:
   print((s, p, o))

(rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Vera_Claythorne'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/has_profession'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/former_governess'))
(rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Edward_Armstrong'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/found_body_of'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Lawrence_Wargrave'))
(rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Emily_Brent'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/has_predicate'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/died_by'))
(rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Lawrence_Wargrave'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/escaped_conviction_of'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/past_crime'))
(rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Edward_Armstrong'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/has_predicate'), rdflib.term.URIRef('http:/

In [16]:
#checking possibles predicates
pred = list(g.predicates())
refinedpred = {}
for entry in pred:
    if entry in refinedpred.keys():
        refinedpred[entry] += 1
    else:
        refinedpred[entry] = 1
predicate = list(refinedpred.keys())
strip_predicate  = []
for l in predicate:
    strip_predicate.append(stripURI(l))
print(predicate)
print(strip_predicate)
num_pred = len(strip_predicate)
print(num_pred)

[rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/has_profession'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/found_body_of'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/has_predicate'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/escaped_conviction_of'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/is_character'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/death_by'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/had_killed_'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/because_of'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/heard_clue'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/has_time'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/has_firstname'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/owned_by'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/character_of'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/found_clue'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/invited_by'), r

In [17]:
#checking possibles subjects+obj
sub = list(g.subjects()) + list(g.objects())
refined_ent = {}
for entry in sub:
    if entry in refined_ent.keys():
        refined_ent[entry] += 1
    else:
        refined_ent[entry] = 1
entity = list(refined_ent.keys())
strip_entity  = []
for l in entity:
    strip_entity.append(stripURI(l))
print(entity)
print(strip_entity)
num_entity = len(strip_entity)
print(num_entity)

[rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Vera_Claythorne'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Edward_Armstrong'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Emily_Brent'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Lawrence_Wargrave'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Character'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Philip_Lombard'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Sixth_Soldier'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/past_crime'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Thomas_Rogers'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Anthony_Martson'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/William_Blore'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/died_by'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/Fourth_Soldier'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATTWN/John_Macarthur'), rdflib.term.URIRef('http://UCLA_REU_2020.org/ATT

In [18]:
import numpy as np
from sklearn.preprocessing import normalize

embedding_mat = np.zeros((52,137))

for s,p,o in g:
    indexs = entity.index(s)
    indexp = predicate.index(p)
    indexo = entity.index(o)
    embedding_mat[indexp, indexs] = embedding_mat[indexp,indexs] + 1
    embedding_mat[indexp + 26, indexo] = embedding_mat[indexp + 26, indexo] + 1

predicate_embedding = normalize(embedding_mat, axis = 0, norm= 'l1')

In [19]:
def nnsearch(index, D, K):
    import numpy as np
    consider = D[:,index]
    sqd = np.zeros((1, D.shape[1]))
    for i in range(D.shape[1]):
        s = 0
        for j in range(D.shape[0]):
             s = s + abs(consider[j]-D[j,i])
        sqd[0,i] = s 
    idx = np.argsort(sqd)
    return idx[0,1:K]
## Here, I use the L-1 norm

In [20]:
print(strip_entity[4])

index_4 = nnsearch(4, predicate_embedding, 10)
for sth in index_4:
    print(strip_entity[sth])


Character
Edward_Armstrong
John_Macarthur
Ethel_Rogers
hanging
Thomas_Rogers
choking
Lombard
Ten_Little_Soldiers
reunion


In [21]:
print(strip_entity)

['Vera_Claythorne', 'Edward_Armstrong', 'Emily_Brent', 'Lawrence_Wargrave', 'Character', 'Philip_Lombard', 'Sixth_Soldier', 'past_crime', 'Thomas_Rogers', 'Anthony_Martson', 'William_Blore', 'died_by', 'Fourth_Soldier', 'John_Macarthur', 'hatchetwound', 'Bear-shaped_clock', 'Third_Soldier', 'patient', 'Ninth_Soldier', 'defendant', 'drowning', 'Ethel_Rogers', 'Blow_to_head', 'twentyone_people', 'suspect', 'injection', 'Second_Soldier', 'Tenth_Soldier', 'two_children', 'gunshot', 'Fifth_Soldier', 'had_killed', 'Pills', 'Ninth_Solider', 'Eighth_Soldier', 'sleep', 'hanging', 'Poem', 'Record', 'maid', 'lieutenant', 'Seventh_Soldier', 'Philip_Lombard_', 'First_Soldier', 'crushed', 'Gun', 'choking', 'Syringe', 'boy', 'employer', 'former_governess', 'Anthony_Marston', 'got_in_chancery', 'self-preservation', 'had_killed_', 'record', '10.0', 'Edward_', 'chopped_in_halves', 'Anthony', 'Army_General', 'Philip', 'poem', 'forgotten_acquaintance', 'Indian_Island', 'William', '4.0', 'shove', 'Constanc

In [22]:
print(strip_entity[26])

index_26 = nnsearch(26, predicate_embedding, 10)
for sth in index_26:
    print(strip_entity[sth])

Second_Soldier
First_Soldier
Fifth_Soldier
Third_Soldier
Tenth_Soldier
Eighth_Soldier
Second_Soldier
Seventh_Soldier
Sixth_Soldier
Ninth_Solider


# Miscellaneous

In [None]:
#Countvectorizer + LDA
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(cleaned_data)
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
number_topics = 7
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)


In [None]:
#TF-idf with LDA
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

count_vectorizer = TfidfVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(cleaned_data)
idx_to_word = np.array(count_vectorizer.get_feature_names())
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
number_topics = 7
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

In [None]:
qres = g.query(
    """PREFIX foaf: <http://UCLA_REU_2020.org/ATTWN/>
       SELECT ?obj  
       WHERE {
          foaf:Ethel_Rogers foaf:has_profession ?obj.
       }""")

for row in qres:
    print(row)

In [None]:
qres = g.query(
    """PREFIX foaf: <http://UCLA_REU_2020.org/ATTWN/>
       SELECT ?Firstname
       WHERE {
          ?aname foaf:has_profession foaf:doctor.
          ?aname foaf:has_firstname ?Firstname.
       }""")

for row in qres:
    print(row)

In [None]:
nodes=["Ethel Rogers","past crime","Vera Claythorne","Lawrence Wargrave","Phillip Lombard","Emily Brent","John Macarthur","Edward Armstrong","William Blore"]
g_sub= G.subgraph(nodes)

valid_keys=[(x,y) for x in nodes for y in nodes]
valid_edges = [edge for edge in edgelabels.keys() if any(item == edge for item in valid_keys)]
sub_edgelabels = {key: edgelabels[key] for key in valid_edges}
#print(valid_edges)

pos = nx.spring_layout(g_sub,k=0.15,iterations=20, scale=3)
nx.draw_networkx(g_sub, pos=pos,font_size=8,node_color='pink')
nx.draw_networkx_edge_labels(g_sub, pos=pos, edge_labels=sub_edgelabels, font_size=7)

plt.show()