# INSTRUCTIONS:
1) Generate tuples using AllenNLP

2) Call create_id_files

3) Run Train_TransE

4) Call reduce_relations

In [1]:
import numpy as np

In [233]:
def get_ids(idfile):
    with open(idfile, 'r') as infile:
        lines = infile.read().split('\n')
        ilist = [''] * len(lines)
        for pair in lines:
            p = pair.split('\t')
            ilist[int(p[1])] = p[0]
    return ilist
    
def get_vectors(vecfile):
    with open(vecfile, 'r') as infile:
        lines = infile.read().split('\n')
        vlist = [np.array([float(s) for s in vec.split('\t') if len(s) > 0]) for vec in lines]
    return np.array(vlist)
        

def combine_relations(R, thresh):
    """
    R is a (M, N) matrix where M is the number of relations and N is the size of a relation
    Returns a list of K indices and a (K, N) matrix where K is the new number of relations 
    which have been combined based on the cosine similarity threshold
    """
    still_combining = True
    combined = np.copy(R)
    indices = list(range(len(R)))
    newindices = list(range(len(R)))
    while still_combining:
        still_combining = False
        for i in range(len(combined)):
            for j in range(len(combined)-1,i,-1):
                cos_sim = np.dot(combined[i,:], combined[j,:]) / (np.linalg.norm(combined[i,:]) * np.linalg.norm(combined[j,:]))
                if cos_sim > thresh:
                    still_combining = True
                    combined = np.delete(combined, j, 0)
                    newindices[indices.pop(j)] = i
    return newindices, combined

def reduce_relations(ridfile, rvecfile, thresh):
    rids = get_ids(ridfile)
    rvecs = get_vectors(rvecfile)
    newids, rvecs = combine_relations(rvecs, thresh)
    return [rids[i] for i in newids]
    

In [215]:
a = np.array([[1,0,0],[1,0,0],[0,1,0]])
combine_relations(a, 0.5)

([0, 0, 2], array([[1, 0, 0],
        [0, 1, 0]]))

In [230]:
list(zip(reduce_relations('relation2id.txt', 'relation2vec.csv', 0.22), get_ids('relation2id.txt')))

[('draft and was selected', 'draft and was selected'),
 ('continued to be', 'continued to be'),
 ('was named', 'was named'),
 ('led', 'led'),
 ('won', 'won'),
 ('are tied', 'are tied'),
 ('led', 'dropped'),
 ('earning', 'earning'),
 ('was recognized', 'was recognized'),
 ('to start', 'to start'),
 ('suffered', 'suffered'),
 ('to reach', 'to reach'),
 ('was recognized', 'was awarded'),
 ('was recognized', 'was chosen'),
 ('traded', 'traded'),
 ('was settled', 'was settled'),
 ('earning', 'announced'),
 ('is', 'is'),
 ('dropped', 'enjoyed'),
 ('would retire', 'would retire'),
 ('played', 'played'),
 ('suffered', 'lost'),
 ('by winning', 'by winning'),
 ('he would continue to be selected', 'he would continue to be selected'),
 ('losing', 'losing'),
 ('pick', 'pick'),
 ('continued to be', 'declared'),
 ('was named', 'ending'),
 ('was traded', 'was traded'),
 ('entered', 'entered'),
 ('enjoyed', 'leading'),
 ('earned', 'earned'),
 ('draft and was selected', 'surpassed'),
 ('scored', 'scored

In [232]:
def create_id_files(triplefile, relationfile, entityfile):
    entities = set()
    relations = set()
    with open(triplefile, 'r') as infile:
        for line in infile.read().split('\n'):
            if line == '':
                continue
            triple = line.split('\t')
            entities.add(triple[0])
            entities.add(triple[1])
            relations.add(triple[2])
    entities = list(entities)
    relations = list(relations)
    with open(relationfile, 'w') as outfile:
        outfile.write('\n'.join([relations[i] + '\t' + str(i) for i in range(len(relations))]))
    with open(entityfile, 'w') as outfile:
        outfile.write('\n'.join([entities[i] + "\t" + str(i) for i in range(len(entities))]))

In [190]:
create_id_files('relation_tuples.txt', 'relation2id.txt', 'entity2id.txt')

In [234]:
def to_json(eidfile, triplefile, jsonfile):
    jsondict = {"nodes": [], "links": []}
    entities = get_ids(eidfile)
    
    for i in range(len(entities)):
        jsondict["nodes"].append({"id": i, "name": entities[i]})
        
    with open(triplefile, 'r') as infile:
        for line in infile.read().split('\n'):
            if line == '':
                continue
            triple = line.split('\t')
            jsondict["links"].append({"source": entities.index(triple[0]),"target": entities.index(triple[1]),"name":triple[2]})
    
    return jsondict

In [236]:
to_json('entity2id.txt', 'relation extraction/relation_tuples.txt', 'viz/relations.json')

{'nodes': [{'id': 0, 'name': 'Jerry_West'},
  {'id': 1, 'name': 'among_the_top_players_in_the_league'},
  {'id': 2, 'name': 'the_Finals_MVP_Award'},
  {'id': 3,
   'name': "third_on_the_league_'s_all_-_time_regular_season_scoring_and_fourth_on_the_all_-_time_postseason_scoring_list"},
  {'id': 4, 'name': 'the'},
  {'id': 5, 'name': 'by_the_Charlotte_Hornets'},
  {'id': 6, 'name': "the_regular_season_'s_Most_Valuable_Player_Award"},
  {'id': 7, 'name': 'a_torn_Achilles'},
  {'id': 8, 'name': 'the_charges'},
  {'id': 9, 'name': 'to_start_the_All_-_Star_Game'},
  {'id': 10, 'name': 'He'},
  {'id': 11, 'name': 'a_torn_Achilles_tendon'},
  {'id': 12, 'name': 'he'},
  {'id': 13, 'name': 'gold_medals'},
  {'id': 14, 'name': 'the_1997_Slam_Dunk_Contest'},
  {'id': 15, 'name': 'in_the_2008_NBA_Finals'},
  {'id': 16, 'name': 'eventually'},
  {'id': 17, 'name': 'his_play'},
  {'id': 18, 'name': "O'Neal"},
  {'id': 19, 'name': 'the_Academy_Award'},
  {'id': 20, 'name': 'scorer'},
  {'id': 21, 'nam