# INSTRUCTIONS:
1) Generate tuples using AllenNLP

2) Call create_id_files

3) Run Train_TransE

4) Call reduce_relations

In [9]:
import numpy as np
import json
import matplotlib.pyplot as plt

In [10]:
def get_ids(idfile):
    """Load a ``name<TAB>id`` file into a list of names indexed by id.

    Each non-blank line of ``idfile`` holds a name and an integer id
    separated by a tab.  Blank lines (including the empty string produced
    by a trailing newline) are skipped instead of raising ``IndexError``.

    Args:
        idfile: Path of the tab-separated id file to read.

    Returns:
        A list ``ilist`` where ``ilist[id] == name``.  The list has one slot
        per non-blank line, so an id outside ``0 .. n-1`` raises
        ``IndexError``.
    """
    with open(idfile, 'r') as infile:
        pairs = [
            line.split('\t')
            for line in infile.read().split('\n')
            if line.strip()
        ]
    ilist = [''] * len(pairs)
    for p in pairs:
        ilist[int(p[1])] = p[0]
    return ilist
    
def get_vectors(vecfile):
    """Load tab-separated float vectors, one vector per line.

    Blank lines (including the empty string left by a trailing newline)
    are skipped so they do not turn into empty rows that make the result
    ragged.  Empty fields inside a line are ignored.

    Args:
        vecfile: Path of the vector file to read.

    Returns:
        A numpy array built from the parsed rows, in file order.
    """
    with open(vecfile, 'r') as infile:
        rows = [
            [float(s) for s in line.split('\t') if len(s) > 0]
            for line in infile.read().split('\n')
            if line.strip()
        ]
    return np.array(rows)

def combine_relations(R, thresh):
    """Greedily merge relation vectors whose cosine similarity exceeds thresh.

    R is a (M, N) matrix where M is the number of relations and N is the
    size of a relation vector.  Rows are compared pairwise; whenever a later
    row is more similar than ``thresh`` to an earlier surviving row, the
    later row is dropped and mapped onto the earlier one.

    Args:
        R: (M, N) array of relation vectors.
        thresh: Cosine-similarity threshold above which two rows are merged.

    Returns:
        A tuple ``(newindices, combined)``.  ``newindices`` has length M and
        ``newindices[k]`` is the *original* row index of the relation that
        row ``k`` was merged into (``k`` itself if it survived).
        ``combined`` is the (K, N) matrix of the K surviving rows.
    """
    combined = np.copy(R)
    # indices[p] is the original row index of the row now at position p.
    indices = list(range(len(R)))
    newindices = list(range(len(R)))
    still_combining = True
    while still_combining:
        still_combining = False
        for i in range(len(combined)):
            # Walk j downwards so deleting row j never shifts rows still
            # to be visited in this inner loop.
            for j in range(len(combined) - 1, i, -1):
                cos_sim = np.dot(combined[i, :], combined[j, :]) / (
                    np.linalg.norm(combined[i, :]) * np.linalg.norm(combined[j, :])
                )
                if cos_sim > thresh:
                    still_combining = True
                    combined = np.delete(combined, j, 0)
                    # Map onto the survivor's ORIGINAL index; position i in
                    # the shrunken matrix differs once rows were deleted.
                    newindices[indices.pop(j)] = indices[i]
    return newindices, combined

def reduce_relations(ridfile, rvecfile, reducefile, thresh):
    """Merge similar relations and write the resulting name/id mapping.

    Reads relation names from ``ridfile`` and their vectors from
    ``rvecfile``, merges them with ``combine_relations`` at the given
    threshold, and writes one ``name<TAB>id`` line per original relation
    to ``reducefile``, using the name and id of the relation it was
    merged into.

    Returns:
        The list of merged-into relation names, one per original relation.
    """
    names = get_ids(ridfile)
    vectors = get_vectors(rvecfile)
    merged_ids, _ = combine_relations(vectors, thresh)
    with open(reducefile, "w") as sink:
        sink.writelines("%s\t%s\n" % (names[k], k) for k in merged_ids)
    return [names[k] for k in merged_ids]


In [11]:
# Smoke test: rows 0 and 1 are identical (cosine similarity 1), so with a
# threshold of 0.5 they should be merged while row 2 stays separate.
a = np.array([[1,0,0],[1,0,0],[0,1,0]])
combine_relations(a, 0.5)

([0, 0, 2], array([[1, 0, 0],
        [0, 1, 0]]))

In [13]:
# Pair each relation's merged-into name with its original name, in id order.
list(zip(reduce_relations('data/relation2id.txt', 'data/relation2vec.csv', 'data/combined_relations.txt', 0.22), get_ids('data/relation2id.txt')))

FileNotFoundError: [Errno 2] No such file or directory: 'data/relation2vec.csv'

In [5]:
def create_id_files(triplefile, relationfile, entityfile):
    """Assign integer ids to the entities and relations found in a triple file.

    Each non-empty line of ``triplefile`` is tab-separated with the two
    entities in the first two fields and the relation in the third.  Ids are
    assigned in sorted name order so that repeated runs on the same input
    produce identical id files (plain ``set`` iteration order for strings
    varies between interpreter runs).

    Args:
        triplefile: Path of the tab-separated triple file to read.
        relationfile: Output path for ``relation<TAB>id`` lines.
        entityfile: Output path for ``entity<TAB>id`` lines.

    Both output files are written without a trailing newline.
    """
    entities = set()
    relations = set()
    with open(triplefile, 'r') as infile:
        for line in infile.read().split('\n'):
            if line == '':
                continue
            triple = line.split('\t')
            entities.add(triple[0])
            entities.add(triple[1])
            relations.add(triple[2])
    with open(relationfile, 'w') as outfile:
        outfile.write('\n'.join(
            name + '\t' + str(i) for i, name in enumerate(sorted(relations))
        ))
    with open(entityfile, 'w') as outfile:
        outfile.write('\n'.join(
            name + "\t" + str(i) for i, name in enumerate(sorted(entities))
        ))

In [6]:
# Build the relation and entity id files from the extracted tuples.
create_id_files('data/relation_tuples.txt', 'data/relation2id.txt', 'data/entity2id.txt')

In [7]:
def to_json(eidfile, triplefile, jsonfile):
    """Export entities and triples as a ``{"nodes": [...], "links": [...]}`` JSON graph.

    Nodes come from the entity id file (``id`` is the list position,
    ``name`` the entity name).  Each non-empty line of ``triplefile``
    becomes a link whose ``source`` and ``target`` are the node ids of the
    first two fields and whose ``name`` is the third field.

    Args:
        eidfile: Path of the entity id file, read with ``get_ids``.
        triplefile: Path of the tab-separated triple file.
        jsonfile: Output path for the JSON document.

    Raises:
        ValueError: If a triple mentions an entity missing from ``eidfile``.
    """
    entities = get_ids(eidfile)

    # One dict lookup per entity instead of a linear list.index() scan per
    # triple.  setdefault keeps the first occurrence, matching list.index().
    position = {}
    for idx, name in enumerate(entities):
        position.setdefault(name, idx)

    def node_id(name):
        try:
            return position[name]
        except KeyError:
            # list.index() raised ValueError; keep that contract for callers.
            raise ValueError("%r is not in list" % (name,)) from None

    jsondict = {
        "nodes": [{"id": idx, "name": name} for idx, name in enumerate(entities)],
        "links": [],
    }

    with open(triplefile, 'r') as infile:
        for line in infile.read().split('\n'):
            if line == '':
                continue
            triple = line.split('\t')
            jsondict["links"].append({
                "source": node_id(triple[0]),
                "target": node_id(triple[1]),
                "name": triple[2],
            })

    with open(jsonfile, 'w') as outfile:
        outfile.write(json.dumps(jsondict))

In [8]:
#to_json('data/entity2id.txt', 'data/relation_tuples.txt', 'viz/relations.json')

In [None]:
def heat_map():
    """Show a sample annotated heatmap of harvest amounts per farmer and crop.

    The data is hard-coded: a 7x7 grid with vegetables on the y axis and
    farmers on the x axis.  Every cell is labelled with its value and the
    figure is displayed with ``plt.show()``.
    """
    crops = ["cucumber", "tomato", "lettuce", "asparagus",
             "potato", "wheat", "barley"]
    growers = ["Farmer Joe", "Upland Bros.", "Smith Gardening",
               "Agrifun", "Organiculture", "BioGoods Ltd.", "Cornylee Corp."]

    # Rows follow ``crops``, columns follow ``growers``.
    tonnage = np.array([
        [0.8, 2.4, 2.5, 3.9, 0.0, 4.0, 0.0],
        [2.4, 0.0, 4.0, 1.0, 2.7, 0.0, 0.0],
        [1.1, 2.4, 0.8, 4.3, 1.9, 4.4, 0.0],
        [0.6, 0.0, 0.3, 0.0, 3.1, 0.0, 0.0],
        [0.7, 1.7, 0.6, 2.6, 2.2, 6.2, 0.0],
        [1.3, 1.2, 0.0, 0.0, 0.0, 3.2, 5.1],
        [0.1, 2.0, 0.0, 1.4, 0.0, 1.9, 6.3],
    ])

    fig, ax = plt.subplots()
    ax.imshow(tonnage)

    # One tick per column/row, labelled with the matching list entry.
    ax.set_xticks(np.arange(len(growers)))
    ax.set_yticks(np.arange(len(crops)))
    ax.set_xticklabels(growers)
    ax.set_yticklabels(crops)

    # Tilt the x labels so the long farmer names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each cell's value at its centre (x is the column, y is the row).
    for (row, col), value in np.ndenumerate(tonnage):
        ax.text(col, row, value, ha="center", va="center", color="w")

    ax.set_title("Harvest of local farmers (in tons/year)")
    fig.tight_layout()
    plt.show()