# Loading word embeddings

In [1]:
import io
import numpy as np
import pandas as pd

In [2]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

# en-es.0-5000.txt     en-ru.0-5000.txt     fr-es.0-5000.txt     pt-de.0-5000.txt
# bn-en.5000-6500.txt               en-es.5000-6500.txt

def load_dict(dict_path="data/crosslingual/dictionaries/en-es.0-5000.txt"):
    
    return pd.read_csv(dict_path, names=["src", "tgt"], delim_whitespace=True)


def multi_key_dict(words, dict_):
    out = []
    for word in words:
        if word in dict_:
            out.append(dict_[word])
    return np.asarray(out)



In [3]:
en_es_dict = load_dict()
en_zh_dict = load_dict("data/crosslingual/dictionaries/en-zh.0-5000.txt")

In [5]:
src_path = 'data/wiki.en.vec'
tgt_path = 'data/wiki.es.vec'
tgt_path2 = 'data/wiki.zh.vec'
nmax = 50000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)
tgt_embeddings2, tgt_id2word2, tgt_word2id2 = load_vec(tgt_path2, nmax)

In [6]:
src, tgt, tgt2, = en_es_dict["src"].values, en_es_dict["tgt"].values, en_zh_dict["tgt"].values

In [7]:
src_ids = multi_key_dict(src, src_word2id)
tgt_ids = multi_key_dict(tgt, tgt_word2id)
tgt_ids2 = multi_key_dict(tgt2, tgt_word2id2)

In [10]:
X , Y, Z = src_embeddings[src_ids, :], tgt_embeddings[tgt_ids,:], tgt_embeddings2[tgt_ids2,:]
X.shape, Y.shape, Z.shape

((11977, 300), (11374, 300), (8225, 300))

In [11]:
import numpy as np
import matplotlib.pyplot as plt
from ripser import Rips
from persim import bottleneck

In [12]:
dgrms = dict()
dsets = [X, Y, Z]
i=0
for language in ['English', 'Spanish', 'Mandarin']:
    rips = Rips(maxdim=2)
    data = dsets[i][:500,:]
    diagrams = rips.fit_transform(data, metric="cosine")
    dgrms[language] = diagrams
    i += 1

Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)


In [3]:
class Tree:
    def __init__(self, data, left=None, right=None):
        self.left = left 
        self.right = right
        self.data = data
    def __str__(self):
        return "Data: %s" % self.data

    @staticmethod
    def node_list(root):
        nlist = []
        queue = [root]
        while len(queue) > 0:
            node = queue.pop()
            if node.left:
                queue.append(node.left)
            if node.right:
                queue.append(node.right)
            else:
                nlist.append(node)
        return nlist
    
    def display(self, keys):
        lines, *_ = self._display_aux(keys)
        for line in lines:
            print(line)

    def _display_aux(self, keys):
        """Returns list of strings, width, height, and horizontal coordinate of the root."""
        # No child.
        if self.right is None and self.left is None:
            line = '%s' % keys[self.data]
            width = len(line)
            height = 1
            middle = width // 2
            return [line], width, height, middle

        # Only left child.
        if self.right is None:
            lines, n, p, x = self.left._display_aux()
            s = '%s' % keys[self.data]
            u = len(s)
            first_line = (x + 1) * ' ' + (n - x - 1) * '_' + s
            second_line = x * ' ' + '/' + (n - x - 1 + u) * ' '
            shifted_lines = [line + u * ' ' for line in lines]
            return [first_line, second_line] + shifted_lines, n + u, p + 2, n + u // 2

        # Only right child.
        if self.left is None:
            lines, n, p, x = self.right._display_aux()
            s = '%s' % keys[self.data]
            u = len(s)
            first_line = s + x * '_' + (n - x) * ' '
            second_line = (u + x) * ' ' + '\\' + (n - x - 1) * ' '
            shifted_lines = [u * ' ' + line for line in lines]
            return [first_line, second_line] + shifted_lines, n + u, p + 2, u // 2

        # Two children.
        left, n, p, x = self.left._display_aux()
        right, m, q, y = self.right._display_aux()
        s = '%s' % keys[self.data]
        u = len(s)
        first_line = (x + 1) * ' ' + (n - x - 1) * '_' + s + y * '_' + (m - y) * ' '
        second_line = x * ' ' + '/' + (n - x - 1 + u + y) * ' ' + '\\' + (m - y - 1) * ' '
        if p < q:
            left += [n * ' '] * (q - p)
        elif q < p:
            right += [m * ' '] * (p - q)
        zipped_lines = zip(left, right)
        lines = [first_line, second_line] + [a + u * ' ' + b for a, b in zipped_lines]
        return lines, n + m + u, max(p, q) + 2, n + u // 2


def distance_matrix(nodes, tiny_dmatrix, dgrms):
    """
    Computes distance matrix with complete linkage 
    """
    d_matrix = np.full((len(nodes), len(nodes)), np.nan)
    for i in range(len(nodes)):
        for j in range(i+1, len(nodes)):
            if isinstance(nodes[i], Tree) and not isinstance(nodes[j], Tree):
                nlist = Tree.node_list(nodes[i])
                ids = [node.data for node in nlist]
                d_matrix[i,j] = max([[tiny_dmatrix[i,j] for i in ids] for j in ids])
            elif isinstance(nodes[j], Tree) and not isinstance(nodes[i], Tree): 
                nlist = Tree.node_list(nodes[j])
                ids = [node.data for node in nlist]
                d_matrix[i,j] = max([[tiny_dmatrix[i,j] for i in ids] for j in ids])
            elif isinstance(nodes[i], Tree) and isinstance(nodes[j], Tree):
                nlist_1 = Tree.node_list(nodes[i])
                nlist_2 = Tree.node_list(nodes[j])
                ids_1 = [node.data for node in nlist_1]
                ids_2 = [node.data for node in nlist_2]
                d_matrix[i,j] = max([tiny_dmatrix[i,j] for i in ids_1 for j in ids_2])
            else: 
                d_matrix[i,j] = bottleneck(dgrms[nodes[i].data], dgrms[nodes[j].data])
            d_matrix[j,i] = d_matrix[i,j]
        return d_matrix
    
def hclustering(dgrms, homology=0):
    nodes = [Tree(i) for i in range(len(dgrms))]
    new_dgrms = [dgrms[i][homology] for i in dgrms]
    tiny_dmatrix = np.full((len(nodes), len(nodes)), np.nan)                
    for i in range(len(nodes)):
        for j in range(i+1, len(nodes)):
            tiny_dmatrix[i,j] = bottleneck(new_dgrms[nodes[i].data], new_dgrms[nodes[j].data])
    while len(nodes) > 1:
        a =[print(node) for node in nodes]
        d_matrix = distance_matrix(nodes, tiny_dmatrix, new_dgrms)
        i, j = np.unravel_index(np.argmin(d_matrix), d_matrix.shape)
        node = Tree(None, left=nodes.pop(i), right=nodes.pop(j))
        nodes.append(node)        
    return nodes[0]
    

In [59]:
q = hclustering(dgrms,homology=1)

Data: 0, Left Node: None, Right Node: None

Data: 1, Left Node: None, Right Node: None

Data: 2, Left Node: None, Right Node: None

Data: 2, Left Node: None, Right Node: None

Data: None, Left Node: Data: 0, Left Node: None, Right Node: None
, Right Node: Data: 1, Left Node: None, Right Node: None




In [70]:
q.display(dgrms.keys())

dict_keys(['English', 'Spanish', 'Mandarin'])