# Loading word embeddings

In [1]:
import io
import os

import numpy as np
import pandas as pd

In [2]:
def load_vec(emb_path, nmax=50000):
    """Read at most ``nmax`` word vectors from a text embedding file.

    The first line of the file is skipped (presumably the ``<count> <dim>``
    header of the .vec format -- confirm against the data). Every following
    line is split at its first space into a word and a space-separated list
    of numbers.

    Parameters
    ----------
    emb_path : str
        Path of the embedding file; opened as UTF-8 with undecodable bytes
        ignored.
    nmax : int
        Reading stops as soon as this many words have been stored.

    Returns
    -------
    embeddings : numpy.ndarray
        The parsed vectors stacked row-wise, in file order.
    id2word : dict
        Row index -> word.
    word2id : dict
        Word -> row index.

    Raises
    ------
    AssertionError
        If the same word appears twice among the lines that were read.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # discard the first line
        for raw_line in handle:
            token, numbers = raw_line.rstrip().split(' ', 1)
            parsed = np.fromstring(numbers, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(parsed)
            # Ids are handed out in insertion order: 0, 1, 2, ...
            word2id[token] = len(word2id)
            if len(word2id) == nmax:
                break
    # word2id preserves insertion order, so enumerating it reproduces the ids.
    id2word = dict(enumerate(word2id))
    return np.vstack(rows), id2word, word2id

def load_dict(dict_path="data/crosslingual/dictionaries/en-es.0-5000.txt"):
    """Load a whitespace-separated two-column word dictionary.

    Parameters
    ----------
    dict_path : str
        Path of the dictionary file. Each line is expected to hold two
        whitespace-separated tokens.

    Returns
    -------
    pandas.DataFrame
        Two string columns named ``"src"`` and ``"tgt"``, one row per line.

    Notes
    -----
    * ``sep=r"\\s+"`` is the documented replacement for the deprecated
      ``delim_whitespace=True``.
    * ``dtype=str`` together with ``keep_default_na=False`` keeps every token
      as the literal text found in the file. With pandas' defaults, words such
      as "null", "nan" or "NA" would be converted to float NaN and could then
      never match a vocabulary key.
    """
    return pd.read_csv(
        dict_path,
        names=["src", "tgt"],
        sep=r"\s+",
        dtype=str,
        keep_default_na=False,
    )


def multi_key_dict(words, dict_):
    """Look up several keys at once, silently skipping the missing ones.

    Parameters
    ----------
    words : iterable
        Keys to look up, in order.
    dict_ : mapping
        Mapping queried with ``in`` and ``[]``.

    Returns
    -------
    numpy.ndarray
        ``dict_[w]`` for every ``w`` in ``words`` that is present in ``dict_``,
        in the order of ``words`` (repeated keys yield repeated values).
    """
    found = [dict_[key] for key in words if key in dict_]
    return np.asarray(found)

In [3]:
# One row per language: (display name, embedding file, dictionary file).
# Keeping the three values side by side guarantees that the parallel lists
# derived below stay aligned index-for-index.
_corpora = [
    ('English',   'data/wiki.en.vec', 'en-en.0-5000.txt'),
    ('Spanish',   'data/wiki.es.vec', 'en-es.0-5000.txt'),
    ('Mandarin',  'data/wiki.zh.vec', 'en-zh.0-5000.txt'),
    ('Korean',    'data/wiki.ko.vec', 'en-ko.0-5000.txt'),
    ('Russian',   'data/wiki.ru.vec', 'en-ru.0-5000.txt'),
    ('Japanese',  'data/wiki.ja.vec', 'en-ja.0-5000.txt'),
    ('German',    'data/wiki.de.vec', 'en-de.0-5000.txt'),
    ('Dutch',     'data/wiki.nl.vec', 'en-nl.0-5000.txt'),
    ('French',    'data/wiki.fr.vec', 'en-fr.0-5000.txt'),
    ('Arabic',    'data/wiki.ar.vec', 'en-ar.0-5000.txt'),
    ('Finnish',   'data/wiki.fi.vec', 'en-fi.0-5000.txt'),
    ('Hungarian', 'data/wiki.hu.vec', 'en-hu.0-5000.txt'),
]
embeddings = [vec_file for _name, vec_file, _dict_file in _corpora]
dictionaries = [dict_file for _name, _vec_file, dict_file in _corpora]
languages = [name for name, _vec_file, _dict_file in _corpora]

In [4]:
nmax = 50000  # upper bound on the number of vectors read from each .vec file
data = {}
# Build one point cloud per language: the embedding rows of the dictionary
# words that exist in that language's (truncated) vocabulary.
for lang_name, vec_path, dict_file in zip(languages, embeddings, dictionaries):
    emb, id2word, word2id = load_vec(vec_path, nmax)
    bilingual = load_dict("data/crosslingual/dictionaries/" + dict_file)
    # Second column of the dictionary file; words missing from word2id are
    # dropped by multi_key_dict.
    dict_words = bilingual["tgt"].values
    row_ids = multi_key_dict(dict_words, word2id)
    matched = emb[row_ids, :]
    # Only the first 200 matched vectors are kept for each language.
    data[lang_name] = matched[:200, :]

In [5]:
import numpy as np
import matplotlib.pyplot as plt
from ripser import Rips
from persim import bottleneck, sliced_wasserstein
from scipy.spatial.distance import cosine

In [6]:
def cosine_dist(a,b):
    """Angular distance between two vectors, scaled to the range [0, 1].

    The angle between ``a`` and ``b`` is recovered from SciPy's cosine
    distance (``1 - cos(angle)``) and divided by pi, so parallel vectors give
    0, orthogonal vectors 0.5 and opposite vectors 1.

    Parameters
    ----------
    a, b : array_like
        1-D vectors of equal length.

    Returns
    -------
    float
        ``arccos(cos(angle)) / pi``.
    """
    cos_distance = cosine(a, b)
    # Rounding can leave the cosine a hair outside [-1, 1] for (nearly)
    # parallel or anti-parallel vectors; arccos would then return NaN, so the
    # value is clamped to the valid domain first.
    cos_angle = np.clip(1.0 - cos_distance, -1.0, 1.0)
    return np.arccos(cos_angle) / np.pi

In [7]:
# Compute the persistence diagrams of every language's point cloud, using the
# custom `cosine_dist` function as the pairwise metric.
dgrms = dict()
for language in languages:
    # A fresh Rips object is built on every pass. Judging by the cell output,
    # construction prints the "Rips(...)" settings line, which is why that line
    # precedes each "Language:" line -- keep this statement order to keep the log.
    # After the loop `rips` still refers to the last instance; the plotting
    # cell further down calls `rips.plot`.
    rips = Rips(maxdim=3)
    print("Language: ", language)
    # Diagrams are stored under the language name.
    dgrms[language] = rips.fit_transform(data[language], metric=cosine_dist)

Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  English




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Spanish




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Mandarin




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Korean




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Russian




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Japanese




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  German




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Dutch




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  French




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Arabic




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Finnish




Rips(maxdim=3, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Hungarian




In [8]:
# Save one persistence-diagram figure per language under pds_ang_3/.
# The folder is created up front: savefig does not create missing directories
# and would otherwise raise FileNotFoundError on a fresh checkout.
os.makedirs('pds_ang_3', exist_ok=True)
for lang in languages:
    plt.figure()
    # `rips` is the instance left over from the diagram-computation loop.
    rips.plot(dgrms[lang])
    plt.title(lang)
    # Optional: uncomment to pin both axes to [0, 1] (the range of
    # cosine_dist) so that the figures share a common scale.
    # plt.xlim([0,1])
    # plt.ylim([0,1])
    plt.savefig('pds_ang_3/' + lang + '.png', dpi=600, bbox_inches = 'tight', pad_inches = 0.2)
    # Close the figure so open figures do not accumulate across iterations.
    plt.close()

In [39]:
class Tree:
    """Binary tree node used to record the merges of a hierarchical clustering.

    ``data`` is used as an index into a list of labels when the tree is
    rendered (``keys[self.data]``), so it must be a valid index for the
    ``keys`` passed to :meth:`display`. ``left`` / ``right`` are child
    ``Tree`` nodes or ``None``.
    """

    def __init__(self, data, left=None, right=None):
        self.left = left
        self.right = right
        self.data = data

    def __str__(self):
        return "Data: %s" % self.data

    @staticmethod
    def node_list(root):
        """Return the leaf nodes below ``root`` (``[root]`` if it is a leaf).

        A leaf is a node with neither a left nor a right child. Nodes with
        exactly one child are internal and are never returned. Traversal is
        depth-first with an explicit stack, visiting the right subtree first.
        """
        leaves = []
        stack = [root]
        while stack:
            node = stack.pop()
            if node.left is None and node.right is None:
                leaves.append(node)
                continue
            if node.left is not None:
                stack.append(node.left)
            if node.right is not None:
                stack.append(node.right)
        return leaves

    def display(self, keys):
        """Print an ASCII drawing of the tree, labelling nodes with ``keys[node.data]``."""
        lines, *_ = self._display_aux(keys)
        for line in lines:
            print(line)

    def _display_aux(self, keys):
        """Returns list of strings, width, height, and horizontal coordinate of the root.

        Every returned line has exactly ``width`` characters; callers rely on
        that to place sibling subtrees side by side.
        """
        # No child.
        if self.right is None and self.left is None:
            line = keys[self.data]
            width = len(line)
            height = 1
            middle = width // 2
            return [line], width, height, middle

        # Only left child.
        if self.right is None:
            lines, n, p, x = self.left._display_aux(keys)
            s = keys[self.data]
            u = len(s)
            first_line = (x + 1) * ' ' + (n - x - 1) * '_' + s
            second_line = x * ' ' + '/' + (n - x - 1 + u) * ' '
            shifted_lines = [line + u * ' ' for line in lines]
            return [first_line, second_line] + shifted_lines, n + u, p + 2, n + u // 2

        # Only right child.
        if self.left is None:
            lines, n, p, x = self.right._display_aux(keys)
            s = keys[self.data]
            u = len(s)
            first_line = s + x * '_' + (n - x) * ' '
            second_line = (u + x) * ' ' + '\\' + (n - x - 1) * ' '
            shifted_lines = [u * ' ' + line for line in lines]
            return [first_line, second_line] + shifted_lines, n + u, p + 2, u // 2

        # Two children.
        left, n, p, x = self.left._display_aux(keys)
        right, m, q, y = self.right._display_aux(keys)
        s = keys[self.data]
        # One extra column is reserved so the two subtrees never touch, even
        # when the node label is the empty string.
        u = len(s) + 1
        # Pad the label to the full reserved width; otherwise the first line is
        # one character shorter than the width reported to the parent and the
        # columns of enclosing levels drift out of alignment.
        label = s.ljust(u, '_')
        first_line = (x + 1) * ' ' + (n - x - 1) * '_' + label + y * '_' + (m - y) * ' '
        second_line = x * ' ' + '/' + (n - x - 1 + u + y) * ' ' + '\\' + (m - y - 1) * ' '
        # Pad the shorter subtree with blank lines so both have equal height.
        if p < q:
            left += [n * ' '] * (q - p)
        elif q < p:
            right += [m * ' '] * (p - q)
        zipped_lines = zip(left, right)
        lines = [first_line, second_line] + [a + u * ' ' + b for a, b in zipped_lines]
        return lines, n + m + u, max(p, q) + 2, n + u // 2


def distance_matrix(nodes, tiny_dmatrix, linkage="complete"):
    """
    Computes the cluster-to-cluster distance matrix for the current clusters.

    Each entry of ``nodes`` is a ``Tree`` whose leaves carry, in ``data``, row
    and column indices into ``tiny_dmatrix`` (the item-to-item distances).
    With ``linkage == "complete"`` the distance between two clusters is the
    largest item-to-item distance across them; any other ``linkage`` value
    uses the smallest.

    Only the strict upper triangle of the result is filled; the diagonal and
    the lower triangle are NaN.
    """
    size = len(nodes)
    # Leaf indices under every cluster, collected once up front.
    members = [[leaf.data for leaf in Tree.node_list(cluster)] for cluster in nodes]
    pick = max if linkage == "complete" else min

    d_matrix = np.full((size, size), np.nan)
    for row in range(size):
        for col in range(row + 1, size):
            cross = [tiny_dmatrix[a, b] for a in members[row] for b in members[col]]
            d_matrix[row, col] = pick(cross)
    return d_matrix
    
def hclustering(dgrms, homology=0, dist='sw', linkage="complete"):
    """Agglomeratively cluster persistence diagrams into a binary ``Tree``.

    Parameters
    ----------
    dgrms : mapping
        ``dgrms[key][dim]`` must be the diagram (array of ``(birth, death)``
        rows) of homology dimension ``dim`` for item ``key``. Items are
        numbered 0, 1, ... in the mapping's iteration order; those numbers are
        stored in the leaves' ``data``.
    homology : int or sequence of int
        Homology dimension(s) to use. For several dimensions the diagrams are
        stacked into one array per item. Any number of dimensions is accepted.
    dist : str
        ``'sw'`` selects ``sliced_wasserstein``; any other value selects
        ``bottleneck``.
    linkage : str
        Forwarded to ``distance_matrix`` (``"complete"``, otherwise single).

    Returns
    -------
    root : Tree
        Root of the merge tree. Internal nodes are created with ``data=-1``,
        so a label list used for display needs a suitable last entry.
    tiny_dmatrix : numpy.ndarray
        Symmetric item-to-item distance matrix with NaN on the diagonal.

    Raises
    ------
    ValueError
        If ``homology`` is an empty sequence.
    IndexError
        If ``dgrms`` is empty (there is no root to return).

    Notes
    -----
    Each merge prints the linkage distance at which it happened.
    """
    # Normalise `homology` to a tuple so one code path serves ints and
    # sequences alike (a bare non-zero int used to crash in the `0 in homology`
    # test, and sequences of length 1 or > 4 left the diagrams undefined).
    if isinstance(homology, (int, np.integer)):
        dims = (int(homology),)
    else:
        dims = tuple(homology)
    if not dims:
        raise ValueError("homology must name at least one dimension")

    # One stacked diagram per item, in the iteration order of `dgrms`.
    new_dgrms = [np.vstack([dgrms[key][dim] for dim in dims]) for key in dgrms]

    if dist == 'sw' and 0 in dims:
        # Dimension 0 is the one whose infinite-death bars are removed before
        # the sliced Wasserstein distance is taken; filter once per diagram
        # rather than once per pair.
        new_dgrms = [dgm[dgm[:, 1] != np.inf] for dgm in new_dgrms]

    n_items = len(new_dgrms)
    nodes = [Tree(idx) for idx in range(n_items)]
    diagram_distance = sliced_wasserstein if dist == 'sw' else bottleneck

    tiny_dmatrix = np.full((n_items, n_items), np.nan)
    for i in range(n_items):
        for j in range(i + 1, n_items):
            tiny_dmatrix[i, j] = diagram_distance(new_dgrms[i], new_dgrms[j])
            tiny_dmatrix[j, i] = tiny_dmatrix[i, j]

    # Repeatedly merge the two closest clusters until a single root remains.
    while len(nodes) > 1:
        d_matrix = distance_matrix(nodes, tiny_dmatrix, linkage)
        # d_matrix is NaN outside its strict upper triangle, hence nanargmin.
        i, j = np.unravel_index(np.nanargmin(d_matrix), d_matrix.shape)
        print("The minimum is ", d_matrix[i,j])
        merged = Tree(-1, left=nodes[i], right=nodes[j])
        nodes = [node for k, node in enumerate(nodes) if k not in (i, j)]
        nodes.append(merged)
    return nodes[0], tiny_dmatrix
    
    

In [10]:
# Label list for rendering trees: the language names in the order of `dgrms`,
# followed by an empty string. Internal tree nodes carry data == -1, so
# langs[-1] (the empty string) is what gets drawn for them.
langs = list(dgrms) + ['']

In [40]:
# Same diagrams without the last two entries of `languages`.
shorter_dgrms = {name: dgrms[name] for name in languages[:-2]}

In [41]:
# Cluster the reduced language set on homology dimensions 0-2 using the
# sliced Wasserstein distance and complete linkage.
q1, tiny = hclustering(
    shorter_dgrms,
    homology=[0, 1, 2],
    dist='sw',
    linkage="complete",
)

The minimum is  1.236719309994832
The minimum is  1.8599817414941389
The minimum is  2.34496159486684
The minimum is  2.724530976196973
The minimum is  3.5343840446954444
The minimum is  3.980298343539048
The minimum is  5.6107845415660105
The minimum is  7.617577860481355
The minimum is  28.920650722237983


In [44]:
# Cluster all languages on homology dimensions 0-3; this overwrites the
# `q1` / `tiny` produced by the earlier run on the reduced set.
q1, tiny = hclustering(
    dgrms,
    homology=[0, 1, 2, 3],
    dist='sw',
    linkage="complete",
)

The minimum is  1.262962273017806
The minimum is  1.496612227808784
The minimum is  1.904786958614355
The minimum is  2.5695036379562426
The minimum is  2.745530073015718
The minimum is  3.283867735969512
The minimum is  3.624455040314522
The minimum is  4.8713334764250344
The minimum is  5.6826285465295685
The minimum is  7.617577860481355
The minimum is  29.033173526974227


In [468]:
# Sanity check: the first diagram stored for English is a numpy.ndarray.
type(shorter_dgrms['English'][0])

numpy.ndarray

In [45]:
# Draw the latest clustering tree as ASCII art; leaves are labelled via
# langs[node.data], internal nodes (data == -1) via the trailing '' in `langs`.
q1.display(langs)

                                     ______________________________________________         
                                    /                                               \        
                      ____________________________                              _______    
                     /                             \                            /        \   
       ____________________                ________________               Mandarin Japanese
      /                     \              /                 \                               
    _________            _____         _____          ___________                        
   /          \          /      \       /      \        /            \                       
German     ______    Korean Russian Arabic Finnish Hungarian     _____                     
          /       \                                              /      \                    
       Spanish French                                         English D

In [398]:
def make_latex_tree(tree,langs):
    """Print ``tree`` as a LaTeX ``forest`` environment on standard output.

    Parameters
    ----------
    tree : Tree
        Root node; any object with ``left``, ``right`` and ``data`` works.
    langs : sequence of str
        Leaf labels, indexed by each leaf's ``data``.

    Nothing is returned and no trailing newline is printed.
    """
    # chr(92) is a backslash.
    print(chr(92)+'begin{forest}\n[',end='')
    _make_latex_tree(tree,langs)
    print(']\n'+ chr(92) +'end{forest}',end='')


def _make_latex_tree(tree, langs):
    """Print the bracket contents for ``tree`` (without the enclosing brackets).

    A leaf prints its label ``langs[tree.data]``. An internal node prints
    ``|`` followed by one ``[...]`` group per existing child, left first.
    Whether a node is internal is decided by its children alone, so the value
    stored in an internal node's ``data`` does not matter, and a node with a
    single child is rendered with just that child.
    """
    children = [child for child in (tree.left, tree.right) if child is not None]
    if not children:
        print(langs[tree.data], end='')
        return
    print('|', end='')
    for child in children:
        print('[', end='')
        _make_latex_tree(child, langs)
        print(']', end='')

In [472]:
# Emit the current clustering tree `q1` as a LaTeX forest snippet.
make_latex_tree(q1,langs)

\begin{forest}
[|[|[|[English][Arabic]][|[|[Spanish][German]][|[Korean][|[Dutch][|[Russian][French]]]]]][|[Mandarin][Japanese]]]
\end{forest}