# Loading word embeddings

In [1]:
import io
import numpy as np
import pandas as pd

In [2]:
def load_vec(emb_path, nmax=50000):
    """Read a text embedding file into a matrix and two vocabulary maps.

    The first line of the file is skipped (presumably a header line — verify
    against the file format in use). Every following line is split at its
    first space into a word and a space-separated list of numbers.

    Parameters
    ----------
    emb_path : str
        Path of the embedding file. It is decoded as UTF-8 and undecodable
        bytes are dropped.
    nmax : int
        Reading stops as soon as this many words have been collected.

    Returns
    -------
    embeddings : numpy.ndarray
        One row per word, stacked in file order.
    id2word : dict
        Row index -> word.
    word2id : dict
        Word -> row index.

    Raises
    ------
    AssertionError
        If the same word occurs twice among the lines that are read.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # discard the first line
        for raw_line in handle:
            token, values = raw_line.rstrip().split(' ', 1)
            row = np.fromstring(values, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(row)
            # The new word's id is the index of the row that was just appended.
            word2id[token] = len(rows) - 1
            if len(word2id) == nmax:
                break
    id2word = dict(zip(word2id.values(), word2id.keys()))
    embeddings = np.vstack(rows)
    return embeddings, id2word, word2id

# en-es.0-5000.txt     en-ru.0-5000.txt     fr-es.0-5000.txt     pt-de.0-5000.txt
# bn-en.5000-6500.txt               en-es.5000-6500.txt

def load_dict(dict_path="data/crosslingual/dictionaries/en-es.0-5000.txt"):
    """Load a whitespace-separated bilingual dictionary file.

    Parameters
    ----------
    dict_path : str
        Path to a headerless text file with two whitespace-separated tokens
        per line.

    Returns
    -------
    pandas.DataFrame
        Two string columns named ``"src"`` and ``"tgt"``, one row per line.
    """
    return pd.read_csv(
        dict_path,
        names=["src", "tgt"],
        # `delim_whitespace=True` is deprecated in recent pandas; a regex
        # separator is the supported equivalent.
        sep=r"\s+",
        # Every token is a word: keep "2000" as text and do not let pandas
        # turn words such as "null", "nan" or "NA" into missing values.
        dtype=str,
        keep_default_na=False,
    )


def multi_key_dict(words, dict_):
    """Look up several keys at once, silently dropping the missing ones.

    Returns a NumPy array holding ``dict_[word]`` for every ``word`` in
    ``words`` that is a key of ``dict_``, in the order of ``words``.
    """
    return np.asarray([dict_[word] for word in words if word in dict_])



In [3]:
# Bilingual dictionaries: the default path (en-es) and an explicit en-zh path.
en_es_dict = load_dict()
en_zh_dict = load_dict(
    dict_path="data/crosslingual/dictionaries/en-zh.0-5000.txt",
)

In [4]:
# Upper bound on the number of word vectors read from each file.
nmax = 50000

# Embedding files: one source file (en) and two target files (es, zh).
src_path, tgt_path, tgt_path2 = (
    'data/wiki.en.vec',
    'data/wiki.es.vec',
    'data/wiki.zh.vec',
)

# Each call yields (embedding matrix, id -> word, word -> id).
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax=nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax=nmax)
tgt_embeddings2, tgt_id2word2, tgt_word2id2 = load_vec(tgt_path2, nmax=nmax)

In [62]:
# Language codes in the order used by `languages`; the embedding and
# dictionary file names below are derived from them.
_lang_codes = ['en', 'es', 'zh', 'ko', 'ru', 'ja', 'de', 'nl', 'fr', 'ar']

# One embedding file per language.
embeddings = ['data/wiki.' + code + '.vec' for code in _lang_codes]

# One English -> language dictionary file per language.
dictionaries = ['en-' + code + '.0-5000.txt' for code in _lang_codes]

# Display names, parallel to the two lists above.
languages = [
    'English',
    'Spanish',
    'Mandarin',
    'Korean',
    'Russian',
    'Japanese',
    'German',
    'Dutch',
    'French',
    'Arabic',
]

In [64]:
nmax = 50000  # maximum number of word embeddings to load
N_WORDS_PER_LANGUAGE = 500  # number of embedding rows kept for each language

# For every language: load its embeddings, look up the target-side words of
# the English -> language dictionary, and keep the first rows that were found.
data = dict()
for l_names, path, mydpath in zip(languages, embeddings, dictionaries):
    emb, id2word, word2id = load_vec(path, nmax)
    en_to_x_dict = load_dict("data/crosslingual/dictionaries/" + mydpath)
    # These are the *target*-language words. They get their own name so this
    # loop does not overwrite the notebook-level `src` that other cells read.
    tgt_words = en_to_x_dict["tgt"].values
    ids = multi_key_dict(tgt_words, word2id)
    data[l_names] = emb[ids, :][:N_WORDS_PER_LANGUAGE, :]

In [5]:
# Word columns: English and Spanish sides of en-es, Chinese side of en-zh.
src = en_es_dict["src"].values
tgt = en_es_dict["tgt"].values
tgt2 = en_zh_dict["tgt"].values

In [6]:
# Map each word list to row indices in the matching vocabulary; words that
# are not in the vocabulary are dropped by multi_key_dict.
src_ids, tgt_ids, tgt_ids2 = (
    multi_key_dict(words, vocab)
    for words, vocab in (
        (src, src_word2id),
        (tgt, tgt_word2id),
        (tgt2, tgt_word2id2),
    )
)

In [7]:
# Embedding rows for the dictionary words found in each vocabulary.
X = src_embeddings[src_ids, :]
Y = tgt_embeddings[tgt_ids, :]
Z = tgt_embeddings2[tgt_ids2, :]
tuple(matrix.shape for matrix in (X, Y, Z))

((11977, 300), (11374, 300), (8225, 300))

In [65]:
import numpy as np
import matplotlib.pyplot as plt
from ripser import Rips
from persim import bottleneck, sliced_wasserstein

In [66]:
# Persistence diagrams per language, computed with cosine distance.
# A fresh Rips object (homology up to dimension 2) is built on every pass.
dgrms = {}
for language in languages:
    rips = Rips(maxdim=2)
    print("Language: ", language)
    dgrms[language] = rips.fit_transform(
        data[language],
        metric="cosine",
    )

Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  English
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Spanish
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Mandarin
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Korean
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Russian
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Japanese
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  German
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  Dutch
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
Language:  French
Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbos

In [280]:
class Tree:
    """Binary tree node with a payload and optional left/right children.

    ``data`` is the node's payload; the display helpers use it as an index
    into a list of labels. ``left`` and ``right`` are ``Tree`` nodes or None.
    """

    def __init__(self, data, left=None, right=None):
        self.left = left
        self.right = right
        self.data = data

    def __str__(self):
        return "Data: %s" % self.data

    @staticmethod
    def node_list(root):
        """Return the leaf nodes (nodes with no children) below ``root``.

        A node with exactly one child is an internal node and is not
        returned; only its descendants can be.
        """
        leaves = []
        stack = [root]
        while stack:
            node = stack.pop()
            if node.left is None and node.right is None:
                leaves.append(node)
                continue
            if node.left is not None:
                stack.append(node.left)
            if node.right is not None:
                stack.append(node.right)
        return leaves

    def display(self, keys):
        """Print an ASCII drawing of the tree; ``keys[node.data]`` labels each node."""
        lines, *_ = self._display_aux(keys)
        for line in lines:
            print(line)

    def _display_aux(self, keys):
        """Returns list of strings, width, height, and horizontal coordinate of the root."""
        # No child.
        if self.right is None and self.left is None:
            line = keys[self.data]
            width = len(line)
            height = 1
            middle = width // 2
            return [line], width, height, middle

        # Only left child.
        if self.right is None:
            lines, n, p, x = self.left._display_aux(keys)
            s = keys[self.data]
            u = len(s)
            first_line = (x + 1) * ' ' + (n - x - 1) * '_' + s
            second_line = x * ' ' + '/' + (n - x - 1 + u) * ' '
            shifted_lines = [line + u * ' ' for line in lines]
            return [first_line, second_line] + shifted_lines, n + u, p + 2, n + u // 2

        # Only right child.
        if self.left is None:
            lines, n, p, x = self.right._display_aux(keys)
            s = keys[self.data]
            u = len(s)
            first_line = s + x * '_' + (n - x) * ' '
            second_line = (u + x) * ' ' + '\\' + (n - x - 1) * ' '
            shifted_lines = [u * ' ' + line for line in lines]
            return [first_line, second_line] + shifted_lines, n + u, p + 2, u // 2

        # Two children.
        left, n, p, x = self.left._display_aux(keys)
        right, m, q, y = self.right._display_aux(keys)
        s = keys[self.data]
        # One extra column so sibling subtrees never touch, even for an empty label.
        u = len(s) + 1
        # Pad the label to the full u columns; otherwise this row is one
        # character narrower than the reported width and every row built on
        # top of it is shifted.
        label = s.ljust(u, '_')
        first_line = (x + 1) * ' ' + (n - x - 1) * '_' + label + y * '_' + (m - y) * ' '
        second_line = x * ' ' + '/' + (n - x - 1 + u + y) * ' ' + '\\' + (m - y - 1) * ' '
        # Pad the shorter subtree with blank rows so both have the same height.
        if p < q:
            left += [n * ' '] * (q - p)
        elif q < p:
            right += [m * ' '] * (p - q)
        zipped_lines = zip(left, right)
        lines = [first_line, second_line] + [a + u * ' ' + b for a, b in zipped_lines]
        return lines, n + m + u, max(p, q) + 2, n + u // 2


def distance_matrix(nodes, tiny_dmatrix):
    """
    Computes distance matrix with complete linkage

    Parameters
    ----------
    nodes : list of Tree
        Current clusters. The ``data`` of every leaf below a node is used as
        a row/column index into ``tiny_dmatrix``.
    tiny_dmatrix : 2-D array
        Pairwise distances between the individual leaves.

    Returns
    -------
    numpy.ndarray
        ``len(nodes) x len(nodes)`` matrix. Entry ``[i, j]`` with ``i < j`` is
        the largest leaf-to-leaf distance between cluster ``i`` and cluster
        ``j``; the diagonal and lower triangle stay NaN.
    """
    # Collect each cluster's leaf ids once instead of once per pair.
    leaf_ids = [[leaf.data for leaf in Tree.node_list(node)] for node in nodes]
    d_matrix = np.full((len(nodes), len(nodes)), np.nan)
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            d_matrix[i, j] = max(
                tiny_dmatrix[a, b] for a in leaf_ids[i] for b in leaf_ids[j]
            )
    return d_matrix
    
def hclustering(dgrms, homology=0, dist='sw'):
    """Agglomeratively cluster persistence diagrams of one homology dimension.

    Parameters
    ----------
    dgrms : dict
        Maps a key to a sequence of diagrams; ``dgrms[key][homology]`` is the
        diagram used for that key.
    homology : int
        Index of the diagram taken from each entry.
    dist : str
        ``'sw'`` uses ``sliced_wasserstein``; any other value uses
        ``bottleneck``.

    Returns
    -------
    (Tree, numpy.ndarray)
        The root of the merge tree and the symmetric matrix of pairwise
        diagram distances (NaN on the diagonal). Leaves carry the position of
        their key in the iteration order of ``dgrms``; merged nodes carry -1.
    """
    diagrams = [dgrms[key][homology] for key in dgrms]
    n_items = len(dgrms)
    nodes = [Tree(idx) for idx in range(n_items)]

    # Pairwise distances between the individual diagrams.
    tiny_dmatrix = np.full((n_items, n_items), np.nan)
    for a in range(n_items):
        for b in range(a + 1, n_items):
            if dist == 'sw':
                pair_dist = sliced_wasserstein(diagrams[a], diagrams[b])
            else:
                pair_dist = bottleneck(diagrams[a], diagrams[b])
            tiny_dmatrix[a, b] = pair_dist
            tiny_dmatrix[b, a] = tiny_dmatrix[a, b]

    # Merge the two closest clusters until one tree remains.
    while len(nodes) > 1:
        d_matrix = distance_matrix(nodes, tiny_dmatrix)
        closest = np.nanargmin(d_matrix)
        i, j = np.unravel_index(closest, d_matrix.shape)
        print("The minimum is ", d_matrix[i, j])
        merged = Tree(-1, left=nodes[i], right=nodes[j])
        nodes = [kept for k, kept in enumerate(nodes) if k != i and k != j]
        nodes.append(merged)
    return nodes[0], tiny_dmatrix
    
def hclustering_all(dgrms, dist='sw'):
    """Agglomeratively cluster diagrams using homology dimensions 0-2 together.

    For every key the diagrams at indices 0, 1 and 2 are stacked into one
    array and compared as a single diagram.

    Parameters
    ----------
    dgrms : dict
        Maps a key to a sequence with at least three diagrams.
    dist : str
        ``'sw'`` uses ``sliced_wasserstein``; any other value uses
        ``bottleneck``.

    Returns
    -------
    (Tree, numpy.ndarray)
        The root of the merge tree and the symmetric matrix of pairwise
        distances (NaN on the diagonal).

    Before every merge the current forest is printed, one tree per cluster.
    """
    nodes = [Tree(i) for i in range(len(dgrms))]
    new_dgrms = [np.vstack((dgrms[key][0], dgrms[key][1], dgrms[key][2])) for key in dgrms]
    tiny_dmatrix = np.full((len(nodes), len(nodes)), np.nan)
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            if dist == 'sw':
                tiny_dmatrix[i, j] = sliced_wasserstein(new_dgrms[i], new_dgrms[j])
            else:
                tiny_dmatrix[i, j] = bottleneck(new_dgrms[i], new_dgrms[j])
            tiny_dmatrix[j, i] = tiny_dmatrix[i, j]
    # Labels for display: one per key, plus a trailing '' that merged nodes
    # (data == -1) pick up through negative indexing.
    langs = list(dgrms.keys())
    langs.append('')
    while len(nodes) > 1:
        for node in nodes:
            node.display(langs)
        # distance_matrix takes exactly (nodes, tiny_dmatrix).
        d_matrix = distance_matrix(nodes, tiny_dmatrix)
        i, j = np.unravel_index(np.nanargmin(d_matrix), d_matrix.shape)
        print("The minimum is ", d_matrix[i, j])
        # -1 rather than None: display() indexes the label list with `data`,
        # and None is not a valid list index.
        merged = Tree(-1, left=nodes[i], right=nodes[j])
        nodes = [nodes[k] for k in range(len(nodes)) if k not in [i, j]]
        nodes.append(merged)
    return nodes[0], tiny_dmatrix
    

In [91]:
# Cluster the languages with hclustering_all's default distance ('sw').
# The call returns the root of the merge tree and the pairwise distance matrix.
q_, tiny = hclustering_all(dgrms)

In [265]:
# Display labels: the keys of dgrms followed by an empty label. Index -1
# therefore resolves to '' for merged nodes.
langs = [*dgrms.keys(), '']

In [283]:
# Cluster on the diagrams at index 2 only, using the 'sw' distance.
# Note: this rebinds `tiny`, replacing any matrix from an earlier cell.
q1, tiny = hclustering(dgrms,homology=2,dist='sw');

The minimum is  0.006297610371318185
The minimum is  1.029602312725588
The minimum is  1.2852748297401468
The minimum is  1.5694235805241241
The minimum is  2.2696271079093346
The minimum is  3.0640762510677306
The minimum is  4.8399114986486085
The minimum is  7.33214264187133
The minimum is  10.27542890036365


In [78]:
# Save one persistence-diagram figure per language as pds/<language>.png.
# NOTE(review): `rips` is not created in this cell; it is whatever instance an
# earlier cell left bound (the diagram-computation loop assigns it) — confirm
# that cell has run first.
# NOTE(review): presumably the pds/ directory must already exist — verify.
for lang in languages:
    plt.figure()
    rips.plot(dgrms[lang])
    plt.title(lang)
    # Same axis limits for every language.
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.savefig('pds/' + lang + '.png', dpi=600, bbox_inches = 'tight', pad_inches = 0.2)
    # Close the figure before starting the next one.
    plt.close()

In [284]:
# Print the merge tree as ASCII art, labelling each node with langs[node.data].
q1.display(langs)

    ________                                                              
   /         \                                                             
Korean     ___________________                                            
          /                    \                                           
       Arabic          _______________                                    
                      /                \                                   
                   _______         ________________________              
                  /        \       /                         \             
              Mandarin Japanese Russian         ____________________      
                                               /                     \     
                                            _________            _____   
                                           /          \          /      \  
                                        Spanish     _____    English Dutch
                     

In [323]:
def make_latex_tree(tree,langs):
    """Print ``tree`` wrapped in a LaTeX ``forest`` environment.

    The body is produced by ``_make_latex_tree``; ``langs[node.data]`` is the
    text printed for each leaf. Nothing is returned and no trailing newline
    is written.
    """
    backslash = chr(92)
    opening = backslash + 'begin{forest}\n['
    closing = ']\n' + backslash + 'end{forest}'
    print(opening, end='')
    _make_latex_tree(tree, langs)
    print(closing, end='')

def _make_latex_tree(tree, langs):
    if not tree.left and not tree.right:
        print(langs[tree.data], end='')
    elif tree.data:
        print('|[', end='')
        _make_latex_tree(tree.left,langs)
        print(']',end='')
        print('[',end='')
        _make_latex_tree(tree.right,langs)
        print(']',end='')    

In [324]:
# Emit the LaTeX `forest` source for the merge tree q1.
make_latex_tree(q1,langs)

\begin{forest}
[|[Korean][|[Arabic][|[|[Mandarin][Japanese]][|[Russian][|[|[Spanish][|[German][French]]][|[English][Dutch]]]]]]]
\end{forest}

In [320]:
# Scratch check: `not None` evaluates to True.
not None

True