## Class formation

In [1]:
import numpy as np

class FastVector1:
    """
    Minimal wrapper for fastText word embeddings stored in the ``.vec`` text
    format.

    The file must start with a header line ``"<n_words> <n_dim>"`` followed by
    one line per word: the token and then its vector components, separated by
    single spaces.
    ```
    Usage:
        $ model = FastVector1(vector_file='/path/to/wiki.en.vec')
        $ 'apple' in model
        > True
        $ model['apple'].shape
        > (300,)
    ```
    """

    def __init__(self, vector_file='', transform=None):
        """
        Read in word vectors in fasttext format.

        :param vector_file: path to the ``.vec`` text file to load
        :param transform: optional transformation passed to apply_transform()
            once the vectors are loaded
        """
        # token -> row index into self.embed
        self.word2id = {}

        # Captures word order, for export() and translate methods
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        with open(vector_file, 'r') as f:
            # Progress marker kept from the original notebook output.
            print('1')
            # Header line: "<n_words> <n_dim>"
            (self.n_words, self.n_dim) = \
                (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                # numpy converts the string components to floats on assignment
                self.embed[i] = elems[1:self.n_dim + 1]
                self.id2word.append(elems[0])

        # Used in translate_inverted_softmax()
        self.softmax_denominators = None

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Right-multiply the embedding matrix by `transform`
        (``embed = embed . transform``).

        :param transform: a 2-D numpy array with ``n_dim`` rows, or the path
            of a text file readable by ``np.loadtxt`` holding such an array
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)

    def apply_cop(self, matrix, i):
        """Overwrite row `i` of the embedding matrix with the values of `matrix`."""
        self.embed[i] = matrix[:]

    def export(self, outpath):
        """
        Transforming a large matrix of WordVectors is expensive.
        This method lets you write the transformed matrix back to a file for future use
        :param outpath: The path to the output file to be written
        """
        # `with` guarantees the file is closed even if formatting a row fails.
        with open(outpath, "w") as fout:
            # Header takes the guesswork out of loading by recording how many lines, vector dims
            fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
            for token in self.id2word:
                vector_components = ["%.6f" % number for number in self[token]]
                vector_as_string = " ".join(vector_components)

                out_line = token + " " + vector_as_string + "\n"
                fout.write(out_line)

    # NOTE: this must be a plain instance method. Decorating it with
    # @classmethod binds the class instead of the instance, so
    # `self.word2id` is looked up on the class and `in` raises AttributeError.
    def __contains__(self, key):
        """Return True when `key` is a known token."""
        return key in self.word2id

    def __getitem__(self, key):
        """Return the embedding row for token `key` (KeyError if unknown)."""
        return self.embed[self.word2id[key]]

In [2]:
class FastVector2:
    """
    Vocabulary wrapper with *randomly initialised* embeddings.

    Only the header (``"<n_words> <n_dim>"``) and the token at the start of
    each line are read from the ``.vec`` file; the vector components in the
    file are ignored and every row is filled with uniform random values in
    [0, 1). Rows can then be overwritten with apply_cop().
    ```
    Usage:
        $ model = FastVector2(vector_file='/path/to/model.vec')
        $ 'apple' in model
        > True
        $ model['apple'].shape
        > (300,)
    ```
    """

    def __init__(self, vector_file='', transform=None):
        """
        Read the vocabulary of a fasttext-format file and assign random vectors.

        :param vector_file: path to the ``.vec`` text file supplying the
            header and the word list
        :param transform: optional transformation passed to apply_transform()
            once the random vectors are in place
        """
        # token -> row index into self.embed
        self.word2id = {}

        # Captures word order, for export() and translate methods
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        with open(vector_file, 'r') as f:
            # Progress marker kept from the original notebook output.
            print('2')
            # Header line: "<n_words> <n_dim>"
            (self.n_words, self.n_dim) = \
                (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                # Random initialisation sized from the header rather than a
                # hard-coded 300, so files of any dimensionality load.
                self.embed[i] = np.random.rand(self.n_dim)
                self.id2word.append(elems[0])

        # Used in translate_inverted_softmax()
        self.softmax_denominators = None

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Right-multiply the embedding matrix by `transform`
        (``embed = embed . transform``).

        :param transform: a 2-D numpy array with ``n_dim`` rows, or the path
            of a text file readable by ``np.loadtxt`` holding such an array
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)

    def apply_cop(self, matrix, i):
        """Overwrite row `i` of the embedding matrix with the values of `matrix`."""
        self.embed[i] = matrix[:]

    def export(self, outpath):
        """
        Transforming a large matrix of WordVectors is expensive.
        This method lets you write the transformed matrix back to a file for future use
        :param outpath: The path to the output file to be written
        """
        # `with` guarantees the file is closed even if formatting a row fails.
        with open(outpath, "w") as fout:
            # Header takes the guesswork out of loading by recording how many lines, vector dims
            fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
            for token in self.id2word:
                vector_components = ["%.6f" % number for number in self[token]]
                vector_as_string = " ".join(vector_components)

                out_line = token + " " + vector_as_string + "\n"
                fout.write(out_line)

    # NOTE: this must be a plain instance method. Decorating it with
    # @classmethod binds the class instead of the instance, so
    # `self.word2id` is looked up on the class and `in` raises AttributeError.
    def __contains__(self, key):
        """Return True when `key` is a known token."""
        return key in self.word2id

    def __getitem__(self, key):
        """Return the embedding row for token `key` (KeyError if unknown)."""
        return self.embed[self.word2id[key]]

In [3]:
def cosine_similarity(vec_a, vec_b):
    """Return the cosine of the angle between vec_a and vec_b.

    This is the dot product of the two vectors divided by the product of
    their Euclidean norms.
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / norm_product

In [4]:
import numpy as np

## convert to matrices

In [5]:
def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary,
                           verbose=True):
    """
    Build aligned source/target matrices from a bilingual word list.

    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of
    translation pair tuples [(source_word, target_word), ...].
    Pairs where either word is missing from its dictionary are skipped.

    :param verbose: when True (the default), print every matched pair and
        then the number of matched pairs
    :return: (source_matrix, target_matrix, target_ids) as numpy arrays; row k
        of each matrix holds the vectors of the k-th matched pair and
        target_ids[k] is that target word's row index in
        target_dictionary.word2id
    """
    source_matrix = []
    target_matrix = []
    target_ids = []
    for (source, target) in bilingual_dictionary:
        if source in source_dictionary.word2id and target in target_dictionary.word2id:
            if verbose:
                # %-formatting prints identically on Python 2 and Python 3
                print('%s %s' % (source, target))
            target_ids.append(target_dictionary.word2id[target])
            source_matrix.append(source_dictionary[source])
            target_matrix.append(target_dictionary[target])

    if verbose:
        print(len(source_matrix))
    # return training matrices
    return np.array(source_matrix), np.array(target_matrix), np.array(target_ids)

## import the word embeddings

In [6]:
# English vectors are read from file; FastVector2 only takes the word list
# from its file and starts every word at a random vector.
en_dictionary = FastVector1(
    vector_file='/home/apatra/fastText/fastText_multilingual-master/eng.vec')
mi_dictionary = FastVector2(
    vector_file='/home/apatra/fastText/fastText_multilingual-master/model.vec')

# Sanity check: similarity between one word from each vocabulary.
en_vector, mi_vector = en_dictionary["one"], mi_dictionary["newt"]
print(cosine_similarity(en_vector, mi_vector))

reading word vectors from /home/apatra/fastText/fastText_multilingual-master/eng.vec
1
reading word vectors from /home/apatra/fastText/fastText_multilingual-master/model.vec
2
0.07516805902007641


In [7]:
#mi_dictionary = FastVector(vector_file='/home/apatra/fastText/fastText_multilingual-master/mic.vec')

In [8]:
#print en_dictionary.word2id.keys()
#print mi_dictionary.word2id.keys()

In [9]:
# Vocabulary of each language as a set (iterating a dict yields its keys).
mi_words = set(mi_dictionary.word2id)
en_words = set(en_dictionary.word2id)

## import the bilingual dictionary

In [10]:
import codecs
# Parse the English -> Mi'kmaq word list into (english, micmac) tuples.
# Each non-blank line must contain exactly one ', ' separator; anything else
# still raises ValueError so a malformed file is noticed.
bilingual_dictionary = []
with codecs.open('/home/apatra/fastText/fastText_multilingual-master/eng-mic', 'r', 'utf-8') as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        eng, mic = line.split(', ')
        # English side: drop surrounding double quotes only
        eng = eng.strip('"')
        # Mi'kmaq side: drop the line break and every double quote
        mic = mic.replace('\n', '').replace('"', '')
        bilingual_dictionary.append((eng, mic))
print(bilingual_dictionary)

[(u'txt', u'txt'), (u'all', u'ms\u02bct'), (u'choose', u'megnatl'), (u'choose', u'megng'), (u'German', u'alman'), (u'good', u'amiglu\u02bcsit'), (u'good', u'amiglu\u02bclg'), (u'good', u'gelu\u02bclg'), (u'good', u'gelu\u02bcsit'), (u'goodbye', u'atiu'), (u'I', u'nin'), (u'May', u'Sqoljuigu\u2019s'), (u'May', u'Sqoljuigu\u02bcs'), (u'Micmac', u'Mi\u2019gmaq'), (u'Micmac', u'Mi\u2019gmawi\u2019simg'), (u'Micmac', u'Mi\u02bcgmaq'), (u'Micmac', u'Mi\u02bcgmaw'), (u'Micmac', u'm\xedkmaq'), (u'Micmac', u'Mi\u02bckmaq'), (u'Micmac', u'm\xedgmaq'), (u'Mi\u2019kmaq', u'M\xedkmaw\xedsimk'), (u'Mi\u2019kmaq', u'Mi\u02bcgmaq'), (u'Mi\u2019kmaq', u'm\xedkmaq'), (u'Mi\u2019kmaq', u'Mi\u02bckmaq'), (u'Mi\u2019kmaq', u'm\xedgmaq'), (u'Mohawk', u'gwatej'), (u'Newfoundland', u'Taqamkuk'), (u'search', u'gwilg'), (u'search', u'gwiluasit'), (u'stop', u'enqa\u02bclatl'), (u'stop', u'enqa\u02bcs\u02bcg'), (u'translation', u'nesutmalsewu\u02bcti'), (u'elderly woman', u'gisigui\u02bcsgw'), (u'is that so', u't

## convert to matrix form

In [11]:
# form the training matrices: one row per dictionary pair found in both
# vocabularies; ti holds the target word's row index in mi_dictionary
source_matrix, target_matrix, ti = make_training_matrices(
    en_dictionary, mi_dictionary, bilingual_dictionary)
print('%d %d' % (len(source_matrix), len(target_matrix)))
print(ti)
# Spot-check a single component of one aligned pair; guarded so a small
# dictionary does not abort the cell with an IndexError.
if len(source_matrix) > 60:
    print('%s %s' % (source_matrix[60][9], target_matrix[60][9]))
# learn and apply the transformation (disabled in this notebook)
#transform = learn_transformation(source_matrix, target_matrix)
#print type(transform)
#print transform[299]
#en_dictionary.apply_transform(transform)

choose megnatl
I nin
aboard teppit
aboriginal Lnu
abstruse temig
adequate tepiet
adequate tepiaq
again app
alive mimajit
allow ignmuatl
almost suel
also elg
also jel
always apjiw
and jel
and aq
another igtig
arrive pegising
aside gmetug
asleep nepat
at eteg
attached naspit
attached nasteg
authority alsusuti
aware gejiatl
ay amuj
aye amuj
battle matntimg
battle matnaggewaqan
bear muin
beaver kopit
because muta
before tmg
before gesgmnaq
beside gmetug
beside anapiw
blacksmith klaptan
blaze gnugwaqan
bleed maltewiaq
bless elapatoq
boss alsusit
boss assusit
both gitg
build eltoq
but gatu
Canada Ganata
canoe kwitn
caribou qalipu
chase getanatl
city gjigan
cliff mtasoq
cloud alug
cod peju
completely lpa
confess agnutg
cry etltemit
cry atgitemit
deep temig
deer lentuk
detest masgelmatl
direct assusit
direct alsusit
discuss agnutmajig
discussion agnutmaqan
disobey elistuatl
drink esamqwat
eagle kitpu
earn eltoq
eel katew
eight ukmuljin
ended gaqiaq
endure saputaqatg
enjoys gesatg
enmeshed nast

## Finding the indices of English words that share the same Mi'kmaq translation

In [12]:
from collections import defaultdict

def list_duplicates(seq, x):
    """Yield the list of positions in `seq` whose element equals `x`.

    The returned generator produces at most one list (all indices at which
    `x` occurs, in ascending order) and nothing if `x` is absent.
    """
    positions = {}
    for index, value in enumerate(seq):
        positions.setdefault(value, []).append(index)
    return (indices for value, indices in positions.items()
            if value == x)

# Disabled scratch code kept as a bare string literal: nothing inside it is
# executed, the cell merely echoes the string. It is a single-index (ind=6)
# trial of the averaging loop used in the seeding cell.
'''
ind=6
p=list_duplicates(ti,ti[ind])
#for l in p:
 #   print l
j=np.zeros(300)
print source_matrix[ind]
print source_matrix[191]
for l in p:
    for x in l:
        j+=source_matrix[x]
            
    target_matrix[ind]=j[:]/len(l)
print target_matrix[ind]
'''

'\nind=6\np=list_duplicates(ti,ti[ind])\n#for l in p:\n #   print l\nj=np.zeros(300)\nprint source_matrix[ind]\nprint source_matrix[191]\nfor l in p:\n    for x in l:\n        j+=source_matrix[x]\n            \n    target_matrix[ind]=j[:]/len(l)\nprint target_matrix[ind]\n'

## Seeding each Mi'kmaq word embedding (WE) with the average of its English translations' embeddings

In [13]:
import copy
# For every matched pair r, replace the target-language vector with the mean
# of the source vectors of ALL pairs that share the same target word, then
# write that vector into mi_dictionary at the target word's row.
n_dim = source_matrix.shape[1]  # vector size taken from the data, not hard-coded
for r in range(0, len(ti)):
    # list_duplicates yields the list of pair indices whose target id == ti[r]
    p = list_duplicates(ti, ti[r])
    j = np.zeros(n_dim)
    for l in p:
        for x in l:
            j += source_matrix[x]

        target_matrix[r] = j / len(l)
    mi_dictionary.apply_cop(target_matrix[r], ti[r])

## Result-check, of translation with different meanings

In [14]:
# Show the English vector for "yes" beside the seeded vector for "amuj".
# "amuj" is matched with more than one English word in the pairs printed when
# the training matrices were built (e.g. "ay", "aye"), so its seeded vector is
# an average over several English vectors.
# NOTE(review): that "yes" is one of those words is assumed - confirm.
print (en_dictionary["yes"],mi_dictionary["amuj"])

(array([-5.000e-04,  6.570e-02, -6.570e-02, -1.426e-01,  1.085e-01,
        4.980e-02,  5.360e-02, -1.374e-01, -1.848e-01, -1.028e-01,
        1.840e-01,  3.610e-02,  4.927e-01, -8.400e-02,  7.250e-02,
       -9.890e-02, -2.800e-03, -6.300e-02,  2.920e-01,  2.470e-01,
       -3.940e-02,  1.026e-01, -1.003e-01, -8.500e-02, -8.740e-02,
        1.700e-01, -2.310e-02,  2.840e-01,  3.510e-02,  5.830e-02,
       -1.588e-01, -1.417e-01, -2.671e-01, -6.440e-02, -1.969e-01,
       -3.116e-01,  7.150e-02, -2.150e-02, -6.660e-02,  5.900e-02,
       -1.899e-01,  4.412e-01,  1.159e-01,  1.300e-02,  9.660e-02,
        9.100e-03,  5.060e-02,  2.660e-02,  1.921e-01, -2.846e-01,
       -2.070e-01,  3.470e-02,  2.001e-01, -3.970e-02,  4.500e-02,
       -1.445e-01, -1.389e-01,  2.576e-01, -2.511e-01,  1.036e-01,
       -3.189e-01,  2.503e-01, -1.541e-01,  9.590e-02,  1.269e-01,
        2.391e-01,  2.280e-01, -1.160e-02, -1.650e-01,  5.700e-02,
       -1.296e-01, -5.770e-02, -1.157e-01,  1.487e-01,  1.158

## Result-check, of translation without different meanings

In [15]:
# Show the English vector for "plan" beside the seeded vector for "ilsuteget".
# NOTE(review): per the section heading this is presumably a one-to-one pair,
# in which case both vectors should be identical - confirm.
print (en_dictionary["plan"],mi_dictionary["ilsuteget"])

(array([ 1.175e-01, -5.000e-03, -2.030e-02, -5.660e-02,  7.460e-02,
       -1.253e-01,  4.880e-02,  2.400e-03, -1.480e-02,  1.046e-01,
       -4.005e-01,  1.199e-01, -3.690e-02,  9.020e-02,  8.930e-02,
       -2.169e-01, -2.619e-01, -2.430e-02, -2.895e-01,  2.109e-01,
        8.360e-02,  8.000e-04,  1.871e-01,  5.400e-02,  5.110e-01,
       -7.770e-02,  5.460e-02, -3.515e-01, -1.347e-01,  3.780e-01,
        3.820e-02, -3.720e-02,  1.185e-01, -1.662e-01, -1.990e-02,
       -3.114e-01,  3.330e-02, -1.310e-02, -1.561e-01,  2.550e-02,
        5.960e-02,  1.002e-01,  2.079e-01, -3.590e-02,  1.754e-01,
       -2.792e-01,  2.540e-01, -3.143e-01, -3.320e-02,  8.550e-02,
       -2.467e-01,  1.036e-01,  1.228e-01,  4.110e-02,  4.534e-01,
       -5.380e-02,  2.150e-02, -3.813e-01, -2.102e-01, -2.778e-01,
       -1.615e-01, -9.380e-02,  6.470e-02, -1.746e-01, -2.228e-01,
        2.964e-01,  2.564e-01,  2.549e-01,  1.489e-01, -2.610e-02,
       -4.580e-02,  1.630e-01,  1.564e-01,  1.847e-01, -1.215

## Exporting the vectors to file format

In [16]:
# Write the seeded embeddings to disk in .vec text format: a
# "<n_words> <n_dim>" header followed by one "<token> <components...>" line
# per word.
mi_dictionary.export('/home/apatra/Desktop/work/lstm/data/micmaq1.vec')

## rough work

In [None]:
# Scratch check: target-word row index of the 24th matched pair.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(ti[23])
   

In [None]:
from copy import deepcopy

def apply_copy(source_matrix, target_matrix, i):
    """Copy `source_matrix` into row `i` of the global mi_dictionary.

    The `target_matrix` argument is accepted but its value is never used:
    only a deep copy of `source_matrix` is written via apply_cop().
    """
    snapshot = deepcopy(source_matrix)
    mi_dictionary.apply_cop(snapshot, i)

In [None]:
from collections import defaultdict

# Private marker meaning "no value requested"; lets None be a searchable value.
_ALL_DUPLICATES = object()


def list_duplicates(seq, x=_ALL_DUPLICATES):
    """
    Report where repeated elements of `seq` occur.

    This definition replaces the earlier two-argument list_duplicates(seq, x)
    in the kernel, so it accepts both call shapes to keep earlier cells
    re-runnable:

    * list_duplicates(seq) -> generator of (value, [indices]) for every
      value that occurs more than once;
    * list_duplicates(seq, x) -> generator yielding the list of indices at
      which `x` occurs (nothing if `x` is absent).
    """
    tally = defaultdict(list)
    for i, item in enumerate(seq):
        tally[item].append(i)

    if x is not _ALL_DUPLICATES:
        return (locs for key, locs in tally.items()
                if key == x)
    return ((key, locs) for key, locs in tally.items()
            if len(locs) > 1)


# Print every target id that occurs more than once, with the pair indices
# at which it occurs. print(dup) behaves the same on Python 2 and 3.
for dup in sorted(list_duplicates(ti)):
    print(dup)

In [None]:
# Compare the vector stored for the same word in two exported files.
# FastVector1 is used because it reads the vector values from the file;
# `FastVector` is not defined anywhere in this notebook and FastVector2
# would discard the stored values in favour of random ones.
mi_dict = FastVector1(vector_file='/home/apatra/fastText/fastText_multilingual-master/mic.vec')
mi_dict1 = FastVector1(vector_file='/home/apatra/fastText/fastText_multilingual-master/mic1.vec')

print(mi_dict["whitne"])
print(mi_dict1["whitne"])