In [1]:
import numpy as np

class FastVector1:
    """
    Minimal wrapper for word embeddings stored in the fastText text format.

    The file is expected to start with a header line "<n_words> <n_dim>",
    followed by one line per word: the token, then its vector components,
    separated by single spaces.
    ```
    Usage:
        $ model = FastVector1(vector_file='/path/to/wiki.en.vec')
        $ 'apple' in model
        > True
        $ model['apple'].shape
        > (300,)
    ```
    """

    def __init__(self, vector_file='', transform=None):
        """
        Read word vectors in fastText text format.

        :param vector_file: path to the .vec file to load.
        :param transform: optional matrix (or path to a text file holding
            one) that is applied to the embeddings right after loading.
        :raises ValueError: if the header or a vector row cannot be parsed.
        """
        self.word2id = {}

        # Captures word order, for export() and translate methods
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        with open(vector_file, 'r') as f:
            print('1')
            # split() without an argument tolerates trailing blanks or "\r"
            # on the header line.
            self.n_words, self.n_dim = (int(x) for x in f.readline().split())
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                # The slice drops the empty string left by a trailing space;
                # explicit float() avoids relying on implicit str->float casts.
                self.embed[i] = [float(v) for v in elems[1:self.n_dim + 1]]
                self.id2word.append(elems[0])

        # Used in translate_inverted_softmax()
        self.softmax_denominators = None

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Right-multiply the embedding matrix by a linear map.

        :param transform: an (n_dim, k) array-like, or the path to a text
            file that numpy.loadtxt can read into such a matrix.
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)

    def apply_cop(self, matrix, i):
        """Overwrite row ``i`` of the embedding matrix with a copy of ``matrix``."""
        self.embed[i] = matrix[:]

    def export(self, outpath):
        """
        Transforming a large matrix of WordVectors is expensive.
        This method lets you write the transformed matrix back to a file for future use

        :param outpath: path of the output file to be written.
        """
        # The context manager closes the file even if a write fails part-way.
        with open(outpath, "w") as fout:
            # Header takes the guesswork out of loading by recording how many lines, vector dims
            fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
            for token in self.id2word:
                vector_components = ["%.6f" % number for number in self[token]]
                fout.write(token + " " + " ".join(vector_components) + "\n")

    # NOTE: this must be a plain instance method. Decorating it with
    # @classmethod made `word in model` look up word2id on the class.
    def __contains__(self, key):
        """Return True when ``key`` is a known token."""
        return key in self.word2id

    def __getitem__(self, key):
        """Return the embedding row for ``key``; raises KeyError if unknown."""
        return self.embed[self.word2id[key]]

In [2]:
class FastVector2:
    """
    Vocabulary wrapper that reads the tokens of a fastText-format file but
    fills every embedding row with random values instead of the stored ones.

    Only the header ("<n_words> <n_dim>") and the first field of each line
    (the token) are taken from the file; the numeric columns are ignored.
    Rows can later be overwritten one at a time with apply_cop().
    ```
    Usage:
        $ model = FastVector2(vector_file='/path/to/model.vec')
        $ 'apple' in model
        > True
        $ model['apple'].shape
        > (300,)
    ```
    """

    def __init__(self, vector_file='', transform=None, seed=None):
        """
        Read the vocabulary and draw a random vector for every token.

        :param vector_file: path to the .vec file providing header and tokens.
        :param transform: optional matrix (or path to a text file holding
            one) that is applied to the embeddings right after loading.
        :param seed: optional integer; when given, the random rows come from
            a private numpy RandomState so the result is reproducible. When
            None the global numpy random state is used, as before.
        """
        self.word2id = {}

        # Captures word order, for export() and translate methods
        self.id2word = []

        rng = np.random if seed is None else np.random.RandomState(seed)

        print('reading word vectors from %s' % vector_file)
        with open(vector_file, 'r') as f:
            print('2')
            # split() without an argument tolerates trailing blanks or "\r"
            # on the header line.
            self.n_words, self.n_dim = (int(x) for x in f.readline().split())
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                # Uniform [0, 1) placeholder sized from the header; the
                # original hard-coded 300 and broke for any other n_dim.
                self.embed[i] = rng.rand(self.n_dim)
                self.id2word.append(elems[0])

        # Used in translate_inverted_softmax()
        self.softmax_denominators = None

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Right-multiply the embedding matrix by a linear map.

        :param transform: an (n_dim, k) array-like, or the path to a text
            file that numpy.loadtxt can read into such a matrix.
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)

    def apply_cop(self, matrix, i):
        """Overwrite row ``i`` of the embedding matrix with a copy of ``matrix``."""
        self.embed[i] = matrix[:]

    def export(self, outpath):
        """
        Transforming a large matrix of WordVectors is expensive.
        This method lets you write the transformed matrix back to a file for future use

        :param outpath: path of the output file to be written.
        """
        # The context manager closes the file even if a write fails part-way.
        with open(outpath, "w") as fout:
            # Header takes the guesswork out of loading by recording how many lines, vector dims
            fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
            for token in self.id2word:
                vector_components = ["%.6f" % number for number in self[token]]
                fout.write(token + " " + " ".join(vector_components) + "\n")

    # NOTE: this must be a plain instance method. Decorating it with
    # @classmethod made `word in model` look up word2id on the class.
    def __contains__(self, key):
        """Return True when ``key`` is a known token."""
        return key in self.word2id

    def __getitem__(self, key):
        """Return the embedding row for ``key``; raises KeyError if unknown."""
        return self.embed[self.word2id[key]]

In [7]:
import numpy as np
# Counter global; it is assigned again in the training-matrix cell further down.
ct1=0

In [3]:
def cosine_similarity(vec_a, vec_b):
    """Return the cosine of the angle between ``vec_a`` and ``vec_b``.

    The dot product is divided by the product of the two Euclidean norms;
    no special handling is applied when either norm is zero.
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / norm_product


In [4]:

def levenshteinDistance(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    Uses the two-row dynamic-programming formulation: only the previous row
    of the edit matrix is kept while the longer sequence is scanned.
    """
    # Keep the shorter sequence on the column axis so rows stay small.
    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)

    previous_row = list(range(len(shorter) + 1))
    for row_number, long_char in enumerate(longer, 1):
        current_row = [row_number]
        for col, short_char in enumerate(shorter):
            if short_char == long_char:
                # Characters agree: carry the diagonal value over unchanged.
                cost = previous_row[col]
            else:
                # Cheapest of substitution, deletion and insertion, plus one.
                cost = 1 + min(previous_row[col],
                               previous_row[col + 1],
                               current_row[-1])
            current_row.append(cost)
        previous_row = current_row
    return previous_row[-1]

In [5]:
def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary,
                           verbose=True):
    """
    Build aligned source/target matrices from a bilingual word list.

    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of
    translation pair tuples [(source_word, target_word), ...].

    For every pair whose source word is known:
      * if the target word is known too, the pair is used as is;
      * otherwise every target-vocabulary word at Levenshtein distance
        exactly 1 from the target word is used in its place (possibly
        several, possibly none).

    :param verbose: when True (the default) each accepted pair and the
        final pair count are printed, as the function always did.
    :return: (source_matrix, target_matrix, ti) as numpy arrays, where
        ti[k] is the target-dictionary row id of the k-th pair.
    """
    source_matrix = []
    target_matrix = []
    ti = []
    count = 0
    for (source, target) in bilingual_dictionary:
        if source not in source_dictionary.word2id:
            continue

        if target in target_dictionary.word2id:
            count += 1
            if verbose:
                # Single pre-formatted string: prints identically under
                # Python 2 and Python 3.
                print('%s %s' % (source, target))
            ti.append(target_dictionary.word2id[target])
            source_matrix.append(source_dictionary[source])
            target_matrix.append(target_dictionary[target])
            continue

        # Target spelling is unknown: fall back to near-miss spellings.
        target_len = len(target)
        for q in target_dictionary.word2id:
            # Edit distance is never smaller than the length difference, so
            # these candidates cannot be at distance 1; skip the costly DP.
            if abs(len(q) - target_len) > 1:
                continue
            if levenshteinDistance(target, q) == 1:
                count += 1
                if verbose:
                    print('%s %s %s' % (source, target, q))
                ti.append(target_dictionary.word2id[q])
                source_matrix.append(source_dictionary[source])
                target_matrix.append(target_dictionary[q])

    if verbose:
        print(count)
    return np.array(source_matrix), np.array(target_matrix), np.array(ti)

In [9]:
# Source side: FastVector1 loads the vector values stored in eng.vec.
en_dictionary = FastVector1(vector_file='/home/apatra/fastText/fastText_multilingual-master/eng.vec')
# Target side: FastVector2 takes only the vocabulary from model.vec and
# fills every row with random numbers (the stored values are ignored).
mi_dictionary = FastVector2(vector_file='/home/apatra/fastText/fastText_multilingual-master/model.vec')



reading word vectors from /home/apatra/fastText/fastText_multilingual-master/eng.vec
1
reading word vectors from /home/apatra/fastText/fastText_multilingual-master/model.vec
2


In [10]:
# Sanity check of the lookup and similarity helpers. mi_dictionary rows are
# random at this point (FastVector2), so the printed value carries no
# linguistic meaning yet.
en_vector = en_dictionary["one"]
mi_vector = mi_dictionary["newt"]
print(cosine_similarity(en_vector, mi_vector))

0.053885064631976426


In [11]:
# Vocabulary of each side as a set of tokens.
# NOTE(review): neither name is read by any other cell visible in this notebook.
mi_words = set(mi_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())

In [17]:
import codecs
# Parse the bilingual word list: one `"english", "target"` pair per line,
# fields separated by a comma followed by a space.
bilingual_dictionary = []
with codecs.open('/home/apatra/fastText/fastText_multilingual-master/eng-mic','r','utf-8') as f:
    for line in f:
        # A blank line (e.g. a trailing newline at end of file) has no
        # separator and would otherwise crash the tuple unpacking below.
        if not line.strip():
            continue
        # Any other line must contain exactly one ', ' separator; a
        # malformed line still raises ValueError, as before.
        eng, mic = line.split(', ')
        eng = eng.strip('\"')
        # Drop the line break and every double quote from the target field.
        mic = mic.replace('\n', '').replace('"', '')
        bilingual_dictionary.append((eng, mic))

In [13]:
# Form the training matrices: row k of source_matrix and target_matrix
# belong to the same translation pair, and ti[k] is the target-dictionary
# row id of that pair.
source_matrix, target_matrix, ti = make_training_matrices(
    en_dictionary, mi_dictionary, bilingual_dictionary)

# Number of aligned pairs. Resetting this to 0 here (as the cell used to)
# leaves a later `range(0, ct1)` loop with nothing to do on a fresh kernel.
ct1 = len(ti)

# print() with one pre-formatted string gives the same output under
# Python 2 and Python 3.
print('%d %d' % (len(source_matrix), len(target_matrix)))
print(ti)

# Spot-check a single component of one pair; guarded so a short word list
# does not raise IndexError.
sample_row, sample_dim = 60, 9
if len(source_matrix) > sample_row:
    print('%s %s' % (source_matrix[sample_row][sample_dim],
                     target_matrix[sample_row][sample_dim]))

# The learn_transformation / apply_transform step stays disabled here, as
# in the original cell.

  from ipykernel import kernelapp as app


txt txt tet
all msʼt mset
all msʼt msit
all msʼt ms't
choose megnatl
German alman klman
good geluʼlg gelu'lg
good geluʼsit gelusit
goodbye atiu atu
I nin
Micmac míkmaq mikmaq
Micmac Miʼkmaq Mi'kmaq
Micmac Miʼkmaq Mikkmaq
Micmac Miʼkmaq Mikmaq
abandon naqtʼg naqt'g
aboard tepteg telteg
aboard teppit
aboriginal Lnu
abstruse temig
ache gesnugwaj gesnugwat
ache gesnugwaj gesnugwa
ache gesnugwaj gsnugwaj
add mawgitg mawgi'g
adequate tepiet
adequate tepiaq
affect mesiatl mesimatl
again app
alive mimajit
alive mimajig mimaji
alive mimajig mimajit
alive mimajig mimajis
allow ignmuatl
almost suel
also ngutei nguti
also elg
also jel
always apjiw
and jel
and aq
angel ansaleʼwit ansale'wit
animal iku pku
animal iku isku
animal iku ika
another igtig
answer asitematl asitemat
anyway meʼ me'
anyway meʼ men
anyway meʼ mej
anyway meʼ me
arrive pegising
arrive igaʼq iga'q
aside gmetug
ask etamatl etamat
asleep nepat
aspen miti mitji
aspen miti mita
assist apoqonmuatl apoqonmuate
assistance apoqonmati ap

know nenuatl nenuat
laborer lugowinu
lamb jijgluewjiʼj jijgluewji'j
land maqamigew
later gneʼg gneg
law tplutaqan
leach squ su
leaf nipi napi
leaf nipi nini
leaf nipi nipk
leaf nipi nipe
leaf nipi nip
leak espeg espe'g
leak espeg espe'
leak espet espe'
lean-to anapig anapiw
lean-to anapig anapi
learning ginaʼmasuti gina'masuti
leave majaʼsit maja'sit
ledge mtasoq
leftover esgwiet
leftover esgwiaq
lesson ginaʼmasuti gina'masuti
lie egsuoʼqon egsuo'qon
lie egsuet egsue
lie egsuet egsueg
life mimajuaqan
like gesalatl
like gesatg
lip nsi nsis
lip nsi lsi
lip nsi isi
lip nsi qsi
lip nsi nui
lip nsi ni
lip nsi ns
lip nsi msi
lip nsi si
lip nsi nki
lip nsi ksi
liver usgun usgupn
living mimajuaqan
loath masgeltʼg masgelt'g
loath masgelmatl
located eteg
located epit
loon gwimu Kwimu
lose entoq eltoq
loud gesigawweg gesigawwe
machine mulin
made gisiatl kisiatl
make eltoq
make eliatl eliaql
make eliatl ejiatl
make eliatl elietl
make etlitoq etlintoq
making etliatl etliaql
man jiʼnm ji'nm
man jine

us ginu
very lnim ltim
view angamatl
village gjiganjiʼj gjiganji'j
visit emittugwet mittugwet
war matntimg
waterfowl sisip lisip
waterfowl sisip sisipk
waterfowl sisip sisi
wave tgu tu
we ginu
weak menaqanaq menaqana
weak menaqanat menaqana
wear nasgʼg nasg'g
week aqantieʼuti aqantie'uti
weep atgitemit
weigh tetpaqq tetpaqi
what goqwei
when taʼn ta'n
when taʼn taqn
when taʼn tan
where taʼn ta'n
where taʼn taqn
where taʼn tan
where tami
while geʼs ge's
whisper gimewistoq Gimewistoq
window tuopʼti tuopiti
wine moqopaʼq moqopa'q
winter gesig gesg
wolf paqtesm
wolf paqtɨsm paqtesm
wolf paqtɨsm paqtism
woman ebit epit
woman e'pit
woman eʼpit e'pit
woman eʼpit epit
word glusuaqan
work elugwet
work lugowaqan
work elugweg elugwe
work elugweg elugwet
work lgowaqan lugowaqan
work lgowaqan 'lgowaqan
worker lugowinu
write ewiʼgiget ewi'giget
yard igaʼtaqan iga'taqan
yard lapeʼlis lapelis
yeah eʼe eke
yeah eʼe ete
yeah eʼe ewe
yeah amuj
yep eʼe eke
yep eʼe ete
yep eʼe ewe
yep amuj
yes eʼe eke
yes e

In [14]:
from collections import defaultdict

def list_duplicates(seq, x):
    """Locate every position in ``seq`` whose element equals ``x``.

    Returns a generator that yields one list of indices when ``x`` occurs
    in ``seq`` and nothing otherwise. The index table is built eagerly,
    at call time.
    """
    positions_by_value = defaultdict(list)
    for position, element in enumerate(seq):
        positions_by_value[element].append(position)

    matching_entries = (entry for entry in positions_by_value.items()
                        if entry[0] == x)
    return (positions for _, positions in matching_entries)

# Disabled scratch experiment kept as a bare string literal; nothing in it
# runs. As the last expression of the cell, its repr is echoed as output.
'''
ind=6
p=list_duplicates(ti,ti[ind])
#for l in p:
 #   print l
j=np.zeros(300)
print source_matrix[ind]
print source_matrix[191]
for l in p:
    for x in l:
        j+=source_matrix[x]
            
    target_matrix[ind]=j[:]/len(l)
print target_matrix[ind]
'''

'\nind=6\np=list_duplicates(ti,ti[ind])\n#for l in p:\n #   print l\nj=np.zeros(300)\nprint source_matrix[ind]\nprint source_matrix[191]\nfor l in p:\n    for x in l:\n        j+=source_matrix[x]\n            \n    target_matrix[ind]=j[:]/len(l)\nprint target_matrix[ind]\n'

In [15]:
import copy
# Replace each paired target word's vector with the mean of the source
# vectors of every source word aligned to it, then store that mean in
# mi_dictionary.

# Loop over every aligned pair. The previous bound, the global `ct1`, is 0
# in every visible assignment, so on a fresh kernel the loop never ran.
# NOTE(review): assumes the intent was "all pairs" - confirm.
n_pairs = len(ti)

# Group pair rows by target id once instead of rescanning `ti` per row.
rows_by_target = {}
for row in range(n_pairs):
    rows_by_target.setdefault(ti[row], []).append(row)

count_no = 0
for r in range(n_pairs):
    rows = rows_by_target[ti[r]]
    # Accumulator sized from the data rather than a hard-coded 300.
    j = np.zeros(source_matrix.shape[1])
    for x in rows:
        j += source_matrix[x]
    target_matrix[r] = j / len(rows)
    count_no += 1
    mi_dictionary.apply_cop(target_matrix[r], ti[r])

print(count_no)

962


In [16]:
# Write the current mi_dictionary embedding matrix (including rows
# overwritten via apply_cop) to disk in the text .vec format.
mi_dictionary.export('/home/apatra/fastText/fastText_multilingual-master/micmaq4.vec')