In [1]:
import numpy as np

class FastVector1:
    """
    Minimal wrapper for fastvector embeddings.
    ```
    Usage:
        $ model = FastVector(vector_file='/path/to/wiki.en.vec')
        $ 'apple' in model
        > TRUE
        $ model['apple'].shape
        > (300,)
    ```
    """

    def __init__(self, vector_file='', transform=None):
        """Read in word vectors in fasttext format"""
        self.word2id = {}

        # Captures word order, for export() and translate methods
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        with open(vector_file, 'r') as f:
	    print ('1') 
            (self.n_words, self.n_dim) = \
            (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                #print (elems[0])
                self.embed[i] = elems[1:self.n_dim+1]
                self.id2word.append(elems[0])
        
        # Used in translate_inverted_softmax()
        self.softmax_denominators = None
        
        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)
    
    def apply_cop(self, matrix,i):
        self.embed[i]=matrix[:]
    
    def export(self, outpath):
        """
        Transforming a large matrix of WordVectors is expensive. 
        This method lets you write the transformed matrix back to a file for future use
        :param The path to the output file to be written 
        """
        fout = open(outpath, "w")

        # Header takes the guesswork out of loading by recording how many lines, vector dims
        fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
        for token in self.id2word:
            vector_components = ["%.6f" % number for number in self[token]]
            vector_as_string = " ".join(vector_components)

            out_line = token + " " + vector_as_string + "\n"
            fout.write(out_line)

        fout.close()
    
    
    @classmethod
    
    def __contains__(self, key):
        return key in self.word2id

    def __getitem__(self, key):
        return self.embed[self.word2id[key]]

In [2]:
class FastVector2:
    """
    Minimal wrapper for fastvector embeddings.
    ```
    Usage:
        $ model = FastVector(vector_file='/path/to/wiki.en.vec')
        $ 'apple' in model
        > TRUE
        $ model['apple'].shape
        > (300,)
    ```
    """

    def __init__(self, vector_file='', transform=None):
        """Read in word vectors in fasttext format"""
        self.word2id = {}

        # Captures word order, for export() and translate methods
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        with open(vector_file, 'r') as f:
	    print ('2') 
            (self.n_words, self.n_dim) = \
            (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                #print (elems[0])
                ini=np.random.rand(300)
                self.embed[i] = ini[:] #elems[1:self.n_dim+1]
                self.id2word.append(elems[0])
        
        # Used in translate_inverted_softmax()
        self.softmax_denominators = None
        
        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)
    
    def apply_cop(self, matrix,i):
        self.embed[i]=matrix[:]
    
    def export(self, outpath):
        """
        Transforming a large matrix of WordVectors is expensive. 
        This method lets you write the transformed matrix back to a file for future use
        :param The path to the output file to be written 
        """
        fout = open(outpath, "w")

        # Header takes the guesswork out of loading by recording how many lines, vector dims
        fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
        for token in self.id2word:
            vector_components = ["%.6f" % number for number in self[token]]
            vector_as_string = " ".join(vector_components)

            out_line = token + " " + vector_as_string + "\n"
            fout.write(out_line)

        fout.close()
    
    
    @classmethod
   
    def __contains__(self, key):
        return key in self.word2id

    def __getitem__(self, key):
        return self.embed[self.word2id[key]]

In [15]:
import numpy as np
from random import randint
ct1=0

In [16]:
def cosine_similarity(vec_a, vec_b):
        """Compute cosine similarity between vec_a and vec_b"""
        return np.dot(vec_a, vec_b) / \
            (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))


In [86]:

def levenshteinDistance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
        #print s1,s2
    distances = range(len(s1) + 1)
    #print distances
    #print enumerate(s2)
    for i2, c2 in enumerate(s2):
        #print i2,c2
        distances_ = [i2+1]
        #print distances_
        for i1, c1 in enumerate(s1):
            #print distances_,i1, c1, c2
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
        #print distances, char
        #if 0 not in distances and char=='@':
            #char=c2

        #if char in ['\'','k','g','j']:
            #dif=1
        #print 'hi', distances[-1], char
        #print char
    return distances[-1]

In [129]:
##edit distance

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of 
    translation pair tuples [(source_word, target_word), ...].
    """
    source_matrix = []
    target_matrix = []
    ti=[]
    count=0
    for (source, target) in bilingual_dictionary:
        #print source,target
        if source in source_dictionary.word2id and target in target_dictionary.word2id:
            #print source, target
            count=count+1
            #print source, target
            ti.append(target_dictionary.word2id[target])
            source_matrix.append(source_dictionary[source])
            target_matrix.append(target_dictionary[target])
        if source in source_dictionary.word2id and target not in target_dictionary.word2id:
            for q in target_dictionary.word2id:
                dis=levenshteinDistance(target,q)
                if dis==1 :
                    count=count+1
                    #print source, target,q
                    ti.append(target_dictionary.word2id[q])
                    source_matrix.append(source_dictionary[source])
                    target_matrix.append(target_dictionary[q])
                    #count+=1
                    #print len(source_matrix),ti
                    #break
                    #print p, q
    ct1=count
    print count
    return source_matrix, target_matrix, ti,count

In [130]:
####random english word vectors

def make_training_matrices2(source_dictionary, target_dictionary, bilingual_dictionary,source_matrix, target_matrix ,ti):
    print len(source_matrix)#, ti
    for target in target_dictionary.word2id:
        if target_dictionary.word2id[target] not in ti:
            
            x=randint(0,len(source_dictionary.word2id))
            #count=count+1
            print x
            source1= source_dictionary.id2word[x]
            #print  target, source1
            ti.append(target_dictionary.word2id[target])
            source_matrix.append(source_dictionary[source1])
            target_matrix.append(target_dictionary[target])
            #print len(source_matrix), ti
            #break
    # return training matrices
    #ct1=count
    #print count
    return np.array(source_matrix), np.array(target_matrix), np.array(ti)

In [89]:
en_dictionary = FastVector1(vector_file='/home/apatra/fastText/fastText_multilingual-master/eng.vec')


reading word vectors from /home/apatra/fastText/fastText_multilingual-master/eng.vec
1


In [173]:
mi_dictionary = FastVector2(vector_file='/home/apatra/fastText/fastText_multilingual-master/model.vec')

reading word vectors from /home/apatra/fastText/fastText_multilingual-master/model.vec
2


In [174]:
en_vector = en_dictionary["one"]
mi_vector = mi_dictionary["newt"]
print(cosine_similarity(en_vector, mi_vector))

0.07795933150257374


In [175]:
mi_words = set(mi_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())

In [176]:
import codecs
bilingual_dictionary=[]
with codecs.open('/home/apatra/fastText/fastText_multilingual-master/eng-mic','r','utf-8') as f:
    for line in f:
        eng, mic=line.split(', ')
        #print eng
        eng=eng.strip('\"')
        #print eng
        mic=mic.strip('\"')
        mic=mic.replace('\n','')
        mic=mic.replace('"','')
        #print eng, mic
        bilingual_dictionary.append((eng,mic))
#print bilingual_dictionary

In [135]:
# form the training matrices
#from copy import deepcopy
ct1=0
source_matrix1, target_matrix1 ,ti1,ct1= make_training_matrices(
    en_dictionary, mi_dictionary, bilingual_dictionary)

  from ipykernel import kernelapp as app


962


In [177]:
print len(source_matrix1),ti1

source_matrix, target_matrix ,ti= make_training_matrices2(
    en_dictionary, mi_dictionary, bilingual_dictionary,source_matrix1[:], target_matrix1[:] ,ti1[:])
print len(source_matrix), len(target_matrix)
# learn and apply the transformation
#print ti
#target_matrix=deepcopy(source_matrix)
#print source_matrix[60][9], target_matrix[60][9]
#transform = learn_transformation(source_matrix, target_matrix)
#print type(transform)
#print transform[299]
#en_dictionary.apply_transform(transform)

962 [175, 515, 33, 966, 17377, 4297, 5192, 15063, 13544, 1474, 4529, 438, 7043, 89, 1627, 1360, 16835, 7099, 16371, 5110, 13190, 13133, 16534, 9254, 578, 4208, 52, 13258, 1363, 13258, 7586, 17285, 483, 2129, 648, 19, 759, 19, 1, 19431, 1700, 16307, 967, 629, 131, 71, 18865, 7653, 164, 1137, 330, 3340, 16511, 3858, 10085, 26, 19013, 2030, 13079, 683, 1350, 5357, 2411, 5113, 8144, 349, 2582, 945, 8144, 349, 2582, 945, 9182, 1350, 1479, 1272, 16998, 3339, 16869, 17133, 16938, 1570, 3854, 2232, 2232, 6323, 1291, 1291, 38, 1499, 3420, 21226, 16231, 9, 14816, 13544, 945, 1185, 316, 17252, 16555, 5543, 1604, 13488, 3340, 12258, 21221, 13779, 752, 752, 6093, 6354, 6843, 17186, 19918, 11959, 11959, 14274, 17407, 18793, 3964, 19445, 17991, 13039, 18683, 18671, 14149, 18683, 18683, 17227, 2260, 3637, 6097, 2721, 2036, 85, 2609, 20739, 21079, 4977, 764, 1364, 1714, 2688, 15027, 6824, 4965, 7835, 3391, 12040, 21410, 17238, 1365, 43, 16932, 16472, 16956, 20368, 20368, 6636, 17158, 17377, 9836, 3033,

813731
1443449
1840440
99532
1346393
975621
1042709
1272647
262623
168979
1383768
910332
1483803
1070372
1301300
511202
704591
1054847
1935875
951377
16625
146120
441227
301369
1172984
1814580
854502
1468852
161816
502300
672997
872734
166436
1911796
1865544
624478
1409598
323331
1197872
796859
33698
1049461
1813478
1842112
222101
332849
1984627
1358564
664965
1872256
964656
816351
1908354
801160
1524864
134765
1090585
690971
1327589
1738395
602637
16474
474189
1186378
11888
1679552
1899084
587018
1187769
1285008
1803205
477162
1438934
163215
425058
1033024
1778415
996202
1174806
1048710
120484
155884
620632
1704918
1026789
1868142
875560
951887
1930345
1970530
268525
90590
169207
798892
79931
1385840
254930
1497146
1809886
856530
1306224
245379
488246
1692046
1576236
1622022
1739871
1511865
933053
1695005
907496
200664
1435484
534363
1855811
403344
410204
363185
247371
1878873
572081
1288932
406903
976975
1616723
882253
503477
1455152
305475
1990280
1971362
807991
1092473
151286
38871

3006
997840
1440087
751432
1898427
1711313
1645958
1900295
1076812
1737359
1094565
1646220
1920159
1329480
562710
1326221
714242
427577
1950659
1007042
665259
1159357
184715
1707572
1766828
764447
438358
786601
1692930
726911
964785
1671713
1068446
951575
1806656
741569
838106
461702
383906
344675
1966660
1158613
350153
1068957
1124322
828640
214508
470631
1946451
763810
166032
395167
538964
1757027
1644531
232297
433125
883263
274485
177726
1966507
1164415
1082438
1140885
1161751
1107914
879976
320747
635757
1602871
1887860
855903
373227
984478
577231
229900
1247794
577042
822938
978204
184583
1672649
249275
21549
89295
1926646
1076877
1836566
1607335
118005
297066
1210428
689335
1552937
620121
264526
850849
921076
1877574
407550
1028459
1587405
1522245
665017
947207
1740988
841013
1919407
930110
628565
1212767
1962164
1241972
75324
1647107
1071862
920719
1192996
1546126
650459
467331
931905
118965
458180
876716
799753
1807093
42581
1008064
1110307
166683
1741721
1489944
1026256
46030

1454107
1440682
1117460
1438793
633385
318371
573046
1674279
404324
1759364
1934704
715065
536106
728445
1337971
438477
1670162
873480
1913402
1849289
431091
1614209
256773
1748596
1991388
596126
569205
607268
820644
1973469
1134415
15393
1226040
999641
1242805
276526
1050094
1674062
577906
56135
540717
1552459
1914155
1880173
488834
1778424
996280
1188452
628018
1517745
373043
641064
1207063
1723981
66317
254990
1287692
1037483
731334
1697294
1435743
767694
1169536
1539327
1549138
1011167
1400187
1946912
723446
476546
89685
1777829
1141573
567704
1473957
1464870
1382487
497412
392461
1642131
1206573
949907
504225
1428319
1548422
121676
902947
1742248
62310
795102
1520160
1196363
1094418
509161
1012507
1114694
8223
1696308
230281
492907
1855818
596453
885111
1026331
1980315
1523648
655269
1808830
1873180
1041844
1286611
851470
1632987
378893
63397
1766451
1800921
509590
1135338
377979
81290
1348053
1419348
210905
932848
1958703
210607
1638961
218885
1837906
977632
1654881
904979
390546

1280867
933401
898687
1968634
1778814
24480
1119292
1114169
1026059
1546042
840051
364096
1473839
208302
1980572
910759
1079453
820281
1037928
424957
1063753
1400590
1527937
284452
253957
1804212
497125
1442238
1840449
217695
719962
242003
1677058
404126
28498
1630249
949729
83273
741970
1574361
1454853
1808180
594534
1833881
1438897
443614
1180907
1449878
716366
1345711
241274
1448276
990063
1909238
183579
1955524
425869
1279706
742120
757932
1973535
283976
1264581
1109626
96990
1708082
979988
1286061
1286835
614594
52552
698327
1579659
256208
1987552
562880
1311626
1949152
591466
658670
1583619
1555365
830214
1729581
1617103
450089
1208308
1845998
974031
1530916
1136980
1532539
553217
622235
70028
819965
778819
487793
799262
1506553
1277966
1761103
1945524
1555123
1589793
585973
369286
1481908
902127
1806135
132282
90276
1841228
1455366
1822208
1414031
1468011
1620542
1568310
1100474
1949717
369046
563497
1380816
119856
1031087
1873145
10263
1755575
1411908
819655
348554
758888
26205

815689
1838549
1112690
1149634
425797
143653
1583540
1045460
126264
334316
174903
1327694
30042
1853788
929693
836222
614977
196052
201799
611154
1348580
739800
863416
553813
1505235
278830
251202
1852371
608980
1915926
345654
484925
1931717
574348
947019
1218174
1525593
1982778
1885172
1485690
157232
34150
1849867
1762152
1675681
1784580
1809973
833195
1475252
1029008
577603
1140589
198491
108754
161103
1084430
1900739
1298844
1582216
150068
1244449
219037
335479
1176498
249408
209007
13386
879883
1371985
1078488
1144354
1255401
137352
1432620
262548
374256
620742
1222312
1427299
259872
1163513
101312
1896011
123053
919188
41132
1813285
942778
79321
741744
1890288
850105
79202
928912
859760
1733142
1229702
1426900
1784588
1365362
1449963
482628
390668
82649
550907
1181632
1256161
1115962
649515
603847
1121917
490910
1386905
185014
1996255
647794
1683956
1435957
327534
1408148
465390
1219158
1018610
899576
1879295
1928291
1840362
1862127
550148
1389664
1840715
873119
321285
110317
1013

1270119
1564861
194168
1215985
718773
118463
936733
178131
1159668
470425
360789
1707033
888199
1965216
1403571
1249749
172684
756754
1240667
586292
945300
271756
1513679
1737563
785678
1946066
145180
1766232
50071
1457998
1690202
1730159
777181
885702
160340
201476
1616266
879667
903352
1695010
952246
1042398
788098
636265
1524142
1215871
281091
1301537
1631060
209972
250659
1230015
1537116
1062130
1818354
1497795
92770
982163
1509096
356356
287699
1383016
1500064
502399
1110054
81320
603294
1404063
1010978
29084
1621833
1925242
1795834
307878
1575543
624583
1445521
287841
1331039
875672
1166563
1048856
1730909
344406
1378595
1385356
145723
991522
1082967
286277
1339636
1851168
1911640
1399674
1174625
368271
160378
6113
1464557
747037
1705515
1057406
1351415
1383485
767992
1002903
507252
1038648
1476577
1772326
333029
1761255
1051499
1147806
702154
292877
569870
621709
1829997
529457
969049
1929486
1491409
480255
1489258
548116
559456
290891
1512016
1231828
427755
1744534
1690214
3767

1441181
1784823
805991
1039087
626141
1681402
724302
1662796
1945861
552345
548844
268492
1112969
66540
442624
827318
1380825
1940554
434823
1532606
1243755
1624140
530367
495572
1257476
163644
788251
308113
594886
1556049
1436811
301982
909555
1877848
1816873
167258
883045
199087
332259
1505656
1315383
845186
56561
94500
70641
678923
1738137
1680938
1990107
1264758
615317
1608470
1527350
1170270
1320346
1513106
331889
67199
1173534
1033078
536722
1876673
1969050
236042
757489
1537749
1195357
1825173
85953
1875578
384765
1064535
1112906
187525
836213
465360
1906093
167705
226448
819754
1669879
1433839
868547
1839547
1468164
1707589
54027
1014570
1775907
579870
986784
1792965
1939398
1093533
889041
1296808
162571
1576628
1469471
1689851
1091
111695
711282
488222
502526
1077569
1485539
1185099
1493030
1170694
363623
365165
861738
1903812
1315329
1704887
1237975
56079
1229055
337545
1803359
324237
1070041
691603
1081367
1891405
259068
1885758
1047363
919615
1519794
998975
226568
1493634
8

562363
939579
1221846
1567895
1977795
858153
6419
1677497
297359
1504669
1127647
960728
750121
670082
1765058
28146
51245
1171840
155864
266193
895074
1313561
1907473
971911
643602
633756
1239560
1449530
453571
1737222
141102
835016
1736660
1373371
672858
98803
382593
359971
1849389
1271592
1151417
821053
1246143
301823
1005704
36358
247548
1990343
247667
500201
1012261
559499
230007
1888946
908784
150672
809602
607637
540717
1332854
1102163
150790
1629645
244986
758615
482845
1419034
1287658
502230
1676786
1608376
1402188
1378807
1657242
1550941
1505343
1260084
1877986
1984751
1803992
1016347
125536
1266315
1017463
1543930
316175
120350
255070
248272
1415412
1723004
1624657
1130497
1819514
1110078
1929347
1688671
423199
339731
1101133
211109
1920488
426978
1699454
1081925
587468
270624
347327
1663443
884935
879875
377689
1557731
403698
464662
384145
1637928
363376
912241
1657269
477301
554140
1529649
86228
1185147
1642511
1172047
1361683
184429
1323017
1782323
1054064
568584
922533
28

299305
752929
1008141
1287657
1250548
1297116
119938
1050838
107766
985901
948272
286337
524590
1984436
1365563
621079
1650652
1848244
512983
401835
1818079
732742
623351
865301
1951953
521901
1614574
1470026
714731
957964
1917474
92057
1620107
50607
952842
1528181
1955096
393430
1431685
1571085
892639
274651
1321068
637378
1738491
265417
1584020
1600049
1611699
200187
681068
1825795
1989820
973841
732050
1514633
1952716
684355
316228
131623
1899472
1307303
1495222
655002
725008
1104722
29678
1107688
542752
1951861
1299757
497955
1463935
1290207
1920164
664421
863713
1117383
7341
1292337
51341
559594
130610
895256
196204
417175
796708
901692
1954235
1694914
736880
191303
181855
1931995
630877
1892206
1652256
1977102
1152974
784687
327014
1201842
1880155
749392
233358
1483230
1590622
911173
1805822
26907
953883
328853
1313684
308747
1678943
404105
799842
1659152
96080
1472502
1893329
1689378
1821403
983481
1096892
1349889
263370
1586676
1389136
56419
324425
1014265
1909356
695006
535226

919967
1092669
280296
1687844
1265514
1563402
1636335
1439122
244251
98845
1104025
1304080
879259
1328527
672054
417879
1660135
1850083
1898513
490095
813653
1105609
1846300
997856
539547
481140
1163780
594112
233027
1920901
528862
1484002
487343
1001945
592542
794923
697429
472676
933944
140299
1284415
742679
530010
1779672
1517481
1070354
679423
1393039
1560053
1975659
79191
942547
116776
599156
1303419
1858479
968888
72453
324134
667757
1487066
746319
915357
760039
957369
1041226
1701953
1580959
642299
1359561
1919606
491568
1733927
518292
1330567
1137932
1052779
1364738
473479
1086023
760320
1658969
1383242
624939
1981521
269887
1177325
1240664
153039
1435428
551356
56778
1238840
1824231
185901
1343880
1439516
772068
1051424
317201
1522094
1232576
1192177
1182750
655749
1176954
1639853
1630357
1622791
943669
890573
1404550
251605
1767912
1710884
842806
190650
250772
1563619
1630016
73565
1814167
1925080
1125645
977990
1400861
232953
1970667
1535938
118043
265986
1039528
477972
7904

120307
1504618
1013602
1882107
1057057
18610
1894581
1002028
1922603
1140997
1441984
1373778
1370270
705315
354799
315783
988402
1280753
452712
377179
1193137
118982
1747209
744929
305369
1022838
1081040
879905
1767018
1135964
871510
952868
1838611
1196188
637534
1424377
209160
96082
119753
941907
1082868
966916
1469240
1673227
539091
1110407
367152
1714124
1023161
1718507
339699
201321
382440
1638371
83190
1135376
154368
458065
126957
1048539
961326
234060
591148
1112069
1594235
951747
620157
352427
552222
1696279
115060
843531
80662
75186
1921014
1393131
698573
1926584
1319187
1439340
986773
1436841
1250971
1208702
1375333
1071511
1392234
1351507
1937098
1679227
1403160
433770
844597
1167412
1447618
1275282
1900507
1143147
860253
49842
200534
1631061
194604
1407717
1380312
605027
1284275
350802
334512
1614410
50240
1074230
944987
175187
1899844
147270
1555736
231617
567778
284864
457922
363285
35386
1749703
1907040
198858
1771568
1755186
1584072
1679655
1389965
988897
1039943
1612603

1618106
1506731
783594
1172333
357084
1167754
1816957
207266
264644
1919444
767037
1097326
996089
498908
1555037
905111
1014984
665998
1608367
953258
403181
1131549
381834
1847559
1719221
1878672
556369
907105
1687710
1648480
163631
831539
677926
1173330
1898457
1805815
164061
1344847
745291
954938
1418082
1052345
168268
910041
786452
873898
652297
1451782
1735567
1938488
1925501
1540376
1942161
1076535
1278755
58684
80318
653654
77381
1962360
616037
1723739
1173804
1980565
1564248
447827
696346
806465
1467022
1069380
1557189
904769
1198350
3393
1580150
156765
1251307
1010504
544340
705136
1902044
371680
1895779
1718540
1370957
1762089
1117229
1455462
1586201
130000
947141
1258794
217267
1102454
1069597
849740
955621
1818525
1126835
538846
947459
26873
1790123
1195260
1701297
1900447
1928879
1555418
456318
721372
176328
1131887
1509528
705821
1310718
1095131
988216
86819
588950
1368181
1842473
296654
529983
1533358
1091479
215447
1883177
191452
1011952
1598387
621660
1291651
462936
193

954916
121550
1429542
1687780
589333
1341781
1303146
1028334
916491
20266
974966
163375
590826
433077
1388459
729214
653769
1451157
873053
1143764
1158538
187549
556796
627902
807469
1928123
1495341
670767
1493457
147907
1029224
362458
882859
172182
153009
1635935
969453
1240578
1369021
1893838
527545
1609331
1378517
1171056
615360
346242
975763
784056
1873089
1498444
1459534
1634677
926607
100788
1125300
547835
863988
1392872
170146
571735
1071563
621618
834751
1157872
1324036
338013
973345
1826133
389349
1814064
269016
1932760
650979
1151770
403391
295665
1841837
72051
434449
547721
1888947
1948316
1695740
1954605
254910
387713
550187
1667464
688595
1226802
1226277
1379052
866806
174961
1247286
1119067
1906578
1922473
1107964
1465335
1854862
1370839
953435
1742360
1585216
347448
534143
1078786
1941434
101474
879749
632540
1800915
1036203
376761
1922235
934322
1657909
76476
726716
960773
881733
75571
715393
1297663
677658
878020
896090
1994820
406709
1690852
145313
1250388
679170
1043

808933
1480139
872946
1221048
153115
455723
678551
1118265
1965866
529505
1099919
1572376
922780
1754834
134000
1832717
824490
1158744
1295672
1396602
1112212
1608226
926257
121194
339840
657771
923487
1087785
726073
1923261
6452
628549
1435892
458732
1104073
1324253
609163
748123
1640348
1787201
1050484
12425
227041
755453
683457
643277
18893
860970
1615489
12152
510285
1951819
119486
1030505
1959054
1856379
993474
377975
266250
1293570
1542848
1724817
1140567
1124057
1019270
365642
1086390
854129
1436029
1603058
42589
1176762
530347
589787
998996
454546
758188
483313
859270
1809785
250055
588581
1579872
116046
1862835
1599313
708146
363648
346507
274256
178893
1107133
1363673
1354825
1635263
1346278
630354
1644106
1582226
1382084
1206389
28263
1597305
389004
1289127
1417991
1126462
432950
473043
1963603
50717
303507
1605788
1655807
231328
307858
1941732
146839
1096942
1951131
1568875
357337
996210
1433034
1278706
182540
1296513
1775377
1181906
632675
215319
925097
17690
1292983
52556

259157
199439
870461
569867
1669600
1631603
1335349
1864560
1826217
1434138
1390635
919697
795604
1275064
168886
1903282
1893587
1335229
1269172
1369478
1690402
151483
1061602
1305124
471895
1823590
71419
1708405
1051662
1534364
376848
1836777
1292613
1910035
1735823
646075
1431268
22824
1186715
961352
1172731
81784
1484135
489475
1204880
326738
934303
158243
1678439
1754914
1986627
1816182
1148436
1259417
1197584
271305
1341531
1949793
1116864
171478
1594114
229333
72362
1095087
512032
377165
1979266
569717
728671
1371691
139750
1010239
1165267
43557
552053
1789453
1232980
1173532
1394115
914315
514554
892327
1635959
1917835
1803642
108203
1966824
1576763
577260
777741
537231
911048
209292
1379592
833386
423004
900849
1176959
859651
395634
590413
1184700
1167517
899049
813333
172025
153196
1652126
296240
654023
1283088
1878955
1815322
554321
1588467
1703789
1299776
1718328
1551311
1298859
1513108
306586
1144015
1682247
1388017
1607669
1694921
934128
699826
543328
286010
849501
1187324

In [178]:
from collections import defaultdict

def list_duplicates(seq, x):
    tally = defaultdict(list)
    for i,item in enumerate(seq):
        tally[item].append(i)
    return (locs for key,locs in tally.items() 
            if key==x)

'''
ind=6
p=list_duplicates(ti,ti[ind])
#for l in p:
 #   print l
j=np.zeros(300)
print source_matrix[ind]
print source_matrix[191]
for l in p:
    for x in l:
        j+=source_matrix[x]
            
    target_matrix[ind]=j[:]/len(l)
print target_matrix[ind]
'''

'\nind=6\np=list_duplicates(ti,ti[ind])\n#for l in p:\n #   print l\nj=np.zeros(300)\nprint source_matrix[ind]\nprint source_matrix[191]\nfor l in p:\n    for x in l:\n        j+=source_matrix[x]\n            \n    target_matrix[ind]=j[:]/len(l)\nprint target_matrix[ind]\n'

In [179]:
import copy
count_no=0
j=np.zeros(300)
for r in range(0,ct1):
    #print source_matrix[r], target_matrix[r]
    #print len(source_matrix[r]),len(target_matrix[r])
    p=list_duplicates(ti,ti[r])
    j=np.zeros(300)
    for l in p:
        for x in l:
            j+=source_matrix[x]
            
        target_matrix[r]=j[:]/len(l)
    count_no+=1
    #target_matrix[r]=source_matrix[r][:]
    mi_dictionary.apply_cop(target_matrix[r],ti[r])
for r in range (ct1, len(ti)):
    target_matrix[r]=source_matrix[r]
    mi_dictionary.apply_cop(target_matrix[r],ti[r])
print count_no

962


In [180]:
mi_dictionary.export('/home/apatra/Desktop/work/lstm/data/micmaq12.vec')

In [181]:
mi_dictionary.export('/home/apatra/fastText/fastText_multilingual-master/micmaq12.vec')