# Word Embedding for Sequence Processing

**The goal of this practical is to use pre-trained word embedding for adressing the sequence prediction tasks studied in week 2: PoS and chunking.**

In [2]:
import numpy as np
import gensim.downloader as api
from gensim.models import KeyedVectors

## 0) Loading PoS (or chunking) datasets (small or large)

In [3]:
def load(filename):
    listeDoc = list()
    with open(filename, "r") as f:
        doc = list()
        for ligne in f:
            #print "l : ",len(ligne)," ",ligne
            if len(ligne) < 2: # fin de doc
                listeDoc.append(doc)
                doc = list()
                continue
            mots = ligne.replace("\n","").split(" ")
            doc.append((mots[0],mots[2])) # mettre mots[2] à la place de mots[1] pour le chuncking
    return listeDoc

In [4]:
bSmall = False

if(bSmall==True):
    filename = "datasets/conll2000/chtrain.txt" 
    filenameT = "datasets/conll2000/chtest.txt" 

else:
    # Larger corpus .
    filename = "datasets/conll2000/train.txt" 
    filenameT = "datasets/conll2000/test.txt" 

alldocs = load(filename)
alldocsT = load(filenameT)

print(len(alldocs)," docs read")
print(len(alldocsT)," docs (T) read")

8936  docs read
2012  docs (T) read


# 1) Word embedding for classifying each word

### Pre-trained word2vec

In [5]:
import gensim.downloader as api
bload = True
fname = "word2vec-google-news-300"
sdir = "" 

if(bload==True):
    wv_pre_trained = KeyedVectors.load(sdir+fname+".dat")
else:  
    wv_pre_trained = api.load(fname)
    wv_pre_trained.save(sdir+fname+".dat")

### Some token on the dataset are missing, we will encode them with a random vector
This is sub-optimal, but we need to do something

In [6]:
def randomvec():
    
    default = np.random.randn(300)
    default = default  / np.linalg.norm(default)
    return default

In [7]:
np.random.seed(seed=10) # seed the randomness

dictadd = dict()
cpt=0
for d in alldocs:
    cpt+=1
    print(" ****** Document ******",cpt)
    for (x,pos) in d:
        if (not (x in wv_pre_trained) and not (x in dictadd)):
            print(x," not in WE, adding it with random vector")
            dictadd[x] = randomvec()
            
for d in alldocsT:
    cpt+=1
    print(" ****** TEST Document ******",cpt)
    for (x,pos) in d:
        if (not (x in wv_pre_trained) and not (x in dictadd)):
            print(x," not in WE, adding it with random vector")
            dictadd[x] = randomvec()
            #wv_pre_trained.add_vector(x,randomvec())
            

 ****** Document ****** 1
to  not in WE, adding it with random vector
,  not in WE, adding it with random vector
a  not in WE, adding it with random vector
and  not in WE, adding it with random vector
's  not in WE, adding it with random vector
near-record  not in WE, adding it with random vector
.  not in WE, adding it with random vector
 ****** Document ****** 2
of  not in WE, adding it with random vector
 ****** Document ****** 3
 ****** Document ****** 4
16  not in WE, adding it with random vector
15  not in WE, adding it with random vector
 ****** Document ****** 5
``  not in WE, adding it with random vector
''  not in WE, adding it with random vector
 ****** Document ****** 6
 ****** Document ****** 7
-LRB-  not in WE, adding it with random vector
3.2  not in WE, adding it with random vector
-RRB-  not in WE, adding it with random vector
 ****** Document ****** 8
2.2  not in WE, adding it with random vector
2.3  not in WE, adding it with random vector
1988  not in WE, adding it w

266.66  not in WE, adding it with random vector
 ****** Document ****** 146
101.98  not in WE, adding it with random vector
35588.36  not in WE, adding it with random vector
 ****** Document ****** 147
862  not in WE, adding it with random vector
 ****** Document ****** 148
572  not in WE, adding it with random vector
368  not in WE, adding it with random vector
181  not in WE, adding it with random vector
 ****** Document ****** 149
small-lot  not in WE, adding it with random vector
 ****** Document ****** 150
35611.38  not in WE, adding it with random vector
profit-taking  not in WE, adding it with random vector
 ****** Document ****** 151
Dai-ichi  not in WE, adding it with random vector
 ****** Document ****** 152
22.78  not in WE, adding it with random vector
14.06  not in WE, adding it with random vector
0.53  not in WE, adding it with random vector
2679.72  not in WE, adding it with random vector
 ****** Document ****** 153
15.72  not in WE, adding it with random vector
11.88  n

 ****** Document ****** 276
previous-month  not in WE, adding it with random vector
 ****** Document ****** 277
 ****** Document ****** 278
 ****** Document ****** 279
one-day  not in WE, adding it with random vector
190-point  not in WE, adding it with random vector
 ****** Document ****** 280
 ****** Document ****** 281
 ****** Document ****** 282
fourth-quarter  not in WE, adding it with random vector
 ****** Document ****** 283
3.3  not in WE, adding it with random vector
 ****** Document ****** 284
 ****** Document ****** 285
 ****** Document ****** 286
 ****** Document ****** 287
Fleet\/Norstar  not in WE, adding it with random vector
3.7  not in WE, adding it with random vector
 ****** Document ****** 288
2.1  not in WE, adding it with random vector
 ****** Document ****** 289
stock-market  not in WE, adding it with random vector
 ****** Document ****** 290
0.5  not in WE, adding it with random vector
0.3  not in WE, adding it with random vector
 ****** Document ****** 291
0.4  

 ****** Document ****** 408
 ****** Document ****** 409
 ****** Document ****** 410
 ****** Document ****** 411
 ****** Document ****** 412
 ****** Document ****** 413
cotton-ginning  not in WE, adding it with random vector
 ****** Document ****** 414
 ****** Document ****** 415
1987  not in WE, adding it with random vector
20-year  not in WE, adding it with random vector
 ****** Document ****** 416
 ****** Document ****** 417
 ****** Document ****** 418
 ****** Document ****** 419
 ****** Document ****** 420
1.03  not in WE, adding it with random vector
 ****** Document ****** 421
 ****** Document ****** 422
multibillion-dollar  not in WE, adding it with random vector
agrarian-reform  not in WE, adding it with random vector
 ****** Document ****** 423
 ****** Document ****** 424
 ****** Document ****** 425
 ****** Document ****** 426
 ****** Document ****** 427
 ****** Document ****** 428
 ****** Document ****** 429
government-held  not in WE, adding it with random vector
1986  not in

 ****** Document ****** 594
 ****** Document ****** 595
 ****** Document ****** 596
oil-producing  not in WE, adding it with random vector
476.5  not in WE, adding it with random vector
 ****** Document ****** 597
 ****** Document ****** 598
 ****** Document ****** 599
 ****** Document ****** 600
 ****** Document ****** 601
164,830,000  not in WE, adding it with random vector
 ****** Document ****** 602
1230.80  not in WE, adding it with random vector
32.71  not in WE, adding it with random vector
215.48  not in WE, adding it with random vector
0.06  not in WE, adding it with random vector
 ****** Document ****** 603
 ****** Document ****** 604
3392.49  not in WE, adding it with random vector
 ****** Document ****** 605
 ****** Document ****** 606
129.62  not in WE, adding it with random vector
0.51  not in WE, adding it with random vector
131.34  not in WE, adding it with random vector
0.88  not in WE, adding it with random vector
 ****** Document ****** 607
 ****** Document ****** 60

 ****** Document ****** 708
 ****** Document ****** 709
 ****** Document ****** 710
 ****** Document ****** 711
 ****** Document ****** 712
 ****** Document ****** 713
 ****** Document ****** 714
 ****** Document ****** 715
 ****** Document ****** 716
 ****** Document ****** 717
 ****** Document ****** 718
two-hour  not in WE, adding it with random vector
 ****** Document ****** 719
 ****** Document ****** 720
 ****** Document ****** 721
 ****** Document ****** 722
 ****** Document ****** 723
 ****** Document ****** 724
 ****** Document ****** 725
 ****** Document ****** 726
 ****** Document ****** 727
anti-Soviet  not in WE, adding it with random vector
1953  not in WE, adding it with random vector
 ****** Document ****** 728
 ****** Document ****** 729
 ****** Document ****** 730
 ****** Document ****** 731
 ****** Document ****** 732
out-of-touch  not in WE, adding it with random vector
 ****** Document ****** 733
 ****** Document ****** 734
30-year-old  not in WE, adding it with ra

 ****** Document ****** 967
 ****** Document ****** 968
Vizas  not in WE, adding it with random vector
 ****** Document ****** 969
well-known  not in WE, adding it with random vector
 ****** Document ****** 970
 ****** Document ****** 971
 ****** Document ****** 972
 ****** Document ****** 973
 ****** Document ****** 974
 ****** Document ****** 975
 ****** Document ****** 976
 ****** Document ****** 977
 ****** Document ****** 978
 ****** Document ****** 979
 ****** Document ****** 980
 ****** Document ****** 981
 ****** Document ****** 982
 ****** Document ****** 983
 ****** Document ****** 984
 ****** Document ****** 985
hot-dipped  not in WE, adding it with random vector
 ****** Document ****** 986
600,000  not in WE, adding it with random vector
 ****** Document ****** 987
 ****** Document ****** 988
corrosion-resistant  not in WE, adding it with random vector
 ****** Document ****** 989
 ****** Document ****** 990
1992  not in WE, adding it with random vector
 ****** Document ****

Mercedes-Benzes  not in WE, adding it with random vector
 ****** Document ****** 1272
Neiman-Marcus  not in WE, adding it with random vector
 ****** Document ****** 1273
marble-encased  not in WE, adding it with random vector
 ****** Document ****** 1274
 ****** Document ****** 1275
 ****** Document ****** 1276
 ****** Document ****** 1277
Law-enforcement  not in WE, adding it with random vector
 ****** Document ****** 1278
 ****** Document ****** 1279
 ****** Document ****** 1280
 ****** Document ****** 1281
70,000  not in WE, adding it with random vector
 ****** Document ****** 1282
 ****** Document ****** 1283
low-lifes  not in WE, adding it with random vector
 ****** Document ****** 1284
 ****** Document ****** 1285
 ****** Document ****** 1286
 ****** Document ****** 1287
 ****** Document ****** 1288
 ****** Document ****** 1289
 ****** Document ****** 1290
 ****** Document ****** 1291
 ****** Document ****** 1292
 ****** Document ****** 1293
 ****** Document ****** 1294
547,000  

do-gooder  not in WE, adding it with random vector
 ****** Document ****** 1548
 ****** Document ****** 1549
 ****** Document ****** 1550
 ****** Document ****** 1551
 ****** Document ****** 1552
 ****** Document ****** 1553
 ****** Document ****** 1554
Stuart-James  not in WE, adding it with random vector
 ****** Document ****** 1555
Lyneses  not in WE, adding it with random vector
 ****** Document ****** 1556
 ****** Document ****** 1557
 ****** Document ****** 1558
 ****** Document ****** 1559
penny-brokerage  not in WE, adding it with random vector
 ****** Document ****** 1560
major-frauds  not in WE, adding it with random vector
 ****** Document ****** 1561
 ****** Document ****** 1562
 ****** Document ****** 1563
 ****** Document ****** 1564
flim-flam  not in WE, adding it with random vector
 ****** Document ****** 1565
Elvekrog  not in WE, adding it with random vector
Seger-Elvekrog  not in WE, adding it with random vector
investment-counseling  not in WE, adding it with random 

 ****** Document ****** 1814
 ****** Document ****** 1815
business-class  not in WE, adding it with random vector
first-class  not in WE, adding it with random vector
 ****** Document ****** 1816
 ****** Document ****** 1817
 ****** Document ****** 1818
 ****** Document ****** 1819
 ****** Document ****** 1820
 ****** Document ****** 1821
 ****** Document ****** 1822
 ****** Document ****** 1823
 ****** Document ****** 1824
 ****** Document ****** 1825
 ****** Document ****** 1826
 ****** Document ****** 1827
 ****** Document ****** 1828
one-way  not in WE, adding it with random vector
109  not in WE, adding it with random vector
 ****** Document ****** 1829
code-named  not in WE, adding it with random vector
 ****** Document ****** 1830
advance-purchase  not in WE, adding it with random vector
 ****** Document ****** 1831
hurricane-stricken  not in WE, adding it with random vector
money-back  not in WE, adding it with random vector
 ****** Document ****** 1832
 ****** Document ****** 

 ****** Document ****** 2083
40.1  not in WE, adding it with random vector
 ****** Document ****** 2084
 ****** Document ****** 2085
FDA-approved  not in WE, adding it with random vector
 ****** Document ****** 2086
 ****** Document ****** 2087
 ****** Document ****** 2088
 ****** Document ****** 2089
USACafes  not in WE, adding it with random vector
 ****** Document ****** 2090
half-owned  not in WE, adding it with random vector
 ****** Document ****** 2091
600  not in WE, adding it with random vector
 ****** Document ****** 2092
class-action  not in WE, adding it with random vector
 ****** Document ****** 2093
 ****** Document ****** 2094
early-retirement  not in WE, adding it with random vector
 ****** Document ****** 2095
AT&T  not in WE, adding it with random vector
 ****** Document ****** 2096
34,000  not in WE, adding it with random vector
 ****** Document ****** 2097
 ****** Document ****** 2098
 ****** Document ****** 2099
587  not in WE, adding it with random vector
 ****** D

75,000  not in WE, adding it with random vector
 ****** Document ****** 2368
 ****** Document ****** 2369
 ****** Document ****** 2370
month-to-month  not in WE, adding it with random vector
 ****** Document ****** 2371
 ****** Document ****** 2372
1963  not in WE, adding it with random vector
180.9  not in WE, adding it with random vector
 ****** Document ****** 2373
 ****** Document ****** 2374
331.8  not in WE, adding it with random vector
273.9  not in WE, adding it with random vector
 ****** Document ****** 2375
 ****** Document ****** 2376
 ****** Document ****** 2377
5.23  not in WE, adding it with random vector
 ****** Document ****** 2378
110  not in WE, adding it with random vector
507  not in WE, adding it with random vector
 ****** Document ****** 2379
 ****** Document ****** 2380
 ****** Document ****** 2381
240.8  not in WE, adding it with random vector
 ****** Document ****** 2382
 ****** Document ****** 2383
 ****** Document ****** 2384
 ****** Document ****** 2385
47.6

 ****** Document ****** 2601
 ****** Document ****** 2602
 ****** Document ****** 2603
 ****** Document ****** 2604
 ****** Document ****** 2605
 ****** Document ****** 2606
 ****** Document ****** 2607
time-strapped  not in WE, adding it with random vector
 ****** Document ****** 2608
 ****** Document ****** 2609
 ****** Document ****** 2610
hot-air  not in WE, adding it with random vector
 ****** Document ****** 2611
 ****** Document ****** 2612
21.7  not in WE, adding it with random vector
 ****** Document ****** 2613
 ****** Document ****** 2614
 ****** Document ****** 2615
 ****** Document ****** 2616
warm-weather  not in WE, adding it with random vector
 ****** Document ****** 2617
 ****** Document ****** 2618
 ****** Document ****** 2619
 ****** Document ****** 2620
 ****** Document ****** 2621
 ****** Document ****** 2622
 ****** Document ****** 2623
 ****** Document ****** 2624
 ****** Document ****** 2625
 ****** Document ****** 2626
 ****** Document ****** 2627
60,000-odd  n

52-week  not in WE, adding it with random vector
 ****** Document ****** 2902
7.35  not in WE, adding it with random vector
 ****** Document ****** 2903
7.22  not in WE, adding it with random vector
 ****** Document ****** 2904
 ****** Document ****** 2905
 ****** Document ****** 2906
 ****** Document ****** 2907
360-day  not in WE, adding it with random vector
coupon-equivalent  not in WE, adding it with random vector
365-day  not in WE, adding it with random vector
 ****** Document ****** 2908
 ****** Document ****** 2909
 ****** Document ****** 2910
one-point  not in WE, adding it with random vector
 ****** Document ****** 2911
2001  not in WE, adding it with random vector
 ****** Document ****** 2912
Investment-grade  not in WE, adding it with random vector
 ****** Document ****** 2913
 ****** Document ****** 2914
 ****** Document ****** 2915
140  not in WE, adding it with random vector
bid-wanted  not in WE, adding it with random vector
 ****** Document ****** 2916
 ****** Documen

 ****** Document ****** 3165
government-plus  not in WE, adding it with random vector
 ****** Document ****** 3166
 ****** Document ****** 3167
 ****** Document ****** 3168
 ****** Document ****** 3169
 ****** Document ****** 3170
 ****** Document ****** 3171
 ****** Document ****** 3172
 ****** Document ****** 3173
 ****** Document ****** 3174
 ****** Document ****** 3175
 ****** Document ****** 3176
no-loads  not in WE, adding it with random vector
 ****** Document ****** 3177
 ****** Document ****** 3178
exit-load  not in WE, adding it with random vector
 ****** Document ****** 3179
 ****** Document ****** 3180
 ****** Document ****** 3181
 ****** Document ****** 3182
 ****** Document ****** 3183
 ****** Document ****** 3184
 ****** Document ****** 3185
time-honored  not in WE, adding it with random vector
 ****** Document ****** 3186
 ****** Document ****** 3187
 ****** Document ****** 3188
 ****** Document ****** 3189
 ****** Document ****** 3190
less-developed  not in WE, adding 

 ****** Document ****** 3488
 ****** Document ****** 3489
 ****** Document ****** 3490
 ****** Document ****** 3491
 ****** Document ****** 3492
 ****** Document ****** 3493
 ****** Document ****** 3494
 ****** Document ****** 3495
 ****** Document ****** 3496
 ****** Document ****** 3497
computer-maintenance  not in WE, adding it with random vector
 ****** Document ****** 3498
 ****** Document ****** 3499
 ****** Document ****** 3500
 ****** Document ****** 3501
Micronyx  not in WE, adding it with random vector
computer-security  not in WE, adding it with random vector
 ****** Document ****** 3502
 ****** Document ****** 3503
 ****** Document ****** 3504
 ****** Document ****** 3505
 ****** Document ****** 3506
Ciba-Geigy  not in WE, adding it with random vector
 ****** Document ****** 3507
 ****** Document ****** 3508
 ****** Document ****** 3509
C$  not in WE, adding it with random vector
866  not in WE, adding it with random vector
942  not in WE, adding it with random vector
 ****

 ****** Document ****** 3768
 ****** Document ****** 3769
 ****** Document ****** 3770
 ****** Document ****** 3771
 ****** Document ****** 3772
 ****** Document ****** 3773
 ****** Document ****** 3774
 ****** Document ****** 3775
BetaWest  not in WE, adding it with random vector
 ****** Document ****** 3776
4.3  not in WE, adding it with random vector
consumer-telephone  not in WE, adding it with random vector
 ****** Document ****** 3777
business-telephone  not in WE, adding it with random vector
618.9  not in WE, adding it with random vector
599.4  not in WE, adding it with random vector
 ****** Document ****** 3778
 ****** Document ****** 3779
12.1  not in WE, adding it with random vector
 ****** Document ****** 3780
 ****** Document ****** 3781
 ****** Document ****** 3782
 ****** Document ****** 3783
 ****** Document ****** 3784
11.2  not in WE, adding it with random vector
664.3  not in WE, adding it with random vector
747.7  not in WE, adding it with random vector
 ****** Docu

lower-income  not in WE, adding it with random vector
 ****** Document ****** 4021
 ****** Document ****** 4022
 ****** Document ****** 4023
 ****** Document ****** 4024
 ****** Document ****** 4025
 ****** Document ****** 4026
165  not in WE, adding it with random vector
 ****** Document ****** 4027
 ****** Document ****** 4028
 ****** Document ****** 4029
 ****** Document ****** 4030
108  not in WE, adding it with random vector
1.87  not in WE, adding it with random vector
 ****** Document ****** 4031
2.58  not in WE, adding it with random vector
2.74  not in WE, adding it with random vector
 ****** Document ****** 4032
 ****** Document ****** 4033
950  not in WE, adding it with random vector
616  not in WE, adding it with random vector
 ****** Document ****** 4034
 ****** Document ****** 4035
 ****** Document ****** 4036
 ****** Document ****** 4037
390  not in WE, adding it with random vector
3.85  not in WE, adding it with random vector
453  not in WE, adding it with random vector

13,865,000  not in WE, adding it with random vector
 ****** Document ****** 4267
6.90  not in WE, adding it with random vector
 ****** Document ****** 4268
triple-A  not in WE, adding it with random vector
 ****** Document ****** 4269
 ****** Document ****** 4270
 ****** Document ****** 4271
 ****** Document ****** 4272
 ****** Document ****** 4273
8.70  not in WE, adding it with random vector
10.37  not in WE, adding it with random vector
 ****** Document ****** 4274
32.6  not in WE, adding it with random vector
46.5  not in WE, adding it with random vector
 ****** Document ****** 4275
1995  not in WE, adding it with random vector
 ****** Document ****** 4276
 ****** Document ****** 4277
9.13  not in WE, adding it with random vector
 ****** Document ****** 4278
101.90  not in WE, adding it with random vector
16.59  not in WE, adding it with random vector
 ****** Document ****** 4279
 ****** Document ****** 4280
 ****** Document ****** 4281
0.75  not in WE, adding it with random vector

 ****** Document ****** 4570
 ****** Document ****** 4571
 ****** Document ****** 4572
 ****** Document ****** 4573
 ****** Document ****** 4574
 ****** Document ****** 4575
 ****** Document ****** 4576
 ****** Document ****** 4577
 ****** Document ****** 4578
federal-local  not in WE, adding it with random vector
 ****** Document ****** 4579
 ****** Document ****** 4580
 ****** Document ****** 4581
disaster-prone  not in WE, adding it with random vector
 ****** Document ****** 4582
 ****** Document ****** 4583
 ****** Document ****** 4584
 ****** Document ****** 4585
in-house  not in WE, adding it with random vector
 ****** Document ****** 4586
 ****** Document ****** 4587
 ****** Document ****** 4588
 ****** Document ****** 4589
Self-sufficiency  not in WE, adding it with random vector
Masaki-Schatz  not in WE, adding it with random vector
 ****** Document ****** 4590
 ****** Document ****** 4591
three-page  not in WE, adding it with random vector
 ****** Document ****** 4592
 ******

 ****** Document ****** 4873
 ****** Document ****** 4874
 ****** Document ****** 4875
 ****** Document ****** 4876
 ****** Document ****** 4877
 ****** Document ****** 4878
 ****** Document ****** 4879
35,000  not in WE, adding it with random vector
 ****** Document ****** 4880
 ****** Document ****** 4881
 ****** Document ****** 4882
 ****** Document ****** 4883
 ****** Document ****** 4884
highway-relief  not in WE, adding it with random vector
 ****** Document ****** 4885
 ****** Document ****** 4886
 ****** Document ****** 4887
 ****** Document ****** 4888
 ****** Document ****** 4889
800-462-9029  not in WE, adding it with random vector
 ****** Document ****** 4890
 ****** Document ****** 4891
 ****** Document ****** 4892
 ****** Document ****** 4893
 ****** Document ****** 4894
 ****** Document ****** 4895
 ****** Document ****** 4896
 ****** Document ****** 4897
 ****** Document ****** 4898
 ****** Document ****** 4899
 ****** Document ****** 4900
 ****** Document ****** 4901
d

 ****** Document ****** 5149
1,400  not in WE, adding it with random vector
optical-disk  not in WE, adding it with random vector
 ****** Document ****** 5150
laser-read  not in WE, adding it with random vector
 ****** Document ****** 5151
 ****** Document ****** 5152
 ****** Document ****** 5153
videodisks  not in WE, adding it with random vector
videodisk  not in WE, adding it with random vector
 ****** Document ****** 5154
 ****** Document ****** 5155
 ****** Document ****** 5156
 ****** Document ****** 5157
 ****** Document ****** 5158
 ****** Document ****** 5159
 ****** Document ****** 5160
 ****** Document ****** 5161
111.48  not in WE, adding it with random vector
0.76  not in WE, adding it with random vector
35374.22  not in WE, adding it with random vector
 ****** Document ****** 5162
841  not in WE, adding it with random vector
 ****** Document ****** 5163
645-293  not in WE, adding it with random vector
186  not in WE, adding it with random vector
 ****** Document ****** 51

 ****** Document ****** 5419
 ****** Document ****** 5420
 ****** Document ****** 5421
899.6  not in WE, adding it with random vector
 ****** Document ****** 5422
100.2  not in WE, adding it with random vector
1.90  not in WE, adding it with random vector
 ****** Document ****** 5423
 ****** Document ****** 5424
 ****** Document ****** 5425
145,954  not in WE, adding it with random vector
 ****** Document ****** 5426
37.50  not in WE, adding it with random vector
 ****** Document ****** 5427
 ****** Document ****** 5428
4.92  not in WE, adding it with random vector
 ****** Document ****** 5429
 ****** Document ****** 5430
 ****** Document ****** 5431
25.50  not in WE, adding it with random vector
 ****** Document ****** 5432
 ****** Document ****** 5433
PATOIS  not in WE, adding it with random vector
 ****** Document ****** 5434
1973  not in WE, adding it with random vector
 ****** Document ****** 5435
 ****** Document ****** 5436
livestock-dealing  not in WE, adding it with random vec

614.6  not in WE, adding it with random vector
 ****** Document ****** 5638
 ****** Document ****** 5639
9.2  not in WE, adding it with random vector
 ****** Document ****** 5640
9.37  not in WE, adding it with random vector
 ****** Document ****** 5641
 ****** Document ****** 5642
 ****** Document ****** 5643
 ****** Document ****** 5644
 ****** Document ****** 5645
 ****** Document ****** 5646
 ****** Document ****** 5647
 ****** Document ****** 5648
5.25  not in WE, adding it with random vector
 ****** Document ****** 5649
71%-owned  not in WE, adding it with random vector
5.1  not in WE, adding it with random vector
 ****** Document ****** 5650
Dutch\/Shell  not in WE, adding it with random vector
 ****** Document ****** 5651
 ****** Document ****** 5652
 ****** Document ****** 5653
steam-generating  not in WE, adding it with random vector
Verfahrenstechnik  not in WE, adding it with random vector
Dutch-based  not in WE, adding it with random vector
N.V  not in WE, adding it with r

 ****** Document ****** 5944
XR4Ti  not in WE, adding it with random vector
 ****** Document ****** 5945
double-wing  not in WE, adding it with random vector
 ****** Document ****** 5946
 ****** Document ****** 5947
Merkurs  not in WE, adding it with random vector
15,261  not in WE, adding it with random vector
 ****** Document ****** 5948
 ****** Document ****** 5949
 ****** Document ****** 5950
 ****** Document ****** 5951
 ****** Document ****** 5952
Lincoln-Mercury-Merkur  not in WE, adding it with random vector
 ****** Document ****** 5953
 ****** Document ****** 5954
 ****** Document ****** 5955
 ****** Document ****** 5956
 ****** Document ****** 5957
242  not in WE, adding it with random vector
4,600  not in WE, adding it with random vector
 ****** Document ****** 5958
81.9  not in WE, adding it with random vector
 ****** Document ****** 5959
 ****** Document ****** 5960
Minneapolis-based  not in WE, adding it with random vector
 ****** Document ****** 5961
 ****** Document ***

338  not in WE, adding it with random vector
3.41  not in WE, adding it with random vector
 ****** Document ****** 6214
7.73  not in WE, adding it with random vector
6.94  not in WE, adding it with random vector
 ****** Document ****** 6215
 ****** Document ****** 6216
 ****** Document ****** 6217
23-5  not in WE, adding it with random vector
 ****** Document ****** 6218
 ****** Document ****** 6219
 ****** Document ****** 6220
 ****** Document ****** 6221
 ****** Document ****** 6222
stepped-up  not in WE, adding it with random vector
 ****** Document ****** 6223
 ****** Document ****** 6224
 ****** Document ****** 6225
 ****** Document ****** 6226
 ****** Document ****** 6227
 ****** Document ****** 6228
 ****** Document ****** 6229
 ****** Document ****** 6230
 ****** Document ****** 6231
 ****** Document ****** 6232
laissez-faire  not in WE, adding it with random vector
 ****** Document ****** 6233
deregulaton  not in WE, adding it with random vector
 ****** Document ****** 6234
 *

 ****** Document ****** 6502
 ****** Document ****** 6503
 ****** Document ****** 6504
 ****** Document ****** 6505
 ****** Document ****** 6506
high-profile  not in WE, adding it with random vector
 ****** Document ****** 6507
 ****** Document ****** 6508
 ****** Document ****** 6509
 ****** Document ****** 6510
 ****** Document ****** 6511
 ****** Document ****** 6512
 ****** Document ****** 6513
230-a-share  not in WE, adding it with random vector
 ****** Document ****** 6514
 ****** Document ****** 6515
 ****** Document ****** 6516
 ****** Document ****** 6517
20%-plus  not in WE, adding it with random vector
 ****** Document ****** 6518
 ****** Document ****** 6519
 ****** Document ****** 6520
 ****** Document ****** 6521
 ****** Document ****** 6522
 ****** Document ****** 6523
 ****** Document ****** 6524
 ****** Document ****** 6525
 ****** Document ****** 6526
 ****** Document ****** 6527
 ****** Document ****** 6528
 ****** Document ****** 6529
 ****** Document ****** 6530
 *

 ****** Document ****** 6819
 ****** Document ****** 6820
 ****** Document ****** 6821
18th-century  not in WE, adding it with random vector
 ****** Document ****** 6822
 ****** Document ****** 6823
9.29  not in WE, adding it with random vector
 ****** Document ****** 6824
 ****** Document ****** 6825
 ****** Document ****** 6826
2-for-1  not in WE, adding it with random vector
 ****** Document ****** 6827
 ****** Document ****** 6828
 ****** Document ****** 6829
 ****** Document ****** 6830
 ****** Document ****** 6831
36.6  not in WE, adding it with random vector
 ****** Document ****** 6832
 ****** Document ****** 6833
1.09  not in WE, adding it with random vector
122.4  not in WE, adding it with random vector
 ****** Document ****** 6834
 ****** Document ****** 6835
32.125  not in WE, adding it with random vector
 ****** Document ****** 6836
coming-out  not in WE, adding it with random vector
 ****** Document ****** 6837
 ****** Document ****** 6838
long-planned  not in WE, adding 

 ****** Document ****** 7097
 ****** Document ****** 7098
 ****** Document ****** 7099
 ****** Document ****** 7100
 ****** Document ****** 7101
 ****** Document ****** 7102
6,000  not in WE, adding it with random vector
 ****** Document ****** 7103
 ****** Document ****** 7104
 ****** Document ****** 7105
 ****** Document ****** 7106
 ****** Document ****** 7107
 ****** Document ****** 7108
 ****** Document ****** 7109
110-story  not in WE, adding it with random vector
 ****** Document ****** 7110
anti-Japanese  not in WE, adding it with random vector
 ****** Document ****** 7111
 ****** Document ****** 7112
 ****** Document ****** 7113
 ****** Document ****** 7114
 ****** Document ****** 7115
 ****** Document ****** 7116
 ****** Document ****** 7117
Datatronic  not in WE, adding it with random vector
 ****** Document ****** 7118
 ****** Document ****** 7119
hand-held  not in WE, adding it with random vector
 ****** Document ****** 7120
 ****** Document ****** 7121
 ****** Document **

 ****** Document ****** 7384
 ****** Document ****** 7385
 ****** Document ****** 7386
R2-D2  not in WE, adding it with random vector
 ****** Document ****** 7387
3,390  not in WE, adding it with random vector
 ****** Document ****** 7388
1890s  not in WE, adding it with random vector
 ****** Document ****** 7389
 ****** Document ****** 7390
 ****** Document ****** 7391
 ****** Document ****** 7392
 ****** Document ****** 7393
 ****** Document ****** 7394
 ****** Document ****** 7395
 ****** Document ****** 7396
 ****** Document ****** 7397
 ****** Document ****** 7398
 ****** Document ****** 7399
 ****** Document ****** 7400
 ****** Document ****** 7401
 ****** Document ****** 7402
 ****** Document ****** 7403
 ****** Document ****** 7404
 ****** Document ****** 7405
egg-processing  not in WE, adding it with random vector
 ****** Document ****** 7406
 ****** Document ****** 7407
 ****** Document ****** 7408
 ****** Document ****** 7409
 ****** Document ****** 7410
 ****** Document ***

 ****** Document ****** 7698
 ****** Document ****** 7699
Reagan-Bush  not in WE, adding it with random vector
 ****** Document ****** 7700
 ****** Document ****** 7701
 ****** Document ****** 7702
 ****** Document ****** 7703
 ****** Document ****** 7704
 ****** Document ****** 7705
 ****** Document ****** 7706
non-financial  not in WE, adding it with random vector
 ****** Document ****** 7707
1965  not in WE, adding it with random vector
 ****** Document ****** 7708
 ****** Document ****** 7709
 ****** Document ****** 7710
 ****** Document ****** 7711
 ****** Document ****** 7712
 ****** Document ****** 7713
 ****** Document ****** 7714
 ****** Document ****** 7715
 ****** Document ****** 7716
 ****** Document ****** 7717
 ****** Document ****** 7718
68.9  not in WE, adding it with random vector
 ****** Document ****** 7719
 ****** Document ****** 7720
mid-1980s  not in WE, adding it with random vector
 ****** Document ****** 7721
 ****** Document ****** 7722
 ****** Document ****** 

 ****** Document ****** 8036
405  not in WE, adding it with random vector
 ****** Document ****** 8037
 ****** Document ****** 8038
executive-model  not in WE, adding it with random vector
 ****** Document ****** 8039
 ****** Document ****** 8040
39,400  not in WE, adding it with random vector
16,000  not in WE, adding it with random vector
highest-priced  not in WE, adding it with random vector
hand-crafted  not in WE, adding it with random vector
 ****** Document ****** 8041
 ****** Document ****** 8042
 ****** Document ****** 8043
 ****** Document ****** 8044
auto-industry  not in WE, adding it with random vector
 ****** Document ****** 8045
 ****** Document ****** 8046
 ****** Document ****** 8047
 ****** Document ****** 8048
 ****** Document ****** 8049
takeover-stock  not in WE, adding it with random vector
 ****** Document ****** 8050
full-fledged  not in WE, adding it with random vector
 ****** Document ****** 8051
 ****** Document ****** 8052
 ****** Document ****** 8053
 ****

 ****** Document ****** 8306
 ****** Document ****** 8307
Prizms  not in WE, adding it with random vector
 ****** Document ****** 8308
 ****** Document ****** 8309
 ****** Document ****** 8310
 ****** Document ****** 8311
 ****** Document ****** 8312
 ****** Document ****** 8313
 ****** Document ****** 8314
 ****** Document ****** 8315
disaster-contingency  not in WE, adding it with random vector
 ****** Document ****** 8316
 ****** Document ****** 8317
 ****** Document ****** 8318
dial-tone  not in WE, adding it with random vector
 ****** Document ****** 8319
 ****** Document ****** 8320
 ****** Document ****** 8321
on-ramps  not in WE, adding it with random vector
 ****** Document ****** 8322
 ****** Document ****** 8323
 ****** Document ****** 8324
 ****** Document ****** 8325
double-deck  not in WE, adding it with random vector
 ****** Document ****** 8326
DRI\/McGraw  not in WE, adding it with random vector
 ****** Document ****** 8327
 ****** Document ****** 8328
 ****** Document

 ****** Document ****** 8629
 ****** Document ****** 8630
 ****** Document ****** 8631
 ****** Document ****** 8632
 ****** Document ****** 8633
8.53  not in WE, adding it with random vector
8.48  not in WE, adding it with random vector
8.40  not in WE, adding it with random vector
 ****** Document ****** 8634
8.42  not in WE, adding it with random vector
8.28  not in WE, adding it with random vector
8.15  not in WE, adding it with random vector
 ****** Document ****** 8635
 ****** Document ****** 8636
 ****** Document ****** 8637
 ****** Document ****** 8638
 ****** Document ****** 8639
 ****** Document ****** 8640
 ****** Document ****** 8641
 ****** Document ****** 8642
 ****** Document ****** 8643
 ****** Document ****** 8644
9.88  not in WE, adding it with random vector
 ****** Document ****** 8645
 ****** Document ****** 8646
 ****** Document ****** 8647
9.83  not in WE, adding it with random vector
 ****** Document ****** 8648
 ****** Document ****** 8649
 ****** Document ******

3.34  not in WE, adding it with random vector
228  not in WE, adding it with random vector
 ****** Document ****** 8920
2.59  not in WE, adding it with random vector
 ****** Document ****** 8921
46.1  not in WE, adding it with random vector
53.1  not in WE, adding it with random vector
 ****** Document ****** 8922
251.2  not in WE, adding it with random vector
278.7  not in WE, adding it with random vector
 ****** Document ****** 8923
Atlanta-based  not in WE, adding it with random vector
 ****** Document ****** 8924
 ****** Document ****** 8925
 ****** Document ****** 8926
 ****** Document ****** 8927
46.125  not in WE, adding it with random vector
 ****** Document ****** 8928
temblor-prone  not in WE, adding it with random vector
earthquake-trained  not in WE, adding it with random vector
 ****** Document ****** 8929
 ****** Document ****** 8930
loss-recovery  not in WE, adding it with random vector
 ****** Document ****** 8931
 ****** Document ****** 8932
 ****** Document ****** 893

 ****** TEST Document ****** 9225
 ****** TEST Document ****** 9226
 ****** TEST Document ****** 9227
 ****** TEST Document ****** 9228
 ****** TEST Document ****** 9229
 ****** TEST Document ****** 9230
 ****** TEST Document ****** 9231
 ****** TEST Document ****** 9232
 ****** TEST Document ****** 9233
 ****** TEST Document ****** 9234
 ****** TEST Document ****** 9235
 ****** TEST Document ****** 9236
anti-Noriega  not in WE, adding it with random vector
 ****** TEST Document ****** 9237
 ****** TEST Document ****** 9238
 ****** TEST Document ****** 9239
 ****** TEST Document ****** 9240
 ****** TEST Document ****** 9241
 ****** TEST Document ****** 9242
 ****** TEST Document ****** 9243
 ****** TEST Document ****** 9244
knock-out  not in WE, adding it with random vector
 ****** TEST Document ****** 9245
well-intentioned  not in WE, adding it with random vector
 ****** TEST Document ****** 9246
 ****** TEST Document ****** 9247
 ****** TEST Document ****** 9248
 ****** TEST Document

 ****** TEST Document ****** 9401
 ****** TEST Document ****** 9402
 ****** TEST Document ****** 9403
 ****** TEST Document ****** 9404
 ****** TEST Document ****** 9405
 ****** TEST Document ****** 9406
 ****** TEST Document ****** 9407
 ****** TEST Document ****** 9408
39.9  not in WE, adding it with random vector
 ****** TEST Document ****** 9409
 ****** TEST Document ****** 9410
 ****** TEST Document ****** 9411
 ****** TEST Document ****** 9412
 ****** TEST Document ****** 9413
 ****** TEST Document ****** 9414
 ****** TEST Document ****** 9415
 ****** TEST Document ****** 9416
 ****** TEST Document ****** 9417
soft-drinks  not in WE, adding it with random vector
 ****** TEST Document ****** 9418
 ****** TEST Document ****** 9419
 ****** TEST Document ****** 9420
324.9  not in WE, adding it with random vector
 ****** TEST Document ****** 9421
 ****** TEST Document ****** 9422
93.8  not in WE, adding it with random vector
 ****** TEST Document ****** 9423
2.97  not in WE, adding it

 ****** TEST Document ****** 9591
1,050  not in WE, adding it with random vector
 ****** TEST Document ****** 9592
SIMPLIFYING  not in WE, adding it with random vector
 ****** TEST Document ****** 9593
 ****** TEST Document ****** 9594
 ****** TEST Document ****** 9595
 ****** TEST Document ****** 9596
stripped-down  not in WE, adding it with random vector
 ****** TEST Document ****** 9597
 ****** TEST Document ****** 9598
 ****** TEST Document ****** 9599
 ****** TEST Document ****** 9600
 ****** TEST Document ****** 9601
RAVAGES  not in WE, adding it with random vector
 ****** TEST Document ****** 9602
hurricane-wracked  not in WE, adding it with random vector
 ****** TEST Document ****** 9603
 ****** TEST Document ****** 9604
 ****** TEST Document ****** 9605
 ****** TEST Document ****** 9606
 ****** TEST Document ****** 9607
 ****** TEST Document ****** 9608
 ****** TEST Document ****** 9609
late-payment  not in WE, adding it with random vector
 ****** TEST Document ****** 9610
89-

880,500  not in WE, adding it with random vector
86,500  not in WE, adding it with random vector
2.3125  not in WE, adding it with random vector
2.4375  not in WE, adding it with random vector
 ****** TEST Document ****** 9881
 ****** TEST Document ****** 9882
 ****** TEST Document ****** 9883
 ****** TEST Document ****** 9884
 ****** TEST Document ****** 9885
 ****** TEST Document ****** 9886
 ****** TEST Document ****** 9887
 ****** TEST Document ****** 9888
 ****** TEST Document ****** 9889
 ****** TEST Document ****** 9890
pre-noon  not in WE, adding it with random vector
 ****** TEST Document ****** 9891
 ****** TEST Document ****** 9892
 ****** TEST Document ****** 9893
 ****** TEST Document ****** 9894
 ****** TEST Document ****** 9895
 ****** TEST Document ****** 9896
 ****** TEST Document ****** 9897
 ****** TEST Document ****** 9898
 ****** TEST Document ****** 9899
mortgage-backed  not in WE, adding it with random vector
 ****** TEST Document ****** 9900
bargain-hunting  not

anti-aircraft  not in WE, adding it with random vector
 ****** TEST Document ****** 10170
 ****** TEST Document ****** 10171
 ****** TEST Document ****** 10172
 ****** TEST Document ****** 10173
 ****** TEST Document ****** 10174
 ****** TEST Document ****** 10175
 ****** TEST Document ****** 10176
co-author  not in WE, adding it with random vector
Afghanistan\/Southwest  not in WE, adding it with random vector
 ****** TEST Document ****** 10177
space-based  not in WE, adding it with random vector
 ****** TEST Document ****** 10178
national-security  not in WE, adding it with random vector
 ****** TEST Document ****** 10179
Reaganauts  not in WE, adding it with random vector
 ****** TEST Document ****** 10180
 ****** TEST Document ****** 10181
 ****** TEST Document ****** 10182
 ****** TEST Document ****** 10183
 ****** TEST Document ****** 10184
 ****** TEST Document ****** 10185
national-priority  not in WE, adding it with random vector
 ****** TEST Document ****** 10186
 ****** TEST

 ****** TEST Document ****** 10346
 ****** TEST Document ****** 10347
30-minute  not in WE, adding it with random vector
 ****** TEST Document ****** 10348
futures-trading  not in WE, adding it with random vector
 ****** TEST Document ****** 10349
 ****** TEST Document ****** 10350
 ****** TEST Document ****** 10351
 ****** TEST Document ****** 10352
 ****** TEST Document ****** 10353
 ****** TEST Document ****** 10354
 ****** TEST Document ****** 10355
 ****** TEST Document ****** 10356
 ****** TEST Document ****** 10357
 ****** TEST Document ****** 10358
value-oriented  not in WE, adding it with random vector
 ****** TEST Document ****** 10359
 ****** TEST Document ****** 10360
 ****** TEST Document ****** 10361
 ****** TEST Document ****** 10362
 ****** TEST Document ****** 10363
50-point  not in WE, adding it with random vector
 ****** TEST Document ****** 10364
 ****** TEST Document ****** 10365
 ****** TEST Document ****** 10366
 ****** TEST Document ****** 10367
 ****** TEST Doc

 ****** TEST Document ****** 10515
B.F.  not in WE, adding it with random vector
 ****** TEST Document ****** 10516
 ****** TEST Document ****** 10517
 ****** TEST Document ****** 10518
 ****** TEST Document ****** 10519
 ****** TEST Document ****** 10520
 ****** TEST Document ****** 10521
Intertan  not in WE, adding it with random vector
 ****** TEST Document ****** 10522
 ****** TEST Document ****** 10523
 ****** TEST Document ****** 10524
375.16  not in WE, adding it with random vector
 ****** TEST Document ****** 10525
16,800,000  not in WE, adding it with random vector
 ****** TEST Document ****** 10526
885,800  not in WE, adding it with random vector
 ****** TEST Document ****** 10527
 ****** TEST Document ****** 10528
501,200  not in WE, adding it with random vector
 ****** TEST Document ****** 10529
454,100  not in WE, adding it with random vector
 ****** TEST Document ****** 10530
331,400  not in WE, adding it with random vector
 ****** TEST Document ****** 10531
 ****** TEST 

64.1  not in WE, adding it with random vector
 ****** TEST Document ****** 10782
specialty-chemicals  not in WE, adding it with random vector
24.3  not in WE, adding it with random vector
 ****** TEST Document ****** 10783
 ****** TEST Document ****** 10784
Ohio-based  not in WE, adding it with random vector
49.125  not in WE, adding it with random vector
 ****** TEST Document ****** 10785
 ****** TEST Document ****** 10786
 ****** TEST Document ****** 10787
 ****** TEST Document ****** 10788
 ****** TEST Document ****** 10789
 ****** TEST Document ****** 10790
 ****** TEST Document ****** 10791
 ****** TEST Document ****** 10792
 ****** TEST Document ****** 10793
 ****** TEST Document ****** 10794
 ****** TEST Document ****** 10795
 ****** TEST Document ****** 10796
 ****** TEST Document ****** 10797
 ****** TEST Document ****** 10798
 ****** TEST Document ****** 10799
 ****** TEST Document ****** 10800
 ****** TEST Document ****** 10801
 ****** TEST Document ****** 10802
 ****** TEST

### Add the (key-value) 'random' word embeddings for missing inputs

In [8]:
print(len(dictadd))

4235


In [9]:
"""
for mot,vecteur in dictadd.items() : 
    wv_pre_trained.add_vector(mot,vecteur)
"""
#Prends beaucoup de temps

'\nfor mot,vecteur in dictadd.items() : \n    wv_pre_trained.add_vector(mot,vecteur)\n'

In [10]:
emb_size = 300
new_vectors = np.zeros((len(dictadd), emb_size))
for i, (word, vec) in enumerate(dictadd.items()):
    new_vectors[i] = vec
wv_pre_trained.add_vectors(list(dictadd.keys()), new_vectors)


### Store the train and test datasets: a word embedding for each token in the sequences

In [11]:
maxlen = max(max([len(doc) for doc in alldocs]),max([len(doc) for doc in alldocsT]))
wvectors = np.zeros((len(alldocs),maxlen,300))
wvectorsT = np.zeros((len(alldocsT),maxlen,300))
for i,doc in enumerate(alldocs) : 
    for j,(word,pos) in enumerate(doc) :
        wvectors[i,j,:] = wv_pre_trained[word]
        

In [12]:
for i,doc in enumerate(alldocsT) : 
    for j,(word,pos) in enumerate(doc) :
        wvectorsT[i,j,:] = wv_pre_trained[word]

### Check the size of your train/test datasets

In [13]:
print(" size de trai",wvectors.shape)
print(" size de test",wvectorsT.shape)

 size de trai (8936, 78, 300)
 size de test (2012, 78, 300)


### Collecting train/test labels

In [14]:
# Labels train/test

buf2 = [[pos for m,pos in d ] for d in alldocs]
cles = []
[cles.extend(b) for b in buf2]
cles = np.unique(np.array(cles))
cles2ind = dict(zip(cles,range(len(cles))))
nCles = len(cles)
print(nCles," keys in the dictionary")

labels  = np.array([cles2ind[pos] for d in alldocs for (m,pos) in d ])
#np.array([cles2ind[pos] for (m,pos) in d for d in alldocs])
labelsT  = np.array([cles2ind.setdefault(pos,len(cles)) for d in alldocsT for (m,pos) in d ])

print(len(cles2ind)," keys in the dictionary")

22  keys in the dictionary
23  keys in the dictionary


### Train a Logistic Regression Model! 
**An compare performances to the baseline and sequence models (HMM/CRF) or practical 2a**

In [15]:
X = np.array([wvectors[d,m,:] for d,v in enumerate(alldocs) for m,(x,pos) in enumerate(v) ])
X_test = np.array([wvectorsT[d,m,:] for d,v in enumerate(alldocsT) for m,(x,pos) in enumerate(v) ])

In [16]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X,labels)
print("le score de la bonne classification est :",clf.score(X_test,labelsT))
print(labels[:5])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


le score de la bonne classification est : 0.7718724275492328
[ 5  6  5 15 10]


# 2) Using word embedding with CRF

## We will define the following features functions for CRF

In [17]:
def features_wv(sentence, index):
    v = wv_pre_trained.get_vector(sentence[index])
    d = {'f'+str(i):v[i] for i in range(300)}
    return d

def features_structural(sentence, index):
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
     ## We will define the following features functions for CRF## We will define the following features functions for CRF   'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
def features_wv_plus_structural(sentence, index):
    v = wv_pre_trained.get_vector(sentence[index]) 
    d = {'f'+str(i):v[i] for i in range(300)}

    return {**d, **features_full(sentence, index)}

## [Question]: explain what the 3 feature functions encode and what their differences are

### Reponses
***La premiere fonction prends en argument une phrase et un index d'un certain mot dans la phrase, et retourne le vecteur word2vec sous forme de chaine de caractere, elle capture donc les information sémantique du mots***

***La deuxieme fonction prends en argument une phrase et un index d'un certain mot dans la phrase, et retourne les information pertinante de ce mots, tel que le prefix, suffix,mot prec, mot suiv extr, elle capture donc les informations syntaxique du mots *** 

***Combine la premiere et la deuxieme pour capturer à la fois les informations syntaxiques et sémantique du mot***

### You can now train a CRF with the 3 features and analyse the results

In [18]:
train = []
for doc in alldocs : 
    train.extend(doc)
print(len(X))

211727


In [19]:
train_crf = list()
for i,x in enumerate(X) : 
    train_crf.append((x,labels[i]))

In [20]:
x_train = []
for doc in alldocs : 
    x_train.append([(word,str(cles2ind[pos])) for word,pos in doc ] )

In [21]:
print(x_train[:2])

[[('Confidence', '5'), ('in', '6'), ('the', '5'), ('pound', '15'), ('is', '10'), ('widely', '20'), ('expected', '20'), ('to', '20'), ('take', '20'), ('another', '5'), ('sharp', '15'), ('dive', '15'), ('if', '8'), ('trade', '5'), ('figures', '15'), ('for', '6'), ('September', '5'), (',', '21'), ('due', '0'), ('for', '6'), ('release', '5'), ('tomorrow', '5'), (',', '21'), ('fail', '10'), ('to', '20'), ('show', '20'), ('a', '5'), ('substantial', '15'), ('improvement', '15'), ('from', '6'), ('July', '5'), ('and', '15'), ('August', '15'), ("'s", '5'), ('near-record', '15'), ('deficits', '15'), ('.', '21')], [('Chancellor', '21'), ('of', '6'), ('the', '5'), ('Exchequer', '15'), ('Nigel', '5'), ('Lawson', '15'), ("'s", '5'), ('restated', '15'), ('commitment', '15'), ('to', '6'), ('a', '5'), ('firm', '15'), ('monetary', '15'), ('policy', '15'), ('has', '10'), ('helped', '20'), ('to', '20'), ('prevent', '20'), ('a', '5'), ('freefall', '15'), ('in', '6'), ('sterling', '5'), ('over', '6'), ('the'

In [22]:
from nltk.tag.crf import CRFTagger

# Entraîner le tagger CRF sur les données d'entraînement
ct = CRFTagger(feature_func=features_structural)
ct.train(x_train, 'model.crf.tagger')

# Charger le modèle CRF entraîné
## Train the model                  
## Evaluate performances

In [24]:
x_test = []
for doc in alldocsT: 
    x_test.append([(word,str(cles2ind[pos])) for word,pos in doc ] )

In [28]:
ct.evaluate(x_test)

0.9385566836228549