# Word Embedding for Sequence Processing

**The goal of this practical is to use pre-trained word embeddings to address the sequence prediction tasks studied in week 2: PoS tagging and chunking.**

In [None]:
import numpy as np
import gensim.downloader as api
from gensim.models import KeyedVectors


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
drive

<module 'google.colab.drive' from '/usr/local/lib/python3.10/dist-packages/google/colab/drive.py'>

## 0) Loading PoS (or chunking) datasets (small or large)

In [None]:
def load(filename, label_col=2):
    """Read a space-separated, one-token-per-line corpus into a list of documents.

    Documents are separated by blank lines. Each non-blank line is split on
    single spaces; column 0 is the token and column ``label_col`` is its label.

    Args:
        filename: path of the corpus file to read.
        label_col: index of the column used as label. Use 1 for PoS tags and
            2 (the default) for chunking tags.

    Returns:
        A list of documents, each document being a list of (token, label) tuples.

    Raises:
        IndexError: if a non-blank line has fewer than ``label_col + 1`` columns.
    """
    listeDoc = list()
    with open(filename, "r") as f:
        doc = list()
        for ligne in f:
            if len(ligne) < 2:  # blank line: end of the current document
                listeDoc.append(doc)
                doc = list()
                continue
            mots = ligne.replace("\n", "").split(" ")
            doc.append((mots[0], mots[label_col]))
    # Flush the last document when the file does not end with a blank line;
    # without this the final sentence would be silently dropped.
    if doc:
        listeDoc.append(doc)
    return listeDoc

In [None]:
# Corpus switch: True selects the small corpus, False the larger one.
bSmall = False

if bSmall:
    filename = "/content/drive/My Drive/chtrain.txt"
    filenameT = "/content/drive/My Drive/chtest.txt"
else:
    # Larger corpus.
    filename = "/content/drive/My Drive/train.txt"
    filenameT = "/content/drive/My Drive/test.txt"

# Train documents first, then the test ("T") documents.
alldocs, alldocsT = load(filename), load(filenameT)

print(len(alldocs)," docs read")
print(len(alldocsT)," docs (T) read")

8936  docs read
2012  docs (T) read


# 1) Word embedding for classifying each word

### Pre-trained word2vec

In [None]:
import gensim.downloader as api

# bload=True  -> read the embeddings previously saved under sdir.
# bload=False -> fetch them through gensim's downloader, then save them under sdir.
bload = True
fname = "word2vec-google-news-300"
sdir = "/content/drive/My Drive/" # Change

cache_path = sdir+fname+".dat"
if not bload:
    wv_pre_trained = api.load(fname)
    wv_pre_trained.save(cache_path)
else:
    wv_pre_trained = KeyedVectors.load(cache_path)

### Some tokens in the dataset are missing from the pre-trained vocabulary; we will encode each of them with a random vector
This is sub-optimal, but we need to do something.

In [None]:
def randomvec(dim=300):
    """Draw a random vector of unit Euclidean norm.

    The components are sampled from a standard normal distribution using
    NumPy's global random state, then the vector is divided by its L2 norm.

    Args:
        dim: number of components of the vector (defaults to 300).

    Returns:
        A 1-D NumPy array of length ``dim`` whose L2 norm is 1.
    """
    default = np.random.randn(dim)
    return default / np.linalg.norm(default)

In [None]:
np.random.seed(seed=10) # seed the randomness

# Collect every token that has no pre-trained embedding and give it a random
# unit vector. Train documents are scanned before test documents, so for a
# fixed seed each missing token always receives the same vector.
dictadd = dict()
for split_docs in (alldocs, alldocsT):
    for doc in split_docs:
        for (x, _) in doc:
            if x not in wv_pre_trained and x not in dictadd:
                dictadd[x] = randomvec()

# One summary line instead of one line per document/token, which flooded the
# cell output.
print(len(dictadd), " tokens not in WE, added with random vectors")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 ****** Document ****** 7149
 ****** Document ****** 7150
 ****** Document ****** 7151
 ****** Document ****** 7152
 ****** Document ****** 7153
 ****** Document ****** 7154
 ****** Document ****** 7155
 ****** Document ****** 7156
409  not in WE, adding it with random vector
 ****** Document ****** 7157
 ****** Document ****** 7158
 ****** Document ****** 7159
 ****** Document ****** 7160
 ****** Document ****** 7161
 ****** Document ****** 7162
122  not in WE, adding it with random vector
 ****** Document ****** 7163
 ****** Document ****** 7164
 ****** Document ****** 7165
 ****** Document ****** 7166
 ****** Document ****** 7167
 ****** Document ****** 7168
school-lunch  not in WE, adding it with random vector
 ****** Document ****** 7169
emergency-relief  not in WE, adding it with random vector
 ****** Document ****** 7170
 ****** Document ****** 7171
 ****** Document ****** 7172
follow-on  not in WE, adding it with 

### Add the (key-value) 'random' word embeddings for missing inputs

In [None]:
## YOUR CODE HERE
# Register the random vectors of the missing tokens in the embedding table.
# Skip the call when nothing is missing (e.g. this cell is re-run after the
# tokens were already added): stacking an empty weight array onto the existing
# matrix is expected to fail -- NOTE(review): confirm against the installed gensim.
if dictadd:
    wv_pre_trained.add_vectors(list(dictadd.keys()), list(dictadd.values()))

### Store the train and test datasets: a word embedding for each token in the sequences

In [None]:
# One embedding per token, flattened over all the documents of each split
# (document order, then token order inside a document).
wvectors = [
    wv_pre_trained[token]
    for document in alldocs
    for (token, _) in document
]
wvectorsT = [
    wv_pre_trained[token]
    for document in alldocsT
    for (token, _) in document
]

### Check the size of your train/test datasets

In [None]:
## YOUR CODE HERE
# Number of token embeddings stored for each split.
n_train_tokens = len(wvectors)
n_test_tokens = len(wvectorsT)
print("La taille de datasets de train est de  : ",n_train_tokens)
print("La taille de datasets de test est de  : ",n_test_tokens)

La taille de datasets de train est de  :  211727
La taille de datasets de test est de  :  47377


### Collecting train/test labels

In [None]:
# Labels train/test

# Per-document tag lists for the training split.
buf2 = [[pos for m,pos in d ] for d in alldocs]

# Label vocabulary: the sorted unique tags seen in training, mapped to 0..nCles-1.
cles = np.unique(np.array([tag for doc_tags in buf2 for tag in doc_tags]))
cles2ind = {cle: ind for ind, cle in enumerate(cles)}
nCles = len(cles)
print(nCles," keys in the dictionary")

labels = np.array([cles2ind[tag] for doc_tags in buf2 for tag in doc_tags])

# Tags that only occur in the test split are inserted into cles2ind on the fly;
# they all share the single extra index len(cles).
unseen_index = len(cles)
test_indices = []
for d in alldocsT:
    for (m, pos) in d:
        test_indices.append(cles2ind.setdefault(pos, unseen_index))
labelsT = np.array(test_indices)

print(len(cles2ind)," keys in the dictionary")

22  keys in the dictionary
23  keys in the dictionary


In [None]:
# Shapes of the train and test label arrays, in that order.
for split_labels in (labels, labelsT):
    print(split_labels.shape)

(211727,)
(47377,)


In [None]:
# Display the encoded test labels (integer class indices).
labelsT

array([ 5, 15, 15, ...,  5, 15, 21])

### Train a Logistic Regression Model!
**And compare its performance to the baseline and to the sequence models (HMM/CRF) of practical 2a**

In [None]:
## YOUR CODE HERE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Scikit Logistic Regression
# With the default iteration budget the solver stops before converging on this
# data (scikit-learn emits a ConvergenceWarning), so the budget is raised.
# NOTE(review): increase max_iter further if the warning still appears.
lr_clf = LogisticRegression(max_iter=1000)

lr_clf.fit(wvectors, labels)
pred_lrt = lr_clf.predict(wvectors)   # predictions on the training tokens
pred_lr = lr_clf.predict(wvectorsT)   # predictions on the test tokens
print(f"Logistic Regression accuracy train={accuracy_score(labels, pred_lrt)}, accuracy test={accuracy_score(labelsT, pred_lr)}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression accuracy train=0.7754986373962697, accuracy test=0.7718724275492328


# 2) Using word embedding with CRF

## We will define the following feature functions for the CRF

In [None]:
def features_wv(sentence, index):
    """Describe a token by the components of its word embedding.

    Args:
        sentence: sequence of tokens.
        index: position in ``sentence`` of the token to describe.

    Returns:
        A dict mapping 'f0', 'f1', ... to the successive components of the
        vector returned by ``wv_pre_trained.get_vector`` for that token.
    """
    v = wv_pre_trained.get_vector(sentence[index])
    # Iterate over the vector itself rather than a hard-coded range(300), so
    # the function keeps working with embeddings of another dimension.
    return {'f' + str(i): value for i, value in enumerate(v)}

def features_structural(sentence, index):
    """Describe a token by hand-crafted lexical and positional features.

    Args:
        sentence: sequence of tokens (strings).
        index: position in ``sentence`` of the token to describe.

    Returns:
        A dict with the token itself, its position flags (first/last), casing
        flags, prefixes and suffixes of length 1 to 3, the previous and next
        tokens ('' at the sentence boundaries), and hyphen / digit /
        inner-capital flags.
    """
    word = sentence[index]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        # Slices instead of word[0] / word[-1] so an empty token does not raise.
        'is_capitalized': word[:1].upper() == word[:1],
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # This entry was commented out by text pasted in front of it; restored.
        'capitals_inside': word[1:].lower() != word[1:],
    }
def features_wv_plus_structural(sentence, index):
    """Describe a token by its embedding components plus its structural features.

    Args:
        sentence: sequence of tokens.
        index: position in ``sentence`` of the token to describe.

    Returns:
        A single dict holding the entries of ``features_wv`` followed by the
        entries of ``features_structural`` for the same token.
    """
    combined = dict(features_wv(sentence, index))
    combined.update(features_structural(sentence, index))
    return combined

## [Question]: explain what the 3 feature functions encode and what their differences are

**features_wv(sentence, index) :**

Cette fonction extrait des caractéristiques basées sur des vecteurs de mots pré-entraînés.
Elle récupère le vecteur de mots correspondant à un mot donné dans une phrase à partir des embeddings pré-entraînés.
Ensuite, elle crée un dictionnaire de caractéristiques où chaque dimension du vecteur de mots est associée à une caractéristique.

**features_structural(sentence, index) :**

Cette fonction extrait des caractéristiques structurelles ou lexicales des mots dans une phrase.
Les caractéristiques incluent des informations telles que la position du mot dans la phrase, s'il est en majuscules, s'il contient des caractères spéciaux, etc.

**features_wv_plus_structural(sentence, index) :**

Cette fonction combine les caractéristiques extraites à partir des embeddings de mots pré-entraînés et des caractéristiques structurelles.
Elle fusionne les informations sémantiques des embeddings de mots avec les informations structurelles et lexicales des mots dans une seule représentation pour chaque mot.


En résumé, ces fonctions permettent de capturer à la fois les informations sémantiques et structurelles des mots.

### You can now train a CRF with each of the 3 feature functions and analyse the results

In [None]:
!pip install nltk



In [None]:
!pip install python-crfsuite

Collecting python-crfsuite
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.10


In [None]:
from nltk.tag.crf import CRFTagger
# CRF whose token features are the word-embedding components only.
tagger = CRFTagger(feature_func=features_wv)
## Train the model
tagger.train(alldocs,'model.crf.tagger_features_wv')
## Evaluate performances
# accuracy() replaces evaluate(), which NLTK reports as deprecated.
tagger.accuracy(alldocsT)


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  tagger.evaluate(alldocsT)


0.8816092196635498

In [None]:
# CRF whose token features are the hand-crafted structural features only.
tagger = CRFTagger(feature_func=features_structural)
## Train the model
tagger.train(alldocs,'model.crf.features_structural')
## Evaluate performances
# accuracy() replaces evaluate(), which NLTK reports as deprecated.
tagger.accuracy(alldocsT)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  tagger.evaluate((alldocsT))


0.9384722544694684

In [None]:
# CRF combining the word-embedding components with the structural features.
tagger = CRFTagger(feature_func=features_wv_plus_structural)
## Train the model
tagger.train(alldocs,'model.crf.features_wv_plus_structural')
## Evaluate performances
# accuracy() replaces evaluate(), which NLTK reports as deprecated.
tagger.accuracy(alldocsT)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  tagger.evaluate((alldocsT))


0.9451632648753615