# Understand embeddings with Word2Vec

## Imports

In [1]:
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models import Word2Vec
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences




# The data

In [2]:
def load_data(percentage_of_sentences=None):
    """Load the IMDB reviews dataset and split every review into words.

    Args:
        percentage_of_sentences: optional number in (0, 100]. When given, only
            that leading share of the train split and of the test split is kept.
            ``None`` keeps everything.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` where ``X_*`` are lists of
        word lists (one per review, produced by ``text_to_word_sequence``) and
        ``y_*`` are the matching target arrays returned by ``tfds.as_numpy``.
    """

    def _to_words(raw_review):
        # Reviews come back as bytes; decode before tokenizing.
        return text_to_word_sequence(raw_review.decode("utf-8"))

    # batch_size=-1 loads each split as one full batch; as_supervised=True
    # yields (review, target) pairs.
    train_data, test_data = tfds.load(
        name="imdb_reviews",
        split=["train", "test"],
        batch_size=-1,
        as_supervised=True,
    )
    train_sentences, y_train = tfds.as_numpy(train_data)
    test_sentences, y_test = tfds.as_numpy(test_data)

    # Optionally truncate both splits to the requested share of their rows.
    if percentage_of_sentences is not None:
        assert 0 < percentage_of_sentences <= 100

        fraction = percentage_of_sentences / 100
        n_train = int(fraction * len(train_sentences))
        n_test = int(fraction * len(test_sentences))
        train_sentences, y_train = train_sentences[:n_train], y_train[:n_train]
        test_sentences, y_test = test_sentences[:n_test], y_test[:n_test]

    X_train = [_to_words(review) for review in train_sentences]
    X_test = [_to_words(review) for review in test_sentences]

    return X_train, y_train, X_test, y_test

# Keep only the first 10% of each split.
X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)







# Embedding with Word2Vec

In [3]:
# Train Word2Vec on the tokenized training reviews, leaving every hyperparameter
# at its default, then keep a handle on the learned word vectors.
word2vec = Word2Vec(sentences=X_train)
wv = word2vec.wv

In [4]:
# Embedding vector learned for the word 'knight'.
wv['knight']

array([ 9.79817845e-03,  3.58402133e-02, -7.73288496e-03,  2.04245318e-02,
       -1.66255366e-02, -3.17515284e-02,  4.34703156e-02,  8.39115679e-02,
       -5.70392311e-02, -5.10751158e-02, -4.76590649e-04, -3.33375931e-02,
        8.73292610e-03,  1.97258834e-02,  2.16153786e-02, -6.65407628e-03,
        2.12050416e-02,  5.67192212e-03, -1.69537142e-02, -6.85474500e-02,
        7.71883829e-03,  6.96427515e-03,  4.75088432e-02, -2.54619736e-02,
       -2.72276849e-02,  1.68444440e-02, -1.14220278e-02, -2.50825174e-02,
       -2.17112694e-02, -3.74236370e-05,  5.84678585e-03, -7.67631643e-03,
       -1.04865460e-02, -5.13904803e-02, -1.37228128e-02,  8.40837543e-04,
        1.69932358e-02, -3.02275363e-02, -2.62192171e-02, -4.19451594e-02,
        7.91553315e-03, -2.41315495e-02, -2.49342602e-02,  6.91198092e-03,
        2.00839750e-02,  7.35416543e-03, -2.40414385e-02, -7.28767365e-03,
        2.19867527e-02,  4.13249731e-02, -2.40681004e-02, -2.81545706e-02,
       -3.18891965e-02, -

In [5]:
# Number of components in one word vector, i.e. the embedding dimension.
len(wv['oscar'])

100

In [6]:
# Words whose vectors are the most similar to the one of 'fantasy',
# returned as (word, similarity score) pairs.
wv.most_similar('fantasy')

[('factor', 0.9824349284172058),
 ('slasher', 0.9806401133537292),
 ('adventure', 0.9792971611022949),
 ('suspense', 0.9752942323684692),
 ('giallo', 0.9736719727516174),
 ('italian', 0.9736379384994507),
 ('meaning', 0.9734928607940674),
 ('romantic', 0.9727501273155212),
 ('worthy', 0.9715440273284912),
 ('comedic', 0.9706475734710693)]

In [7]:
# Query by vector rather than by word; since the query is the vector of
# 'documentary', that word is itself expected as the top hit.
wv.similar_by_vector(wv['documentary'])

[('documentary', 1.0),
 ('addition', 0.9715887308120728),
 ('adaptation', 0.9654055237770081),
 ('garbage', 0.9634276628494263),
 ('cartoon', 0.9610533714294434),
 ('travesty', 0.9601757526397705),
 ('masterpiece', 0.9569209814071655),
 ('discussion', 0.9536325335502625),
 ('giallo', 0.9530635476112366),
 ('effort', 0.9525585174560547)]

# Arithmetic on words

Operation $W2V(good) - W2V(bad)$

In [8]:
# Element-wise difference between the vectors of 'good' and 'bad'.
wv["good"] - wv["bad"]

array([-0.32826334, -0.18007712,  0.19266295,  0.41789162,  0.20413888,
       -0.13762212, -0.30661184, -0.03371975, -0.08835679, -0.15330702,
        0.10232968, -0.03346622,  0.02072523, -0.08572066,  0.05916902,
        0.29351553, -0.48769492,  0.09134352, -0.17553502,  0.10114145,
       -0.08146876,  0.05736831,  0.00120389, -0.04110202,  0.00691032,
       -0.25146663, -0.16012457,  0.7843769 , -0.10849836, -0.592752  ,
       -0.26980054, -0.04741967,  0.01398838,  0.31159246,  0.16412437,
       -0.6458645 ,  0.2671558 , -0.0597041 , -0.3652428 ,  0.7549727 ,
       -0.22649336, -0.09002572, -0.45573398, -0.10550368,  0.32737455,
       -0.18595165,  0.45688128,  0.02859676, -0.14077449,  0.4594715 ,
       -0.1680378 , -0.02751219, -0.37125528,  0.02410811,  0.30595976,
        0.0523642 , -0.24461532,  0.22945629,  0.13867629,  0.61166173,
       -0.01224725,  0.41941947, -0.27756816, -0.4436978 ,  0.09350646,
        0.08268797, -0.32354078,  0.05530882,  0.45948005,  0.53

Operation $W2V(good) - W2V(bad) + W2V(stupid)$

In [9]:
# Shift the vector of 'stupid' along the bad -> good direction.
res = wv["good"] - wv["bad"] + wv["stupid"]

In [10]:
# Vocabulary words closest to the vector held in `res`.
wv.similar_by_vector(res)

[('nice', 0.8146198391914368),
 ('tough', 0.7606254816055298),
 ('good', 0.7597443461418152),
 ('nonetheless', 0.758880078792572),
 ('always', 0.7550683617591858),
 ('misfortune', 0.7497318387031555),
 ('considered', 0.7483559846878052),
 ('potential', 0.7470344305038452),
 ('also', 0.7461305260658264),
 ('given', 0.7454132437705994)]

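Operation $W2V(queen) - W2V(king) + W2V(actor)$: if $W2V(queen) - W2V(king) = W2V(actress) - W2V(actor)$ holds, the result should be close to $W2V(actress)$.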

In [11]:
# If queen - king ~= actress - actor, then queen - king + actor should land near 'actress'.
# NOTE(review): `res` is rebound here, so cells reading `res` depend on which
# assignment cell ran last.
res = wv['queen'] - wv['king'] + wv['actor']

In [12]:
# Vocabulary words closest to the vector held in `res`.
wv.similar_by_vector(res)

[('actor', 0.9755123257637024),
 ('performance', 0.8798442482948303),
 ('role', 0.8682138919830322),
 ('guy', 0.8359900116920471),
 ('actress', 0.8345870971679688),
 ('man', 0.8162053227424622),
 ('character', 0.8144716024398804),
 ('job', 0.8142083883285522),
 ('acquaintance', 0.7731799483299255),
 ('admiration', 0.7702587246894836)]

# Word2Vec hyperparameters

Train a new word2vec_2 model with a custom vector_size, then verify on some words that the embedding size is the one you chose.

In [13]:
# Same corpus, but asking for 50-dimensional embeddings.
word2vec_2 = Word2Vec(sentences=X_train, vector_size=50)
wv2 = word2vec_2.wv
# Should display the requested size (50).
len(wv2['science'])

50

❓ Use the Word2Vec.wv.key_to_index attribute to display the size of the learned vocabulary.

In [14]:
# Number of words the model actually kept in its vocabulary.
print('Vocabulary size', len(wv2.key_to_index))

# Number of distinct tokens present in the tokenized training reviews.
diff_words = {token for review in X_train for token in review}
print('Number of different words in the train set', len(diff_words))

Vocabulary size 8006
Number of different words in the train set 30419


❓ Train a new word2vec_3 model with a min_count higher than 5 (the default value) and a word2vec_4 model with a min_count lower than 5.

In [15]:
# Two more models that differ only by min_count: one above the default, one below.
word2vec_3 = Word2Vec(sentences=X_train, vector_size=50, min_count=21)
word2vec_4 = Word2Vec(sentences=X_train, vector_size=50, min_count=4)

# One report line per model trained so far, numbered from 1.
print(
    *(
        f'Number of word in W2V #{rank} : {len(vectors.key_to_index)}'
        for rank, vectors in enumerate((wv, wv2, word2vec_3.wv, word2vec_4.wv), start=1)
    ),
    sep='\n',
)

Number of word in W2V #1 : 8006
Number of word in W2V #2 : 8006
Number of word in W2V #3 : 2444
Number of word in W2V #4 : 9584


❓ Train a new word2vec_5 model with a window size different from the one used previously (the default is 5).

In [16]:
# Non-default context window (11), combined with 50-dim vectors and min_count=40.
word2vec_5 = Word2Vec(sentences=X_train, vector_size=50, min_count=40, window=11)

# Convert our train and test sets to RNN-ready datasets

In [17]:
# Toy tokenized sentence made of eight everyday words.
example = ['this', 'movie', 'is', 'the', 'worst', 'action', 'movie', 'ever']
# Same idea, but the last token is a made-up word.
example_missing_words = ['this', 'movie', 'is', 'laaaaaaaaaame']

def embed_sentence(word2vec, sentence):
    """Turn a tokenized sentence into a matrix of word vectors.

    Words that are not in the model's vocabulary are skipped, so the result
    may have fewer rows than the sentence has words.

    Args:
        word2vec: trained model exposing its word vectors as ``word2vec.wv``.
        sentence: iterable of words (strings).

    Returns:
        np.ndarray of shape ``(n_known_words, vector_size)``, one row per known
        word, in sentence order. When no word is known (or the sentence is
        empty) an empty ``(0, vector_size)`` array is returned, so the result
        is always 2-D.
    """
    wv = word2vec.wv
    vectors = [wv[word] for word in sentence if word in wv]

    if not vectors:
        # np.array([]) would be 1-D with shape (0,); keep the matrix layout so
        # callers can rely on shape[1] even for fully out-of-vocabulary input.
        # float32 is assumed to match the dtype of the model's vectors — TODO confirm.
        return np.empty((0, wv.vector_size), dtype=np.float32)

    return np.array(vectors)

### Checks
# Every word of `example` is expected to be known: 8 rows of 100 values.
embedded_sentence = embed_sentence(word2vec, example)
assert type(embedded_sentence) is np.ndarray
assert embedded_sentence.shape == (8, 100)

# The made-up last word must be skipped: only 3 rows remain.
embedded_sentence_missing_words = embed_sentence(word2vec, example_missing_words)
assert type(embedded_sentence_missing_words) is np.ndarray
assert embedded_sentence_missing_words.shape == (3, 100)

In [18]:
def embedding(word2vec, sentences):
    """Embed every sentence of a corpus.

    Args:
        word2vec: trained model, forwarded as-is to ``embed_sentence``.
        sentences: iterable of tokenized sentences.

    Returns:
        A list holding, in input order, the result of ``embed_sentence`` for
        each sentence.
    """
    return [embed_sentence(word2vec, tokens) for tokens in sentences]
        
    
# Replace each tokenized review with its matrix of word vectors.
# NOTE(review): X_train / X_test are rebound from lists of words to lists of
# arrays, so this cell is not safe to run twice, and earlier cells that expect
# tokens (e.g. Word2Vec training) must not be re-run after it.
X_train = embedding(word2vec, X_train)
X_test = embedding(word2vec, X_test)

In [19]:
# Pad at the end of each sequence ('post') so every review of a split has the
# same length. No maxlen is given and each split is padded on its own, so the
# two arrays are not guaranteed to share the same sequence length.
X_train_pad = pad_sequences(X_train, padding='post', dtype='float32')
X_test_pad = pad_sequences(X_test, padding='post', dtype='float32')

# Both results must be 3-D with 100 values on the last axis.
assert X_train_pad.ndim == 3 and X_test_pad.ndim == 3
assert X_train_pad.shape[2] == 100 and X_test_pad.shape[2] == 100