## word2vec: How to Prep Word Vectors For Modeling

### Train Our Own Model

In [2]:
# Read in the data, clean it, split it into train and test sets, then train a word2vec model.
import gensim
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
# Raise pandas' column display width to 100 so message text is less truncated when shown.
pd.set_option('display.max_colwidth', 100)

# Load the raw data, drop the three 'Unnamed' columns, and give the two
# remaining columns readable names.
messages = pd.read_csv('data/spam.csv', encoding='latin-1').drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
messages.columns = ['label', 'text']

# Tokenize every message. simple_preprocess is passed directly; wrapping it
# in a lambda added nothing.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages as a test set. random_state pins the shuffle so
# that a Restart & Run All reproduces the same split (and therefore the same
# vocabulary and downstream outputs) instead of a new one on every run.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'], messages['label'],
                                                    test_size=0.2, random_state=42)

# Train word2vec on the training messages only, so the test set stays unseen.
# NOTE(review): `size` is the gensim 3.x keyword, matching the `index2word`
# attribute used elsewhere in this notebook; newer gensim releases are believed
# to have renamed it to `vector_size` - confirm before upgrading gensim.
w2v_model = gensim.models.Word2Vec(X_train,
                                   window=5,
                                   size=100,
                                   min_count=2)

### Prep Word Vectors

In [3]:
# Show the model's vocabulary: the words from the training set that the trained
# word2vec model holds a vector for.
# NOTE(review): the model was built with min_count=2, so rarer training words
# are presumably absent from this list - confirm against the gensim docs.
w2v_model.wv.index2word

['to',
 'you',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'for',
 'your',
 'of',
 'call',
 'have',
 'that',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'or',
 'not',
 'at',
 'we',
 'get',
 'do',
 'be',
 'ur',
 'will',
 'just',
 'with',
 'if',
 'no',
 'this',
 'how',
 'up',
 'ok',
 'lt',
 'gt',
 'what',
 'from',
 'when',
 'free',
 'out',
 'all',
 'go',
 'know',
 'll',
 'good',
 'he',
 'day',
 'then',
 'like',
 'got',
 'love',
 'was',
 'come',
 'there',
 'its',
 'am',
 'time',
 'only',
 'send',
 'want',
 'text',
 'going',
 'as',
 'txt',
 'about',
 'one',
 'lor',
 'by',
 'need',
 'today',
 'home',
 'don',
 'she',
 'see',
 'back',
 'still',
 'stop',
 'reply',
 'tell',
 'hi',
 'our',
 'mobile',
 'they',
 'da',
 'sorry',
 'dont',
 'please',
 'think',
 'any',
 'pls',
 'phone',
 'take',
 'did',
 'here',
 'her',
 'later',
 'dear',
 'new',
 'been',
 'some',
 'week',
 'oh',
 'has',
 'well',
 're',
 'much',
 'an',
 'msg',
 'ì_',
 'happy',
 'where',
 'who',
 'had',
 'more',
 'hey',
 'gr

In [4]:
# Look up the word vectors for every test sentence, keeping only the words the
# model has a vector for (words outside the vocabulary are skipped).
# The vocabulary is copied into a set once, so each membership test is a hash
# lookup rather than a scan of the whole index2word list for every token.
known_words = set(w2v_model.wv.index2word)
sentence_word_vectors = [np.array([w2v_model.wv[word] for word in sentence if word in known_words])
                         for sentence in X_test]

# Sentences keep different numbers of words, so the collection is ragged.
# Fill a 1-D object array slot by slot: NumPy then never has to infer a
# rectangular shape from the nested arrays, which recent NumPy versions refuse
# to do implicitly for ragged input.
w2v_vector = np.empty(len(sentence_word_vectors), dtype=object)
for position, word_vectors in enumerate(sentence_word_vectors):
    w2v_vector[position] = word_vectors

In [6]:
# Peek at only the first two entries of w2v_vector rather than displaying the whole array.
w2v_vector[:2]

array([array([[-2.44618766e-03,  9.47132986e-03,  1.14064571e-02,
        -8.88752751e-03, -1.11107565e-02,  5.39666926e-03,
        -7.99187273e-03,  1.61998793e-02,  8.19164794e-03,
         7.64484005e-03,  1.58768904e-03, -7.66565325e-03,
         4.00123280e-03,  7.95783848e-03,  1.53262289e-02,
         1.06490813e-02, -2.15876866e-02, -9.23308451e-03,
        -5.01833018e-03, -8.34206305e-03, -2.65748743e-02,
         1.60395615e-02, -1.54432119e-03,  2.10749060e-02,
        -1.37205431e-02,  7.80236442e-03,  1.93993766e-02,
         1.01200538e-03,  1.53130144e-02,  1.11633688e-02,
         1.19761238e-02, -6.89264573e-03,  4.21541324e-03,
        -2.34597293e-03,  1.07810842e-02,  8.39868281e-03,
        -6.23097643e-03,  2.28636456e-03, -8.19521118e-03,
        -1.06208762e-02, -1.82849891e-03,  4.76539321e-03,
        -6.57498743e-03,  1.07365772e-02, -5.90915047e-03,
        -4.26592864e-03, -2.72894395e-03,  6.30639610e-04,
         2.62111216e-03,  4.58114548e-03, -4.0979

In [22]:
# Print, for each test sentence, its token count next to the number of word
# vectors kept for it. The second number can be smaller because tokens that are
# missing from the model's vocabulary were skipped during the vector lookup.
for tokens, word_vectors in zip(X_test, w2v_vector):
    print(len(tokens), len(word_vectors))

6 6
24 24
5 5
9 7
4 1
59 57
5 4
15 11
8 5
9 7
5 4
13 13
17 11
30 25
20 19
22 21
9 5
22 22
10 10
13 13
26 26
5 5
58 53
7 7
13 12
4 3
6 5
25 24
30 27
9 8
18 16
23 22
8 7
8 8
7 5
8 6
22 21
5 5
26 18
8 8
9 7
12 11
7 7
14 14
6 5
10 10
30 28
20 17
27 26
20 18
10 10
13 11
21 15
14 12
5 5
32 29
21 18
8 8
18 17
17 13
28 18
23 20
14 12
5 3
5 5
11 10
5 5
23 22
30 25
1 1
6 6
17 16
5 4
11 10
6 6
3 1
7 6
8 2
17 17
24 24
5 5
6 3
18 16
8 8
6 6
12 12
1 1
21 20
6 6
41 38
11 11
7 7
4 4
8 8
15 15
10 8
7 6
4 4
4 4
3 3
13 13
10 10
22 22
7 7
3 3
50 43
13 11
18 16
6 5
13 12
8 4
10 9
19 17
14 13
14 10
9 9
7 7
5 4
5 5
26 26
6 3
18 16
6 6
18 14
9 9
11 11
11 9
22 20
7 7
26 26
30 28
12 12
23 22
11 10
5 5
7 7
10 10
9 9
7 7
8 8
7 5
22 18
27 23
5 5
8 8
30 29
18 16
6 6
29 29
6 6
30 30
25 25
7 7
16 14
3 3
7 7
36 33
4 4
23 18
6 6
20 15
10 10
4 4
38 30
7 6
16 14
6 6
22 22
7 7
4 4
4 4
6 6
20 19
23 23
20 20
10 10
13 13
56 54
10 9
12 12
25 21
0 0
14 13
25 21
1 1
4 4
10 10
4 2
6 5
28 26
6 6
9 9
14 13
3 3
23 22
29 28
2 2
9 9


In [34]:
# Compute one fixed-length vector per sentence by averaging its word vectors.
# A sentence with no in-vocabulary words has nothing to average, so it gets an
# all-zero vector instead. Its length is read from the trained model rather
# than hard-coding 100, so it stays correct if the training dimensionality
# is ever changed.
w2v_vector_avg = [
    vect.mean(axis=0) if len(vect) != 0 else np.zeros(w2v_model.wv.vector_size)
    for vect in w2v_vector
]

In [35]:
# Check that every averaged sentence vector has the same length, no matter how
# many tokens the original sentence contained.
for tokens, sentence_vector in zip(X_test, w2v_vector_avg):
    print(len(tokens), len(sentence_vector))

6 100
24 100
5 100
9 100
4 100
59 100
5 100
15 100
8 100
9 100
5 100
13 100
17 100
30 100
20 100
22 100
9 100
22 100
10 100
13 100
26 100
5 100
58 100
7 100
13 100
4 100
6 100
25 100
30 100
9 100
18 100
23 100
8 100
8 100
7 100
8 100
22 100
5 100
26 100
8 100
9 100
12 100
7 100
14 100
6 100
10 100
30 100
20 100
27 100
20 100
10 100
13 100
21 100
14 100
5 100
32 100
21 100
8 100
18 100
17 100
28 100
23 100
14 100
5 100
5 100
11 100
5 100
23 100
30 100
1 100
6 100
17 100
5 100
11 100
6 100
3 100
7 100
8 100
17 100
24 100
5 100
6 100
18 100
8 100
6 100
12 100
1 100
21 100
6 100
41 100
11 100
7 100
4 100
8 100
15 100
10 100
7 100
4 100
4 100
3 100
13 100
10 100
22 100
7 100
3 100
50 100
13 100
18 100
6 100
13 100
8 100
10 100
19 100
14 100
14 100
9 100
7 100
5 100
5 100
26 100
6 100
18 100
6 100
18 100
9 100
11 100
11 100
22 100
7 100
26 100
30 100
12 100
23 100
11 100
5 100
7 100
10 100
9 100
7 100
8 100
7 100
22 100
27 100
5 100
8 100
30 100
18 100
6 100
29 100
6 100
30 100
25 100
7 100
