# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Widen column display to 100 characters so message text is readable when shown
pd.set_option('display.max_colwidth', 100)

# Load the spam dataset (read with latin-1 encoding), discard the three
# "Unnamed" columns, and give the two remaining columns descriptive names
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ["label", "text"]

# Clean each message into a list of tokens with gensim's simple_preprocess
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing. random_state is fixed so that the
# same rows land in train/test on every Restart & Run All; without it every
# downstream output in this notebook changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=42)

# Train word2vec on the training messages only: 100-dimensional vectors,
# a context window of 5, and min_count=2 (words seen fewer than 2 times get no vector).
# NOTE: `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

### Prep Word Vectors

In [2]:
# Generate a list of words the word2vec model learned word vectors for
# (the whole list is displayed as the cell output).
# NOTE: `index2word` is the gensim 3.x attribute name; gensim 4 renamed it to `index_to_key`.
w2v_model.wv.index2word

['to',
 'you',
 'the',
 'and',
 'is',
 'in',
 'me',
 'my',
 'it',
 'for',
 'your',
 'of',
 'that',
 'call',
 'have',
 'on',
 'are',
 'now',
 'can',
 'so',
 'not',
 'but',
 'or',
 'do',
 'we',
 'at',
 'be',
 'get',
 'no',
 'if',
 'will',
 'just',
 'ur',
 'with',
 'this',
 'how',
 'lt',
 'gt',
 'up',
 'ok',
 'when',
 'go',
 'out',
 'll',
 'what',
 'from',
 'all',
 'free',
 'know',
 'got',
 'like',
 'am',
 'then',
 'he',
 'good',
 'there',
 'come',
 'its',
 'day',
 'time',
 'love',
 'only',
 'was',
 ...

In [3]:
# Generate, for every test message, the array of word vectors of its tokens.
# Tokens the model has no vector for are skipped, so a message can end up with
# fewer vectors than tokens (or none at all).
word_vectors = w2v_model.wv
# Build the vocabulary set once: membership tests against the index2word list
# would otherwise be a linear scan for every single token.
known_words = set(word_vectors.index2word)
per_message_vectors = [np.array([word_vectors[word] for word in tokens if word in known_words])
                       for tokens in X_test]

# Store the per-message arrays in an explicit 1-D object array. Passing a ragged
# list straight to np.array() raises ValueError on recent NumPy versions, and
# would silently become a 3-D array if every message kept the same number of tokens.
w2v_vect = np.empty(len(per_message_vectors), dtype=object)
for position, vectors in enumerate(per_message_vectors):
    w2v_vect[position] = vectors

In [4]:
# For each test message, print its token count next to the number of word vectors
# collected for it. Why do the two numbers sometimes differ?
for tokens, vectors in zip(X_test, w2v_vect):
    print(len(tokens), len(vectors))

7 7
9 7
5 5
14 14
12 9
9 9
10 10
30 29
17 16
7 7
14 14
7 6
5 4
14 11
5 5
27 24
14 14
22 19
8 8
5 5
9 9
6 6
7 6
7 7
21 18
30 30
3 3
3 3
18 17
6 6
8 8
9 7
7 5
22 22
9 9
9 9
4 4
24 18
17 13
17 17
30 29
27 24
5 5
20 20
4 4
5 5
20 19
23 23
19 16
14 14
30 26
20 19
15 14
19 17
4 4
10 10
27 25
25 25
30 27
23 21
27 27
8 8
5 5
6 5
9 9
13 12
12 12
6 6
15 13
21 18
25 25
9 8
26 21
24 23
29 27
27 25
18 18
4 4
9 9
22 22
14 13
20 18
11 9
14 13
8 8
9 9
28 27
10 10
4 1
16 12
6 5
14 12
16 16
16 15
9 8
19 19
3 1
4 3
7 7
10 9
5 4
5 5
3 1
6 6
15 9
10 9
11 9
8 6
21 20
50 44
14 13
26 21
9 8
24 21
6 6
17 11
9 9
8 7
8 6
6 5
15 13
4 4
24 22
7 7
6 4
28 28
4 4
8 8
22 18
9 8
22 21
7 7
26 25
15 11
22 19
8 7
16 12
6 6
4 4
25 25
18 18
9 8
15 15
25 24
4 4
27 27
16 13
9 9
11 11
9 7
7 7
13 12
3 3
27 26
11 10
8 8
12 8
8 8
26 21
9 9
3 3
25 20
9 8
1 1
26 26
15 15
22 22
26 22
24 22
10 9
8 7
8 8
5 5
12 8
18 15
10 9
9 9
21 21
19 17
10 7
9 8
8 8
15 15
9 9
6 6
16 15
14 14
7 7
9 7
18 16
9 9
6 6
5 5
16 13
8 8
4 4
21 20
6 6
5 4
27 

In [5]:
# Compute one fixed-length sentence vector per message by averaging the word
# vectors of the words it contains. A message with no known words has nothing
# to average, so it gets an all-zero vector instead.
# The dimensionality is read from the trained model rather than hard-coded, so
# the fallback stays the right length if the model's vector size is changed.
embedding_dim = w2v_model.wv.vector_size

w2v_vect_avg = [
    vect.mean(axis=0) if len(vect) != 0 else np.zeros(embedding_dim)
    for vect in w2v_vect
]

In [6]:
# Print each test message's token count beside the length of its averaged sentence vector
for tokens, sentence_vector in zip(X_test, w2v_vect_avg):
    print(len(tokens), len(sentence_vector))

7 100
9 100
5 100
14 100
12 100
9 100
10 100
30 100
17 100
7 100
14 100
7 100
5 100
14 100
5 100
27 100
14 100
22 100
8 100
5 100
9 100
6 100
7 100
7 100
21 100
30 100
3 100
3 100
18 100
6 100
8 100
9 100
7 100
22 100
9 100
9 100
4 100
24 100
17 100
17 100
30 100
27 100
5 100
20 100
4 100
5 100
20 100
23 100
19 100
14 100
30 100
20 100
15 100
19 100
4 100
10 100
27 100
25 100
30 100
23 100
27 100
8 100
5 100
6 100
9 100
13 100
12 100
6 100
15 100
21 100
25 100
9 100
26 100
24 100
29 100
27 100
18 100
4 100
9 100
22 100
14 100
20 100
11 100
14 100
8 100
9 100
28 100
10 100
4 100
16 100
6 100
14 100
16 100
16 100
9 100
19 100
3 100
4 100
7 100
10 100
5 100
5 100
3 100
6 100
15 100
10 100
11 100
8 100
21 100
50 100
14 100
26 100
9 100
24 100
6 100
17 100
9 100
8 100
8 100
6 100
15 100
4 100
24 100
7 100
6 100
28 100
4 100
8 100
22 100
9 100
22 100
7 100
26 100
15 100
22 100
8 100
16 100
6 100
4 100
25 100
18 100
9 100
15 100
25 100
4 100
27 100
16 100
9 100
11 100
9 100
7 100
13 100
3 100

In [None]:
# Are our sentence vector lengths consistent?