##### Train our own Model:

In [2]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Fixed seed for the train/test split so that the split -- and therefore the
# learned vocabulary and every output further down -- is the same on each
# Restart & Run All.
SEED = 42

# Load the raw CSV, discard the three unnamed columns and give the remaining
# two columns readable names.
messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ['label', 'text']

# Turn every message into a list of tokens with gensim's simple_preprocess.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages; the word2vec model is trained on the rest.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

# Train word vectors on the training tokens only: 100-dimensional vectors,
# context window of 5, ignoring words seen fewer than 2 times.
# NOTE(review): training itself may still vary slightly between runs when
# gensim uses several worker threads -- confirm against the gensim docs.
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

Generate a list of the words in the word2vec model's vocabulary

In [3]:
# Display the model's vocabulary: the list of words it holds a vector for.
# The output starts with very common words ('to', 'you', 'the', ...), which
# suggests the list is ordered by frequency -- TODO confirm.
w2v_model.wv.index2word

['to',
 'you',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'it',
 'your',
 'of',
 'call',
 'that',
 'have',
 'on',
 'can',
 'are',
 'now',
 'so',
 'not',
 'but',
 'or',
 'do',
 'at',
 'we',
 'get',
 'be',
 'will',
 'with',
 'if',
 'ur',
 'just',
 'no',
 'gt',
 'lt',
 'this',
 'up',
 'how',
 'ok',
 'free',
 'what',
 'when',
 'go',
 'all',
 'out',
 'from',
 'know',
 'll',
 'good',
 'day',
 'like',
 'was',
 'then',
 'got',
 'am',
 'come',
 'there',
 'he',
 'only',
 'its',
 'time',
 'text',
 'love',
 'want',
 'send',
 'one',
 'as',
 'need',
 'about',
 'txt',
 'by',
 'going',
 'she',
 'lor',
 'don',
 'today',
 'stop',
 'home',
 'still',
 'sorry',
 'reply',
 'see',
 'mobile',
 'pls',
 'dont',
 'take',
 'back',
 'tell',
 'our',
 'da',
 'new',
 'any',
 'later',
 'please',
 'think',
 'hi',
 'they',
 'her',
 'dear',
 'ì_',
 'been',
 'week',
 'much',
 'some',
 'did',
 'phone',
 'night',
 'well',
 'who',
 'hey',
 'hope',
 'oh',
 'happy',
 'has',
 'where',
 'here',
 'wat',
 'give',
 'claim

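Generate aggregated sentence vectors based on the word vectors for each word in the sentence. As a first step, collect the word vector of every in-vocabulary word of each test message; words the model has no vector for are skipped.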

In [4]:
# Words the model has a vector for. A set gives constant-time membership
# tests; checking `in` against the index2word list would rescan the whole
# list for every single token.
w2v_vocab = set(w2v_model.wv.index2word)

# For each test message, stack the vectors of its in-vocabulary tokens.
# Out-of-vocabulary tokens are skipped, so a message can end up with fewer
# rows than tokens (or with none at all).
sentence_word_vecs = [
    np.array([w2v_model.wv[token] for token in tokens if token in w2v_vocab])
    for tokens in X_test
]

# The per-message arrays have different numbers of rows, so keep them in an
# explicit 1-D object array filled element by element. Letting np.array()
# infer a shape from ragged input is deprecated and fails on newer NumPy.
w2v_vect = np.empty(len(sentence_word_vecs), dtype=object)
for idx, word_vecs in enumerate(sentence_word_vecs):
    w2v_vect[idx] = word_vecs

  


In [5]:
# For every test message print its token count next to the number of word
# vectors collected for it; the second number is smaller whenever some
# tokens were not in the model's vocabulary.
for tokens, word_vecs in zip(X_test, w2v_vect):
    print(len(tokens), len(word_vecs))

7 6
15 13
4 4
7 7
7 7
31 29
9 8
15 12
19 17
1 1
16 16
9 9
26 20
5 5
4 4
15 15
13 13
22 22
5 5
9 9
28 27
6 6
4 4
10 7
9 7
15 13
4 4
23 23
9 6
7 7
25 25
8 8
9 9
29 28
2 2
8 7
9 9
3 3
1 1
10 9
37 34
11 11
21 21
25 20
9 9
20 20
9 9
4 4
12 12
12 10
5 2
3 3
4 4
5 5
7 5
28 25
15 15
12 12
16 16
7 7
14 13
6 4
24 19
17 16
9 9
16 16
6 6
33 30
17 7
25 20
9 7
7 5
22 21
5 5
10 10
20 20
13 13
7 5
6 6
5 5
8 8
5 5
14 14
12 10
24 24
11 11
7 5
23 23
22 21
22 21
24 19
28 28
9 7
25 22
14 12
16 15
24 23
6 5
11 11
8 7
17 16
7 7
18 18
38 35
14 12
5 5
10 9
30 28
8 7
14 13
5 5
12 12
20 17
6 6
32 28
28 24
19 19
5 4
10 9
27 25
6 6
7 6
8 6
4 3
7 6
3 1
14 13
7 7
17 16
16 15
7 5
22 18
7 6
16 15
7 7
21 21
27 23
10 10
18 17
9 9
30 30
14 13
14 10
15 14
12 12
20 20
6 5
3 2
7 6
4 4
28 25
8 7
13 11
10 10
20 15
9 9
9 8
22 22
11 10
5 4
9 9
25 24
17 9
5 5
41 36
26 22
28 28
4 4
8 6
11 9
23 23
7 7
10 10
58 52
22 22
8 8
28 27
4 4
7 6
9 9
25 24
18 17
23 22
30 25
6 6
13 11
9 8
9 7
33 31
19 19
27 23
13 12
14 14
12 12
13 10
11 11
1

Compute sentence vectors by averaging the word vectors for the words contained in the sentence.

In [6]:
# Dimensionality of the word vectors, read from the trained model so the
# all-zero fallback below always matches the model instead of repeating a
# hard-coded 100.
vector_dim = w2v_model.vector_size

# One fixed-length vector per test message: the element-wise mean of its
# word vectors, or a zero vector when none of its tokens had a vector.
w2v_vect_avg = [
    vect.mean(axis=0) if len(vect) != 0 else np.zeros(vector_dim)
    for vect in w2v_vect
]

In [7]:
# Sanity check: whatever the number of tokens in a message, its averaged
# sentence vector should always have the same length.
for tokens, sentence_vec in zip(X_test, w2v_vect_avg):
    print(len(tokens), len(sentence_vec))

7 100
15 100
4 100
7 100
7 100
31 100
9 100
15 100
19 100
1 100
16 100
9 100
26 100
5 100
4 100
15 100
13 100
22 100
5 100
9 100
28 100
6 100
4 100
10 100
9 100
15 100
4 100
23 100
9 100
7 100
25 100
8 100
9 100
29 100
2 100
8 100
9 100
3 100
1 100
10 100
37 100
11 100
21 100
25 100
9 100
20 100
9 100
4 100
12 100
12 100
5 100
3 100
4 100
5 100
7 100
28 100
15 100
12 100
16 100
7 100
14 100
6 100
24 100
17 100
9 100
16 100
6 100
33 100
17 100
25 100
9 100
7 100
22 100
5 100
10 100
20 100
13 100
7 100
6 100
5 100
8 100
5 100
14 100
12 100
24 100
11 100
7 100
23 100
22 100
22 100
24 100
28 100
9 100
25 100
14 100
16 100
24 100
6 100
11 100
8 100
17 100
7 100
18 100
38 100
14 100
5 100
10 100
30 100
8 100
14 100
5 100
12 100
20 100
6 100
32 100
28 100
19 100
5 100
10 100
27 100
6 100
7 100
8 100
4 100
7 100
3 100
14 100
7 100
17 100
16 100
7 100
22 100
7 100
16 100
7 100
21 100
27 100
10 100
18 100
9 100
30 100
14 100
14 100
15 100
12 100
20 100
6 100
3 100
7 100
4 100
28 100
8 100
13 100