##### Train Our Model

In [3]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Widen the column display limit to 100 characters so message text is readable
# when frames are shown in cell output.
pd.set_option('display.max_colwidth', 100)

# Load the raw dataset (read with latin-1 encoding), discard the three
# "Unnamed" columns, and give the remaining two columns descriptive names.
messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ['label', 'text']

# Tokenize every message with gensim's simple_preprocess; the function is
# passed directly rather than through a pass-through lambda.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing. random_state is pinned so the split
# (and everything trained on it) is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=42)


Create tagged document objects to prepare to train the model

In [7]:
# Wrap each tokenized training message in a TaggedDocument whose single tag is
# the message's position within X_train.
tagged_docs = [
    gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(X_train)
]

Look at what a tagged document looks like

In [8]:
# Peek at the first tagged document to confirm its words/tags structure.
tagged_docs[0]

TaggedDocument(words=['shall', 'send', 'that', 'exe', 'to', 'your', 'mail', 'id'], tags=[0])

Train a basic doc2vec model

In [11]:
# Build a Doc2Vec model from the tagged training documents. Supplying the
# documents to the constructor trains the model as part of construction
# (per the gensim docs).
#   vector_size=100 -> length of each learned document vector
#   window=5        -> context window size used during training
#   min_count=2     -> presumably ignores words seen fewer than 2 times;
#                      confirm against the gensim docs
d2v_model = gensim.models.Doc2Vec(
    documents=tagged_docs,
    vector_size=100,
    window=5,
    min_count=2,
)

In [12]:
# Infer a vector for a short, already-tokenized example sentence with the trained model.
d2v_model.infer_vector(['i', 'am', 'learning', 'nlp'])

array([ 0.00716958,  0.00027503, -0.00260676, -0.00400441,  0.00180881,
       -0.00657149,  0.00306373, -0.00191726, -0.00041694,  0.00259314,
        0.00077765,  0.00051703,  0.01558096, -0.0003822 , -0.00184831,
       -0.00262939, -0.00449907, -0.00070632,  0.0031354 , -0.0003083 ,
       -0.00397278, -0.00801584,  0.00299628,  0.00597374, -0.00569987,
       -0.00201413, -0.00558803,  0.00230666,  0.00254088, -0.00104074,
        0.00310108,  0.00323467,  0.00287635,  0.00685418,  0.00202712,
        0.00733043, -0.00119335, -0.00386045, -0.00321012, -0.0066831 ,
       -0.00194252, -0.00084058,  0.00326964,  0.01052613, -0.00156975,
        0.00132614, -0.00112078,  0.00865758, -0.00095231,  0.00175074,
       -0.00172746, -0.008258  , -0.00051792,  0.00466289,  0.00263454,
       -0.0034246 , -0.00522846, -0.00471878, -0.00855974, -0.00336784,
        0.00545612,  0.00527416, -0.00261032,  0.00319114, -0.00574096,
        0.00428627, -0.00632285,  0.00458814,  0.00895855, -0.00

In [13]:
# Infer a vector for every held-out message. Each inferred array is kept inside
# its own one-element list, so `vector` is a list of `[ndarray]` entries.
vector = [
    [d2v_model.infer_vector(tokens)]
    for tokens in X_test
]

In [14]:
# Display the first entry of `vector` (a one-element list holding the inferred array).
vector[0]

[array([ 7.3615662e-03,  4.7263579e-04,  1.4440301e-04, -5.6174663e-03,
         1.6102948e-03, -2.7893861e-03, -1.5238228e-03, -2.6911905e-04,
         6.8386021e-04,  2.8768952e-03, -4.5528868e-04, -3.0214808e-04,
         1.8059995e-03, -2.2595346e-03, -3.9907512e-03,  5.1111056e-05,
        -4.7806925e-03, -1.4999351e-03, -1.4187311e-03,  5.0559820e-04,
         3.9020044e-04,  2.1625794e-05,  5.4028135e-04, -5.1952229e-04,
        -8.4310919e-03,  2.5066196e-03, -4.6499725e-03, -5.1617906e-03,
         2.5402696e-03, -3.3760569e-03,  1.5057880e-03, -4.2080422e-04,
         2.8520559e-03,  6.0501909e-03, -3.1117396e-03,  3.2398889e-03,
        -1.9732271e-03, -4.8452956e-03, -1.1368145e-03,  2.1492979e-03,
        -6.5347989e-04, -8.7599270e-04,  3.7420907e-03,  3.3858130e-03,
         1.7392814e-04,  1.0918990e-03,  3.5230049e-03, -1.9903148e-03,
        -3.0583872e-03, -5.3280089e-03,  1.8078005e-03,  1.2463904e-03,
         2.2062154e-03,  1.5298416e-03,  4.6547242e-03, -9.11605