# doc2vec: How To Prep Document Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in data, clean it, split it into train/test, and then train a doc2vec model
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# One seed shared by the train/test split and the model so that a
# Restart & Run All produces the same split every time.
SEED = 42

messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
# Drop the three unnamed columns and name the two that remain.
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
# Tokenize every message; each row of `text_clean` becomes a list of tokens.
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

# Hold out 20% of the messages for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2,
                                                    random_state=SEED)

# Doc2Vec trains on TaggedDocument objects; tag each training document with
# its position in X_train.
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]

# NOTE: `seed` fixes the model's random initialisation, but gensim only
# guarantees bit-for-bit reproducible training when workers=1 and
# PYTHONHASHSEED is also fixed; neither is changed here.
d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=50,
                                  window=2,
                                  min_count=2,
                                  seed=SEED)

In [2]:
# What does a document vector look like again?
# Infer a vector for a small, hand-written list of tokens and display it.
sample_tokens = ['convert', 'words', 'to', 'vectors']
d2v_model.infer_vector(sample_tokens)

array([ 0.00527109, -0.01324665, -0.0096915 ,  0.00193137, -0.00092878,
        0.00680627,  0.01172902, -0.00583484, -0.00340428, -0.00950179,
       -0.02687154, -0.00575173, -0.0096828 ,  0.01267449,  0.02080682,
        0.00128518, -0.0040912 ,  0.00603617,  0.00493593, -0.01808688,
       -0.02640009,  0.00496611,  0.00656763, -0.00777842, -0.0059013 ,
        0.00717304, -0.00413813, -0.00381236,  0.01665859,  0.01520532,
        0.0019713 , -0.00056722,  0.01056363, -0.01979274, -0.00897815,
       -0.00242522,  0.00242939, -0.00712625, -0.00102019, -0.00054327,
       -0.01101292,  0.01834851, -0.0149589 , -0.03397872, -0.00235777,
        0.02958577, -0.00066346,  0.00220888,  0.01940633,  0.00222694],
      dtype=float32)

In [3]:
# How do we prepare these vectors to be used in a machine learning model?
# Infer one vector per tokenized test document. Each inferred vector is kept
# inside its own single-element list, so `vectors` is a list of [array] entries.
vectors = list(map(lambda tokens: [d2v_model.infer_vector(tokens)], X_test))

In [4]:
# Display the first prepared entry: a one-element list holding the inferred
# vector for the first test document.
vectors[0]

[array([ 0.00024286,  0.00290922, -0.0019285 ,  0.00167686, -0.0027719 ,
        -0.00435879,  0.02320793, -0.00106661,  0.01115903, -0.02428407,
        -0.01739795, -0.01271261, -0.01628193,  0.00321393,  0.01945039,
         0.00528758, -0.01312716, -0.00535798, -0.00218258, -0.01229463,
        -0.03317567, -0.00173265,  0.00933457, -0.0108058 , -0.00857016,
         0.0089884 , -0.00120885, -0.00144213,  0.00501392,  0.01367759,
         0.00675126,  0.005638  , -0.00581592, -0.01293811, -0.00633355,
        -0.01342575,  0.00457624,  0.00598609, -0.0045976 ,  0.00517329,
        -0.02318596,  0.02657969, -0.00663634, -0.02737018, -0.00637147,
         0.0205109 , -0.00754566, -0.00627113,  0.00515462,  0.00373692],
       dtype=float32)]