# doc2vec: How To Prep Document Vectors For Modeling

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# later cells can read files stored under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Train Our Own Model

In [2]:
# Read in data, clean it, split it into train/test, and then train a doc2vec model
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Fixed seed so the train/test split and the model initialisation do not
# change on every "Restart & Run All".
SEED = 42

# Location of the spam dataset on the mounted Drive; edit here if the file moves.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/spam.csv'

# Load the raw CSV, discard the three unnamed columns, and give the two
# remaining columns readable names.
messages = pd.read_csv(DATA_PATH, encoding='latin-1')
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ["label", "text"]

# Tokenize every message into a list of lowercase word tokens.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

# doc2vec needs every training document paired with a tag; the document's
# position in the training set is used as that tag.
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(words, [idx])
                  for idx, words in enumerate(X_train)]

# Train 50-dimensional document vectors. `seed` fixes the initial weights;
# gensim notes that fully deterministic training additionally requires
# workers=1 (and a fixed PYTHONHASHSEED), which is left off here to keep
# training fast.
d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=50,
                                  window=2,
                                  min_count=2,
                                  seed=SEED)

In [3]:
# What does a document vector look like again?
# infer_vector is given an already-tokenized document (a list of word strings)
# and returns one vector for it; with vector_size=50 the result has 50 entries.
d2v_model.infer_vector(['convert', 'words', 'to', 'vectors'])

array([-0.0078297 , -0.00277094,  0.01513362,  0.00480509, -0.0215268 ,
       -0.02120888,  0.02670061,  0.05044198, -0.0477617 , -0.02275492,
        0.00566442, -0.04002783,  0.00234667,  0.00883718, -0.03751437,
        0.01065594,  0.03408942,  0.0048598 , -0.03334651, -0.03162771,
       -0.00119622,  0.04944837,  0.03699635, -0.00558524,  0.03139228,
        0.0174937 , -0.01291311,  0.00772637, -0.02998761, -0.00522621,
        0.0118029 ,  0.00726539,  0.00287492,  0.01703115, -0.01129664,
        0.04264465,  0.03310277, -0.00802978,  0.018839  , -0.02575333,
        0.0332091 , -0.0016696 , -0.00618301, -0.0056473 ,  0.07271693,
        0.00403074,  0.0032322 , -0.03469745,  0.01348958,  0.03071037],
      dtype=float32)

In [4]:
# How do we prepare these vectors to be used in a machine learning model?
# Infer one document vector per tokenized test message, in X_test order.
# Each vector is kept inside its own single-element list, so vectors[i] is
# [array] rather than a bare array.
vectors = list(map(lambda tokens: [d2v_model.infer_vector(tokens)], X_test))

In [5]:
# Inspect the first entry: a one-element list holding the vector inferred
# for the first test message.
vectors[0]

[array([ 3.16787814e-03, -7.71031016e-03, -1.86633077e-02,  7.71376910e-03,
        -2.31629089e-02, -9.37076472e-03, -4.06177435e-03, -6.32034382e-03,
         1.18992245e-02,  1.20797409e-02,  3.06752566e-02, -7.99759346e-06,
         7.97771011e-03, -4.98431455e-03, -4.03861515e-02, -6.11662446e-03,
         3.28985252e-03, -3.21852043e-02, -3.98042947e-02, -7.48847099e-03,
         7.93356914e-03,  3.87331396e-02,  2.13802438e-02, -2.29957160e-02,
         7.96783424e-04, -4.40560561e-03,  1.40241059e-02, -1.16359601e-02,
        -4.93877195e-02, -1.81492176e-02, -1.02791714e-03, -1.32846516e-02,
        -2.41765636e-03, -9.98300966e-04, -1.22824181e-02, -9.38972831e-03,
         8.61805398e-03, -8.95181764e-03, -3.59132849e-02,  1.99643038e-02,
        -2.06348188e-02,  1.55792776e-02, -1.98515020e-02, -1.22038452e-02,
         3.14769112e-02, -1.25221517e-02, -4.82630730e-03, -8.54511838e-03,
         1.57211442e-02,  9.90661327e-03], dtype=float32)]