# Compare NLP Techniques: Build Model On doc2vec Vectors

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Read In Cleaned Text

In [2]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/X_train.csv')
X_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/X_test.csv')
y_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/y_train.csv')
y_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/y_test.csv')


### Create doc2vec Vectors

In [3]:
# Created TaggedDocument vectors for each text message in the training and test sets
tagged_docs_train = [gensim.models.doc2vec.TaggedDocument(v, [i])
                     for i, v in enumerate(X_train['clean_text'])]
tagged_docs_test = [gensim.models.doc2vec.TaggedDocument(v, [i])
                    for i, v in enumerate(X_test['clean_text'])]

In [4]:
# What do these TaggedDocument objects look like?
tagged_docs_train[:10]

[TaggedDocument(words="['let', 'know', 'details', 'fri', 'u', 'find', 'cos', 'im', 'tom', 'fri', 'mentionned', 'chinese', 'thanks']", tags=[0]),
 TaggedDocument(words="['private', '2003', 'account', 'statement', 'shows', '800', 'unredeemed', 'points', 'call', '08718738002', 'identifier', 'code', '48922', 'expires', '211104']", tags=[1]),
 TaggedDocument(words="['enjoy', 'showers', 'possessiveness', 'poured', 'u', 'ur', 'loved', 'ones', 'bcoz', 'world', 'lies', 'golden', 'gift', 'loved', 'truly']", tags=[2]),
 TaggedDocument(words="['beautiful', 'truth', 'gravity', 'read', 'carefully', 'heart', 'feels', 'light', 'someone', 'feels', 'heavy', 'someone', 'leaves', 'goodmorning']", tags=[3]),
 TaggedDocument(words="['onum', 'ela', 'pa', 'normal']", tags=[4]),
 TaggedDocument(words="['wake', 'gt']", tags=[5]),
 TaggedDocument(words="['im', 'saying', 'havent', 'explicitly', 'told', 'nora', 'know', 'someone', 'im', 'probably', 'gonna', 'bother']", tags=[6]),
 TaggedDocument(words="['yeah', 'su

In [5]:
# Train a basic doc2vec model
d2v_model = gensim.models.Doc2Vec(tagged_docs_train,
                                  vector_size=100,
                                  window=5,
                                  min_count=2)



In [6]:
# Infer the vectors to be used in training and testing
train_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_train]
test_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_test]

### Fit RandomForestClassifier On Top Of Document Vectors

In [7]:
# Fit a basic model, make predictions on the holdout test set, and the generate the evaluation metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

rf = RandomForestClassifier()
rf_model = rf.fit(train_vectors, y_train.values.ravel())

y_pred = rf_model.predict(test_vectors)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.889 / Recall: 0.373 / Accuracy: 0.909
