# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

### Create word2vec Vectors

In [2]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [3]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index2word)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

In [4]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [5]:
# What does the unaveraged version look like?
X_train_vect[0]

array([[-1.60741829e-03, -1.66540162e-03, -3.25174304e-03,
        -2.55734846e-03,  3.87599762e-03,  3.46452720e-03,
        -2.57457444e-03, -5.75640879e-04,  1.26546843e-03,
        -3.06630437e-03,  1.50255859e-03, -2.61150207e-03,
        -3.41046439e-03, -3.94531805e-03, -4.41726902e-03,
         2.68241391e-03,  1.04345183e-03, -5.82961773e-04,
         2.34960578e-03,  4.96239029e-03,  1.06768613e-03,
        -4.61770501e-03, -4.09188820e-03, -8.99009989e-04,
        -2.14931020e-03, -2.84150126e-03,  1.39768166e-03,
         2.62276991e-03,  3.15754279e-03, -8.24615709e-04,
        -2.26608943e-03, -3.11421533e-03, -2.63323169e-03,
        -4.21910966e-03,  8.12176731e-04,  8.72491626e-04,
        -3.38562881e-03, -9.11505951e-04,  1.74480805e-03,
         9.35953576e-04,  1.11797731e-03, -4.82660392e-03,
         3.81157571e-03, -2.25280784e-03,  4.91533196e-03,
        -3.87180306e-04, -3.17591126e-03,  3.12580611e-03,
        -5.96788421e-04, -1.32439134e-03,  9.41139646e-0

In [6]:
# What does the averaged version look like?
X_train_vect_avg[0]

array([ 2.0047789e-04, -5.5593654e-04, -1.0708438e-03, -2.6599313e-03,
        3.8005027e-03,  1.8837703e-03, -1.9359417e-03, -9.0988475e-04,
        5.2755844e-04, -3.0881981e-03,  1.7102920e-03, -2.2496446e-03,
       -3.6792406e-03, -2.8155667e-03, -3.7872519e-03,  1.6139812e-03,
       -8.1926220e-05,  4.8857508e-04,  4.8878032e-04,  3.4837343e-03,
        1.2744690e-03, -2.4670293e-03, -2.0363089e-03,  2.2979137e-04,
       -1.2316998e-03, -1.4706661e-03,  1.2405916e-03,  1.3903093e-03,
        2.1335792e-03, -6.6308613e-04, -5.8479223e-04, -3.5975876e-03,
       -9.8808017e-04, -3.5379273e-03,  1.7657025e-03, -3.1081698e-04,
       -3.3806714e-03,  3.3291901e-04,  2.0405967e-03,  1.0657371e-03,
        1.5918588e-03, -4.2560836e-03,  2.8921836e-03, -2.9613615e-03,
        4.1815271e-03, -6.4165483e-04, -2.7308725e-03,  2.1904677e-03,
        9.6267386e-04, -1.1036835e-03,  7.0383586e-04, -3.8135813e-03,
       -1.3963787e-03, -4.2559468e-04, -3.5408472e-03,  7.3037029e-04,
      

### Fit RandomForestClassifier On Top Of Word Vectors

In [7]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [8]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [9]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.596 / Recall: 0.211 / Accuracy: 0.877
