# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

# Read the four CSVs (features and labels for the train and test sets) in one
# pass; the unpacking order below matches the order of the paths
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../../../data/X_train.csv',
                     '../../../data/X_test.csv',
                     '../../../data/y_train.csv',
                     '../../../data/y_test.csv')
)

### Create word2vec Vectors

In [2]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [3]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index2word)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])

X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

In [4]:
# Average the word vectors for each message (and assign a vector of zeros if the
# model did not learn any of the words in the text message during training)
def average_word_vectors(message_vectors, vector_size):
    """Collapse each message's word vectors into one fixed-length vector.

    Args:
        message_vectors: iterable of arrays, one per message, each holding the
            word vectors found for that message (possibly empty).
        vector_size: length of the zero vector used for a message with no
            word vectors.

    Returns:
        list[np.ndarray]: one vector per message, in input order.
    """
    return [vectors.mean(axis=0) if vectors.size else np.zeros(vector_size, dtype=float)
            for vectors in message_vectors]


# Take the dimensionality from the model so the zero vectors always match the
# `size` the model was trained with
X_train_vect_avg = average_word_vectors(X_train_vect, w2v_model.wv.vector_size)
X_test_vect_avg = average_word_vectors(X_test_vect, w2v_model.wv.vector_size)

In [7]:
# What does the unaveraged version look like? Show the first training message:
# one row (word vector) per item of that message found in the model vocabulary
X_train_vect[0]

array([[-3.4339726e-03,  1.1271703e-03, -2.1716256e-03,  3.8697599e-03,
         2.0796976e-03, -3.7566703e-03, -2.4628628e-03,  4.6755527e-03,
        -2.9809331e-03,  1.1741939e-03,  5.9706648e-04, -2.7762628e-03,
         4.0786848e-03, -2.2604978e-03, -4.8266775e-03,  3.1243439e-03,
         4.1923993e-03, -1.4267375e-03,  4.7060428e-03, -1.8215243e-03,
         7.0136937e-04, -7.0820388e-04,  4.2260974e-03, -3.1328751e-03,
        -2.2044776e-03,  4.2151543e-03, -2.4124370e-03, -2.5965099e-05,
         1.1465317e-03,  2.4335049e-03, -4.4970601e-03,  3.4307365e-03,
        -4.2886188e-04, -8.0856925e-04, -5.8710331e-05,  5.3704763e-04,
         7.2920963e-04,  2.1751348e-03,  3.4062895e-03, -1.7123544e-03,
        -4.0399046e-03, -9.9179328e-05, -3.5678656e-03, -2.5281569e-03,
         3.4440469e-03,  1.3221985e-03, -2.2755160e-03,  4.1193468e-03,
        -3.6175023e-03,  4.6725748e-03, -2.5073518e-03,  9.8373147e-04,
         3.8625207e-03, -1.5374802e-03,  9.8782335e-04, -6.55111

In [8]:
# What does the averaged version look like? Show the first training message:
# a single vector, the element-wise mean of its word vectors (or all zeros if
# it had none)
X_train_vect_avg[0]

array([-7.3699554e-04, -2.0079338e-04,  8.7197905e-04,  1.3103206e-03,
       -1.3868359e-04, -2.4423990e-03,  5.0156820e-04,  2.7916192e-03,
        6.1787455e-04,  1.4526029e-03,  1.6708563e-03, -1.6638190e-03,
        2.0764573e-03, -1.2362287e-03, -2.1229086e-03,  2.2963006e-03,
        1.7091603e-04,  4.4654473e-04,  4.2089513e-03,  1.4570432e-03,
        5.1076582e-04, -1.6721173e-03,  3.8769536e-03, -3.3431579e-03,
        5.2924710e-04,  1.4407213e-03, -3.1035086e-03,  2.0968677e-03,
       -3.9522041e-04,  2.2118201e-04, -1.5743636e-04,  1.4577687e-03,
        2.0309864e-03,  9.6449017e-04,  2.4150044e-03,  1.0119977e-03,
       -1.6011125e-03,  5.3194806e-04,  1.1971267e-03, -7.0862792e-04,
       -2.5710014e-03, -2.6034890e-04, -6.3663931e-05, -1.1735294e-03,
        1.9718301e-03, -1.1727593e-03, -1.4060214e-03,  4.4019446e-03,
       -8.6574582e-04,  4.0936954e-03,  3.2138056e-04, -1.5959484e-03,
        3.5046819e-03, -9.4402651e-04, -7.8743009e-04, -1.7912913e-04,
      

### Fit RandomForestClassifier On Top Of Word Vectors

In [9]:
# Instantiate and fit a basic Random Forest model on top of the averaged vectors
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so that the fitted forest, and therefore the metrics reported
# below, are the same on every Restart & Run All
rf = RandomForestClassifier(random_state=42)
# .values.ravel() flattens the label DataFrame into the 1-D array fit() expects
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [10]:
# Use the trained model to predict a label for every test message from its
# averaged word vector
y_pred = rf_model.predict(X_test_vect_avg)

In [11]:
# Score the holdout predictions: precision and recall from scikit-learn, plus
# accuracy as the share of predictions that equal the true label
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

n_correct = (y_pred==y_test['label']).sum()
accuracy = n_correct/len(y_pred)

summary = 'Precision: {} / Recall: {} / Accuracy: {}'
print(summary.format(*(round(metric, 3) for metric in (precision, recall, accuracy))))

Precision: 0.465 / Recall: 0.224 / Accuracy: 0.864
