# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

### Create word2vec Vectors

In [2]:
w2v_model = gensim.models.Word2Vec(X_train,
                      size=100,
                      window=5,
                      min_count=2)

In [5]:
words = set(w2v_model.wv.index2word)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

In [6]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [7]:
# What does the unaveraged version look like?
X_train_vect[0]

array([[-4.8747859e-03,  1.0450855e-03,  8.4496959e-04, -2.4805898e-03,
        -1.8162148e-04,  2.9147598e-03, -1.2816397e-03, -4.5036725e-03,
        -4.5552329e-04, -1.9987891e-03,  1.7698774e-03, -1.5445262e-04,
         2.9852448e-03, -4.4254721e-03, -2.1017578e-03,  2.3671833e-03,
         4.0191067e-03,  2.9395488e-03,  4.9375365e-03, -1.4061290e-03,
        -4.2079156e-03, -3.5333538e-03, -3.9687064e-03,  2.1011347e-03,
         1.8148815e-03, -4.3046875e-03,  2.6295290e-04,  2.9431267e-03,
         3.5064170e-04,  2.2238095e-03, -1.7183842e-05, -4.3107194e-04,
         4.7508013e-03, -7.6524587e-04, -4.4820840e-03, -1.9745997e-03,
        -4.3461127e-03,  1.3274476e-03,  4.2872648e-03,  3.8227107e-04,
        -2.2520251e-03, -2.9951483e-03,  2.4667087e-03,  3.0573134e-04,
        -1.6365118e-03,  4.3953699e-03, -1.7620773e-03,  2.0265377e-03,
         1.4447393e-03,  1.3479189e-03,  3.6470105e-03, -6.9066574e-04,
        -3.3133372e-03, -3.4849332e-03, -2.1703471e-03,  1.13722

In [8]:
# What does the averaged version look like?
X_train_vect_avg[0]

array([-4.8747859e-03,  1.0450855e-03,  8.4496959e-04, -2.4805898e-03,
       -1.8162148e-04,  2.9147598e-03, -1.2816397e-03, -4.5036725e-03,
       -4.5552329e-04, -1.9987891e-03,  1.7698774e-03, -1.5445262e-04,
        2.9852448e-03, -4.4254721e-03, -2.1017578e-03,  2.3671833e-03,
        4.0191067e-03,  2.9395488e-03,  4.9375365e-03, -1.4061290e-03,
       -4.2079156e-03, -3.5333538e-03, -3.9687064e-03,  2.1011347e-03,
        1.8148815e-03, -4.3046875e-03,  2.6295290e-04,  2.9431267e-03,
        3.5064170e-04,  2.2238095e-03, -1.7183842e-05, -4.3107194e-04,
        4.7508013e-03, -7.6524587e-04, -4.4820840e-03, -1.9745997e-03,
       -4.3461127e-03,  1.3274476e-03,  4.2872648e-03,  3.8227107e-04,
       -2.2520251e-03, -2.9951483e-03,  2.4667087e-03,  3.0573134e-04,
       -1.6365118e-03,  4.3953699e-03, -1.7620773e-03,  2.0265377e-03,
        1.4447393e-03,  1.3479189e-03,  3.6470105e-03, -6.9066574e-04,
       -3.3133372e-03, -3.4849332e-03, -2.1703471e-03,  1.1372266e-03,
      

### Fit RandomForestClassifier On Top Of Word Vectors

In [9]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [10]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [None]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))