# Compare NLP Techniques: Build Model On word2vec Vectors

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Read In Cleaned Text

In [2]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/X_train.csv')
X_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/X_test.csv')
y_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/y_train.csv')
y_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/y_test.csv')

### Create word2vec Vectors

In [3]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)



In [6]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [7]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))

X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [8]:
# What does the unaveraged version look like?
X_train_vect[0]

array([[-0.00861969,  0.00366574,  0.00518988, ..., -0.00239151,
        -0.00951009,  0.00450588],
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253],
       [-0.00861969,  0.00366574,  0.00518988, ..., -0.00239151,
        -0.00951009,  0.00450588],
       ...,
       [-0.00861969,  0.00366574,  0.00518988, ..., -0.00239151,
        -0.00951009,  0.00450588],
       [-0.00861969,  0.00366574,  0.00518988, ..., -0.00239151,
        -0.00951009,  0.00450588],
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253]], dtype=float32)

In [9]:
# What does the averaged version look like?
X_train_vect_avg[0]

array([-4.9453876e-03,  2.1069623e-03,  5.1505500e-03,  7.2270902e-03,
       -1.5574884e-04, -6.5990998e-03,  3.5389131e-03,  7.3771477e-03,
       -3.8288585e-03, -5.0779991e-03,  3.1310169e-03, -5.2619129e-03,
       -5.1166494e-03,  6.8543176e-03, -3.8050584e-04,  3.1158114e-03,
        5.0167623e-03,  4.5585292e-03, -5.8328179e-03, -4.6013575e-03,
        4.6044630e-03, -1.6026212e-04,  7.6473504e-03, -5.0304239e-03,
        6.5765725e-03,  4.1788317e-05, -3.1208179e-03,  5.0210906e-03,
       -4.3677855e-03,  1.8716170e-03,  2.0210170e-03, -2.8022612e-03,
        4.0085968e-03, -6.4336057e-03,  1.0396446e-03,  6.3918688e-04,
        7.4303313e-03,  6.3200854e-04,  5.2233450e-03,  2.8974393e-03,
       -5.7213179e-05, -1.5364145e-03, -8.9757377e-03, -2.1903310e-03,
       -1.7067768e-03,  4.1713626e-03, -2.4315965e-04,  3.5272492e-03,
        3.0887334e-03,  5.1732687e-03,  5.5598543e-04, -3.1438298e-03,
       -1.9925972e-03,  2.2670783e-03,  3.3508425e-03,  2.5436135e-03,
      

### Fit RandomForestClassifier On Top Of Word Vectors

In [10]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [11]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [12]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

# Here we got very less scores as compared to tfidf

Precision: 0.5 / Recall: 0.2 / Accuracy: 0.865
