## Compare NLP Techniques: Build Model On word2vec

### Read In Cleaned Text

In [15]:
# Load the cleaned training and testing set
import ast

import gensim
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

# The CSV round-trip stores each token list as its string representation
# (e.g. "['yes', 'nigh', 'cant', 'aha']"), so parse every cell back into a
# real Python list. ast.literal_eval only accepts Python literals, so unlike
# eval() it cannot execute arbitrary code embedded in the data file.
X_train['clean_text'] = X_train['clean_text'].apply(ast.literal_eval)
X_test['clean_text'] = X_test['clean_text'].apply(ast.literal_eval)
X_train.head()

Unnamed: 0,clean_text
0,"[living, simple, loving, also, simple, laughin..."
1,"[already, squatting, new, way, walking]"
2,"[sister, got, placed, birla, soft, da]"
3,"[lovely, smell, bus, aint, tobacco, ]"
4,"[yes, nigh, cant, aha]"


In [8]:
# Sanity check: the first clean_text entry should be a Python list, not a string
type(X_train['clean_text'][0])

list

### Create word2vec Vectors

In [12]:
# Train a basic word2vec model on the tokenized training messages.
#   size=100    -> dimensionality of each learned word vector
#   window=5    -> maximum distance between the current and the predicted word
#   min_count=2 -> ignore words that occur fewer than 2 times in the corpus
#   seed=42, workers=1 -> make training repeatable: with several worker
#       threads the OS scheduling order changes the result from run to run.
#       (Across interpreter launches, PYTHONHASHSEED must also be fixed.)
w2v_model = gensim.models.Word2Vec(X_train['clean_text'],
                                   size=100,
                                   window=5,
                                   min_count=2,
                                   seed=42,
                                   workers=1)

In [18]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index2word)


def vectorize_messages(messages, word_vectors, vocab):
    """Look up the word vector of every in-vocabulary token of every message.

    Parameters
    ----------
    messages : iterable of list of str
        Tokenized messages.
    word_vectors : mapping from str to np.ndarray
        Keyed vectors of the trained model (``w2v_model.wv``).
    vocab : set of str
        Words the model learned; tokens outside this set are skipped.

    Returns
    -------
    list of np.ndarray
        One array per message, holding one row per known token. A message
        with no known token yields an empty array (``.size == 0``).
    """
    # Return a plain list instead of wrapping the result in np.array: the
    # per-message arrays have different lengths, and building an ndarray from
    # such a ragged sequence is deprecated and raises ValueError on recent
    # NumPy releases.
    return [np.array([word_vectors[token] for token in tokens if token in vocab])
            for tokens in messages]


X_train_vect = vectorize_messages(X_train['clean_text'], w2v_model.wv, words)
X_test_vect = vectorize_messages(X_test['clean_text'], w2v_model.wv, words)

In [21]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training)
def average_word_vectors(message_vectors, vector_size):
    """Collapse each message's word vectors into one fixed-length vector.

    Parameters
    ----------
    message_vectors : iterable of np.ndarray
        One array of word vectors per message (one row per word).
    vector_size : int
        Length of the zero vector used for messages without any word vector.

    Returns
    -------
    list of np.ndarray
        One 1-D vector per message: the column-wise mean of its word vectors,
        or zeros when the message has no word vectors.
    """
    return [vectors.mean(axis=0) if vectors.size else np.zeros(vector_size, dtype=float)
            for vectors in message_vectors]


# Take the vector length from the trained model rather than repeating the
# literal 100, so the zero vectors cannot drift out of sync with the model.
X_train_vect_avg = average_word_vectors(X_train_vect, w2v_model.wv.vector_size)
X_test_vect_avg = average_word_vectors(X_test_vect, w2v_model.wv.vector_size)

In [24]:
# What does the unaveraged version look like?
# (for the first test message: one word vector per token the model knows)
X_test_vect[0]

array([[-6.63742870e-02,  7.23408256e-03, -1.77241326e-03,
         1.42577514e-01,  1.14879288e-01, -2.79108365e-03,
        -9.97191574e-03, -6.29614815e-02, -6.98792115e-02,
         6.31046966e-02,  4.32320759e-02,  1.81948883e-03,
        -3.78283970e-02,  2.93214601e-02, -1.48964450e-01,
        -1.17131673e-01, -4.71842214e-02,  8.72963518e-02,
         1.14246886e-02, -1.35921448e-01,  2.31395867e-02,
         1.23707682e-01, -1.56036720e-01, -6.57145977e-02,
        -2.93300040e-02, -2.41330042e-02, -6.41774982e-02,
         9.63518992e-02, -9.82702151e-02,  5.31863831e-02,
         3.65774855e-02, -5.78753240e-02, -1.81809552e-02,
        -4.48351912e-03, -7.59957125e-03, -4.81474102e-02,
        -5.33521958e-02, -9.64420214e-02, -4.94535826e-02,
         1.72963008e-01, -6.11732118e-02,  5.66363446e-02,
        -6.47993237e-02,  6.09236173e-02,  8.09565187e-02,
         4.20068242e-02,  8.39865506e-02,  7.01986998e-02,
         7.72622153e-02, -1.98197123e-02, -1.93788391e-0

In [25]:
# What does the averaged version look like?
# (for the first test message: a single vector, the mean of its word vectors)
X_test_vect_avg[0]

array([-0.12562563,  0.01589814,  0.00128595,  0.26026773,  0.21272965,
       -0.00620015, -0.01442243, -0.11251124, -0.12418506,  0.11222134,
        0.07930848,  0.00134508, -0.07587984,  0.05756057, -0.27399483,
       -0.21701278, -0.08554231,  0.17032737,  0.02728606, -0.2500693 ,
        0.0476772 ,  0.2230529 , -0.29037568, -0.11756456, -0.05161769,
       -0.04810193, -0.118333  ,  0.17211159, -0.18719082,  0.09939272,
        0.06505572, -0.10360243, -0.03476333, -0.00911762, -0.01198899,
       -0.08627198, -0.10286409, -0.17436218, -0.09069259,  0.32221004,
       -0.10834464,  0.11489403, -0.12557043,  0.11843646,  0.14526561,
        0.07525893,  0.16118895,  0.13784453,  0.14060079, -0.03106487,
       -0.0371739 , -0.21350102, -0.01346765, -0.00976688, -0.13832961,
        0.0475444 ,  0.02998301, -0.12792428, -0.1360114 , -0.21667664,
        0.18274362,  0.00581198,  0.0022375 ,  0.17809486,  0.07809238,
        0.00769341, -0.02658795,  0.11105613,  0.11769236,  0.02

### Fit the RF Classifier On Top Of Word Vectors

In [26]:
# Instantiate and fit a basic Random Forest classifier on top of the averaged vectors
from sklearn.ensemble import RandomForestClassifier

# Fix random_state so the bootstrap samples and feature splits -- and therefore
# the reported metrics -- are the same on every run.
rf = RandomForestClassifier(random_state=42)
# y_train is a DataFrame; .values.ravel() flattens it to the 1-D array fit() is given.
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [27]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [28]:
# Score the held-out test predictions: precision, recall and accuracy
from sklearn.metrics import accuracy_score, precision_score, recall_score

precision, recall, acc = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, accuracy_score)
)
print(f"Precision: {round(precision, 3)} / Recall: {round(recall, 3)} / Accuracy: {round(acc, 3)}")

Precision: 1.0 / Recall: 0.497 / Accuracy: 0.934
