## Compare NLP Techniques: Build Model On doc2vec

### Read In Cleaned Text

In [1]:
# Load the cleaned training and testing set
import ast

import gensim
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

# The CSV round-trip stores every token list as its string representation,
# so 'clean_text' comes back as text rather than as a list.
# Parse it with ast.literal_eval, which only accepts Python literals, rather
# than eval, which would execute any expression that happens to be in the file.
X_train['clean_text'] = [ast.literal_eval(list_str) for list_str in X_train['clean_text']]
X_test['clean_text'] = [ast.literal_eval(list_str) for list_str in X_test['clean_text']]
X_train.head()

Unnamed: 0,clean_text
0,"[living, simple, loving, also, simple, laughin..."
1,"[already, squatting, new, way, walking]"
2,"[sister, got, placed, birla, soft, da]"
3,"[lovely, smell, bus, aint, tobacco, ]"
4,"[yes, nigh, cant, aha]"


### Create doc2vec Vectors

In [2]:
# Wrap each tokenised text message in a TaggedDocument for the training and
# testing sets; the tag is the message's position within its own split
tagged_docs_train = [
    gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[idx])
    for idx, tokens in enumerate(X_train['clean_text'])
]
tagged_docs_test = [
    gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[idx])
    for idx, tokens in enumerate(X_test['clean_text'])
]

In [3]:
# Inspect the first ten TaggedDocument objects to see how they are structured
tagged_docs_train[:10]

[TaggedDocument(words=['living', 'simple', 'loving', 'also', 'simple', 'laughing', 'simple', 'winning', 'tooo', 'simple', 'simple', 'difficult', ''], tags=[0]),
 TaggedDocument(words=['already', 'squatting', 'new', 'way', 'walking'], tags=[1]),
 TaggedDocument(words=['sister', 'got', 'placed', 'birla', 'soft', 'da'], tags=[2]),
 TaggedDocument(words=['lovely', 'smell', 'bus', 'aint', 'tobacco', ''], tags=[3]),
 TaggedDocument(words=['yes', 'nigh', 'cant', 'aha'], tags=[4]),
 TaggedDocument(words=['yup', 'remb', 'think', 'book', ''], tags=[5]),
 TaggedDocument(words=['sorry', 'roommates', 'took', 'forever', 'ok', 'come'], tags=[6]),
 TaggedDocument(words=['b4u', 'voucher', 'wc', '2703', 'marsms', 'log', 'onto', 'wwwb4utelecom', 'discount', 'credit', 'opt', 'reply', 'stop', 'customer', 'care', 'call', '08717168528'], tags=[7]),
 TaggedDocument(words=['kindly', 'send', 'one', 'flat', 'ltdecimalgt', 'today'], tags=[8]),
 TaggedDocument(words=['ceri', 'u', 'rebel', 'sweet', 'dreamz', 'littl

In [4]:
# Train a basic doc2vec model on the tagged training messages.
# A fixed seed is passed so that repeated runs start from the same random
# initialisation instead of producing different vectors every time.
# NOTE(review): per the gensim documentation, fully deterministic training
# additionally requires workers=1 (and a fixed PYTHONHASHSEED); the worker
# count is left at its default here so training speed is unchanged.
D2V_SEED = 42
d2v_model = gensim.models.Doc2Vec(tagged_docs_train,
                                 vector_size=100,
                                 window=5,
                                 min_count=2,
                                 seed=D2V_SEED)

In [5]:
# Run every tagged document's word list through the trained model to obtain
# the vectors used for training and testing the classifier
train_vectors = [
    d2v_model.infer_vector(doc.words)
    for doc in tagged_docs_train
]
test_vectors = [
    d2v_model.infer_vector(doc.words)
    for doc in tagged_docs_test
]

### Fit the RF Classifier On Top Of Document Vectors

In [6]:
# Instantiate and fit a basic Random Forest classifier on top of the
# document vectors
from sklearn.ensemble import RandomForestClassifier

# Fix the forest's random state so the fitted model (and therefore the
# reported metrics) does not change from one run of this cell to the next.
RF_SEED = 42
rf = RandomForestClassifier(random_state=RF_SEED)
# y_train is loaded as a DataFrame; flatten it to the 1-D label array that
# fit() expects.
rf_model = rf.fit(train_vectors, y_train.values.ravel())

In [7]:
# Predict labels for the test-set document vectors with the fitted model
y_pred = rf_model.predict(X=test_vectors)

In [8]:
# Score the holdout-set predictions: precision, recall and accuracy
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Each scorer takes (true labels, predicted labels); evaluate them in the
# order the names are unpacked.
precision, recall, acc = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, accuracy_score)
)
print(f"Precision: {round(precision, 3)} / Recall: {round(recall, 3)} / Accuracy: {round(acc, 3)}")

Precision: 0.559 / Recall: 0.129 / Accuracy: 0.872
