# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['living', 'simple', 'loving', 'also', 'simple..."
1,"['already', 'squatting', 'new', 'way', 'walking']"
2,"['sister', 'got', 'placed', 'birla', 'soft', '..."
3,"['lovely', 'smell', 'bus', 'aint', 'tobacco', '']"
4,"['yes', 'nigh', 'cant', 'aha']"


### Create TF-IDF Vectors

In [2]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'living': 4429,
 'simple': 6569,
 'loving': 4511,
 'also': 1022,
 'laughing': 4305,
 'winning': 7967,
 'tooo': 7404,
 'difficult': 2471,
 'already': 1016,
 'squatting': 6832,
 'new': 5049,
 'way': 7848,
 'walking': 7802,
 'sister': 6588,
 'got': 3379,
 'placed': 5576,
 'birla': 1457,
 'soft': 6685,
 'da': 2274,
 'lovely': 4503,
 'smell': 6650,
 'bus': 1677,
 'aint': 971,
 'tobacco': 7358,
 'yes': 8174,
 'nigh': 5069,
 'cant': 1763,
 'aha': 956,
 'yup': 8235,
 'remb': 6058,
 'think': 7271,
 'book': 1527,
 'sorry': 6733,
 'roommates': 6206,
 'took': 7401,
 'forever': 3107,
 'ok': 5246,
 'come': 2039,
 'b4u': 1279,
 'voucher': 7760,
 'wc': 7850,
 '2703': 427,
 'marsms': 4643,
 'log': 4447,
 'onto': 5286,
 'wwwb4utelecom': 8085,
 'discount': 2508,
 'credit': 2208,
 'opt': 5312,
 'reply': 6090,
 'stop': 6916,
 'customer': 2261,
 'care': 1775,
 'call': 1717,
 '08717168528': 125,
 'kindly': 4190,
 'send': 6403,
 'one': 5273,
 'flat': 3057,
 'ltdecimalgt': 4525,
 'today': 7362,
 'ceri': 1839,

In [4]:
# How are these vectors stored?
X_test_vect[0]

<1x8264 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [5]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [7]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [8]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.796 / Accuracy: 0.973
