# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['cool', 'text', 'head']"
1,"['aiyo', 'u', 'always', 'c', 'ex', 'one', 'dun..."
2,"['ya', 'nice', 'ready', 'thursday']"
3,"['think', 'still', 'car']"
4,"['every', 'mondaynxt', 'week', 'vl', 'completi..."


### Create TF-IDF Vectors

In [2]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{&#39;cool&#39;: 2129,
 &#39;text&#39;: 7191,
 &#39;head&#39;: 3542,
 &#39;aiyo&#39;: 970,
 &#39;always&#39;: 1021,
 &#39;ex&#39;: 2844,
 &#39;one&#39;: 5256,
 &#39;dunno&#39;: 2643,
 &#39;abt&#39;: 836,
 &#39;mei&#39;: 4686,
 &#39;reply&#39;: 6050,
 &#39;first&#39;: 3031,
 &#39;time&#39;: 7312,
 &#39;fast&#39;: 2936,
 &#39;lucky&#39;: 4513,
 &#39;workin&#39;: 8024,
 &#39;huh&#39;: 3755,
 &#39;got&#39;: 3374,
 &#39;bao&#39;: 1322,
 &#39;ur&#39;: 7622,
 &#39;sugardad&#39;: 6978,
 &#39;ahgee&#39;: 949,
 &#39;ya&#39;: 8152,
 &#39;nice&#39;: 5048,
 &#39;ready&#39;: 5931,
 &#39;thursday&#39;: 7296,
 &#39;think&#39;: 7253,
 &#39;still&#39;: 6866,
 &#39;car&#39;: 1760,
 &#39;every&#39;: 2818,
 &#39;mondaynxt&#39;: 4842,
 &#39;week&#39;: 7863,
 &#39;vl&#39;: 7740,
 &#39;completing&#39;: 2056,
 &#39;ok&#39;: 5231,
 &#39;knackered&#39;: 4189,
 &#39;came&#39;: 1734,
 &#39;home&#39;: 3668,
 &#39;went&#39;: 7886,
 &#39;sleep&#39;: 6576,
 &#39;good&#39;: 3353,
 &#39;full&#39;: 3203,
 &#39;work&#39;:

In [4]:
# How are these vectors stored?
X_test_vect[0]

&lt;1x8270 sparse matrix of type &#39;&lt;class &#39;numpy.float64&#39;&gt;&#39;
	with 5 stored elements in Compressed Sparse Row format&gt;

In [5]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [7]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [8]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.991 / Recall: 0.741 / Accuracy: 0.965
