# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [3]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the four pre-split CSV files: features and labels for train and test
X_train, X_test = pd.read_csv('data/X_train.csv'), pd.read_csv('data/X_test.csv')
y_train, y_test = pd.read_csv('data/y_train.csv'), pd.read_csv('data/y_test.csv')

# Preview the first few rows of the training features
X_train.head()

Unnamed: 0,clean_text
0,"['oh', 'send', 'address']"
1,"['take', 'exam', 'march', '3']"
2,"['xmas', 'story', 'peace', 'xmas', 'msg', 'lov..."
3,"['dont', 'talk', 'ever', 'ok', 'word']"
4,"['send', 'new', 'number']"


### Create TF-IDF Vectors

In [4]:
# Fit a TF-IDF vectorizer on the training messages only, then reuse that same
# fitted vectorizer to encode both the training and the test messages
train_text = X_train['clean_text']
test_text = X_test['clean_text']

# .fit() returns the fitted vectorizer itself, so it can be chained here
tfidf_vect = TfidfVectorizer().fit(train_text)

X_train_vect = tfidf_vect.transform(train_text)
X_test_vect = tfidf_vect.transform(test_text)

In [5]:
# What words did the vectorizer learn?
# vocabulary_ is a dict mapping each learned term to its column index in the
# TF-IDF matrix.
# NOTE(review): this displays the entire mapping; slice it if the output is
# too long to be useful.
tfidf_vect.vocabulary_

{'oh': 5162,
 'send': 6303,
 'address': 875,
 'take': 7016,
 'exam': 2794,
 'march': 4540,
 'xmas': 8051,
 'story': 6831,
 'peace': 5397,
 'msg': 4825,
 'love': 4407,
 'miracle': 4703,
 'jesus': 3973,
 'hav': 3460,
 'blessed': 1455,
 'month': 4783,
 'ahead': 934,
 'amp': 1017,
 'wish': 7890,
 'merry': 4644,
 'dont': 2516,
 'talk': 7024,
 'ever': 2770,
 'ok': 5169,
 'word': 7940,
 'new': 4969,
 'number': 5099,
 'special': 6674,
 'called': 1685,
 'mom': 4766,
 'instead': 3850,
 'fun': 3139,
 'cant': 1718,
 'pick': 5462,
 'phone': 5451,
 'right': 6067,
 'pls': 5519,
 'message': 4649,
 'lol': 4366,
 'yeah': 8081,
 'point': 5549,
 'guess': 3376,
 'fyi': 3151,
 'im': 3763,
 'usf': 7570,
 'swing': 6985,
 'room': 6107,
 'whenever': 7833,
 'haf': 3403,
 'found': 3065,
 'feel': 2907,
 'stupid': 6875,
 'da': 2230,
 'cam': 1699,
 'working': 7950,
 'santa': 6192,
 'calling': 1691,
 'would': 7970,
 'little': 4330,
 'ones': 5197,
 'like': 4291,
 'call': 1671,
 'eve': 2762,
 '09077818151': 239,
 'book

In [6]:
# How are these vectors stored?
# Display the first test message's row to see the matrix type, its shape, and
# how many non-zero entries it actually stores.
X_test_vect[0]

<1x8174 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [7]:
# Can we convert the vectors to arrays?
# .toarray() expands the sparse row into a dense NumPy array with one slot per
# vocabulary term (almost all of them zero for a single message).
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [8]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic; fixing random_state makes the fitted model,
# and therefore the metrics reported below, reproducible across
# Restart & Run All. The value 42 itself is arbitrary.
rf = RandomForestClassifier(random_state=42)

# y_train is loaded as a DataFrame; .values.ravel() flattens it into the 1-D
# label array that fit() expects
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [9]:
# Use the trained model to predict a label for every message in the
# vectorized test set
y_pred = rf_model.predict(X_test_vect)

In [10]:
# Evaluate the predictions of the model on the holdout test set.
# Accuracy comes from sklearn's accuracy_score rather than a hand-rolled match
# count, and all three metrics are scored against the same 1-D label column.
from sklearn.metrics import accuracy_score, precision_score, recall_score

# y_test is loaded as a DataFrame; pull out the label column once and reuse it
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.835 / Accuracy: 0.977
