# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the four splits in a fixed order and bind them in a single unpacking
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/X_train.csv',
        '../data/X_test.csv',
        '../data/y_train.csv',
        '../data/y_test.csv',
    )
)

# Preview the first few rows of the training messages
X_train.head()

Unnamed: 0,clean_text
0,"['apps', 'class', 'varaya', 'elaya']"
1,"['check', 'maili', 'mailed', 'varma', 'kept', ..."
2,"['meh', 'thgt', 'clash', 'really', 'ah', 'dun'..."
3,"['call', '09094100151', 'use', 'ur', 'mins', '..."
4,"['mostly', 'sports', 'typelyk', 'footblcrckt']"


### Create TF-IDF Vectors

In [2]:
# Learn the vocabulary and IDF weights from the training messages only, and
# vectorize them in the same pass. fit_transform() is equivalent to calling
# fit() followed by transform(), but it processes the training text once
# instead of twice.
tfidf_vect = TfidfVectorizer()
X_train_vect = tfidf_vect.fit_transform(X_train['clean_text'])

# Reuse the already-fitted vectorizer for the test messages so that both
# matrices share the same columns; the test set is never used for fitting.
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
# `vocabulary_` maps each token to the column index the vectorizer assigned to it
tfidf_vect.vocabulary_

{'apps': 1125,
 'class': 1969,
 'varaya': 7717,
 'elaya': 2738,
 'check': 1896,
 'maili': 4624,
 'mailed': 4623,
 'varma': 7720,
 'kept': 4173,
 'copy': 2160,
 'regarding': 6056,
 'membershiptake': 4753,
 'careinsha': 1790,
 'allah': 997,
 'meh': 4740,
 'thgt': 7298,
 'clash': 1968,
 'really': 5999,
 'ah': 948,
 'dun': 2663,
 'mind': 4812,
 'seen': 6398,
 'lost': 4506,
 'weight': 7917,
 'gee': 3289,
 'call': 1723,
 '09094100151': 241,
 'use': 7679,
 'ur': 7662,
 'mins': 4821,
 'calls': 1747,
 'cast': 1816,
 '10pmin': 273,
 'mob': 4867,
 'vary': 7721,
 'service': 6439,
 'provided': 5858,
 'aom': 1095,
 'gbp5month': 3283,
 'box61m60': 1570,
 '1er': 364,
 'stop': 6930,
 'ages': 941,
 '18': 352,
 'mostly': 4921,
 'sports': 6830,
 'typelyk': 7566,
 'footblcrckt': 3118,
 'huh': 3781,
 'cant': 1770,
 'go': 3352,
 'house': 3745,
 'empty': 2761,
 'handed': 3520,
 'right': 6179,
 'aight': 962,
 'fuck': 3215,
 'ill': 3849,
 'get': 3308,
 'later': 4315,
 'tell': 7206,
 'shola': 6520,
 'please': 56

In [4]:
# How are these vectors stored?
# Look at the vector for the first test message; its repr reports the shape,
# dtype, number of stored (non-zero) elements and storage format
X_test_vect[0]

<1x8303 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

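Each message is represented by an 8,303-element vector (one element per term in the learned vocabulary), but only 11 of those elements are non-zero for this message. The vector is extremely sparse, which is why it is stored in a compressed sparse format rather than as a dense array.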

In [5]:
# Can a sparse vector be turned into a regular (dense) array?
# Pull out the first test message's vector, then expand it with toarray()
first_test_vector = X_test_vect[0]
first_test_vector.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# A random forest is built from random bootstrap samples and random feature
# subsets, so an unseeded model gives slightly different results on every run.
# Pinning random_state makes the fitted model, and every metric computed from
# it, reproducible after a kernel restart.
# NOTE: scores recorded from earlier, unseeded runs may differ slightly.
rf = RandomForestClassifier(random_state=42)

# .values.ravel() flattens the label frame into the 1-D array passed to fit()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [7]:
# Use the trained model to make predictions on the test data
# (X_test_vect was produced by the vectorizer that was fitted on the training set)
y_pred = rf_model.predict(X_test_vect)

In [8]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score against the label column explicitly so that all three metrics receive
# the same 1-D series of true labels.
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# accuracy_score computes the fraction of predictions that match the true
# labels, replacing the hand-written (matches / total) expression.
accuracy = accuracy_score(y_true, y_pred)

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.831 / Accuracy: 0.978
