# Compare NLP Techniques: Build Model On TF-IDF Vectors

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Read In Cleaned Text

In [3]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/X_train.csv')
X_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/X_test.csv')
y_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/y_train.csv')
y_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['let', 'know', 'details', 'fri', 'u', 'find',..."
1,"['private', '2003', 'account', 'statement', 's..."
2,"['enjoy', 'showers', 'possessiveness', 'poured..."
3,"['beautiful', 'truth', 'gravity', 'read', 'car..."
4,"['onum', 'ela', 'pa', 'normal']"


### Create TF-IDF Vectors

In [4]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [5]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'let': 4335,
 'know': 4201,
 'details': 2419,
 'fri': 3155,
 'find': 3008,
 'cos': 2137,
 'im': 3818,
 'tom': 7361,
 'mentionned': 4716,
 'chinese': 1905,
 'thanks': 7202,
 'private': 5752,
 '2003': 390,
 'account': 853,
 'statement': 6834,
 'shows': 6487,
 '800': 706,
 'unredeemed': 7591,
 'points': 5618,
 'call': 1689,
 '08718738002': 149,
 'identifier': 3796,
 'code': 1992,
 '48922': 572,
 'expires': 2867,
 '211104': 401,
 'enjoy': 2747,
 'showers': 6483,
 'possessiveness': 5659,
 'poured': 5676,
 'ur': 7615,
 'loved': 4487,
 'ones': 5272,
 'bcoz': 1345,
 'world': 8011,
 'lies': 4351,
 'golden': 3345,
 'gift': 3293,
 'truly': 7469,
 'beautiful': 1354,
 'truth': 7472,
 'gravity': 3404,
 'read': 5936,
 'carefully': 1755,
 'heart': 3554,
 'feels': 2962,
 'light': 4363,
 'someone': 6663,
 'heavy': 3562,
 'leaves': 4309,
 'goodmorning': 3362,
 'onum': 5282,
 'ela': 2705,
 'pa': 5373,
 'normal': 5123,
 'wake': 7766,
 'gt': 3438,
 'saying': 6282,
 'havent': 3532,
 'explicitly': 2870,
 'to

In [6]:
# How are these vectors stored?
X_test_vect[0]

<1x8240 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [7]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [8]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel()) # converting the labels as vetors - i.e the y_train

In [9]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [10]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

# Precision -- 100% identified spam as spam
# Recall -- out of all the spam, 76% the model identified as spam
# Accuracy -- Spam or not prediction is correct for 96%


Precision: 1.0 / Recall: 0.767 / Accuracy: 0.969
