## Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [2]:
# Load the cleaned training and testing set
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the feature frames first, then the label frames, one CSV per split
X_train, X_test = pd.read_csv('data/X_train.csv'), pd.read_csv('data/X_test.csv')
y_train, y_test = pd.read_csv('data/y_train.csv'), pd.read_csv('data/y_test.csv')

# Preview the training features.
# NOTE(review): the preview shows an 'Unnamed: 0' column next to 'clean_text';
# it looks like an index that was written to the CSV — confirm against the
# notebook that produced these files.
X_train.head()

Unnamed: 0,clean_text
0,"['guaranteed', '32000', 'award', 'maybe', 'eve..."
1,"['today', 'sorry', 'day', 'ever', 'angry', 'ev..."
2,"['wishing', 'wonderful', 'week']"
3,"['part', 'checking', 'iq']"
4,"['saw', 'messageit', 'k', 'da']"


In [3]:
### Create TF-IDF Vectors

# Learn the vocabulary and IDF weights from the training text and vectorize it
# in the same pass (fit_transform is documented as equivalent to fit + transform).
tfidf_vect = TfidfVectorizer()
X_train_vect = tfidf_vect.fit_transform(X_train['clean_text'])

# The test text is only transformed, so it is encoded with the vocabulary
# learned from the training set.
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [14]:
# What words did the vectorizer learn?
# vocabulary_ is a dict mapping each learned token to an integer feature index
# (see the displayed output, e.g. 'guaranteed': 3422).
tfidf_vect.vocabulary_

{'guaranteed': 3422,
 '32000': 468,
 'award': 1245,
 'maybe': 4651,
 'even': 2799,
 '1000': 244,
 'cash': 1774,
 'claim': 1938,
 'ur': 7578,
 'call': 1689,
 'free': 3118,
 '0800': 39,
 '18': 327,
 'legitimat': 4305,
 'efreefone': 2684,
 'number': 5140,
 'wat': 7783,
 'think': 7199,
 'today': 7292,
 'sorry': 6655,
 'day': 2315,
 'ever': 2804,
 'angry': 1045,
 'misbehaved': 4767,
 'hurt': 3750,
 'plz': 5562,
 'slap': 6541,
 'urself': 7588,
 'bcoz': 1342,
 'fault': 2928,
 'im': 3801,
 'basically': 1316,
 'good': 3332,
 'wishing': 7934,
 'wonderful': 7970,
 'week': 7826,
 'part': 5384,
 'checking': 1860,
 'iq': 3930,
 'saw': 6240,
 'messageit': 4716,
 'da': 2260,
 'gonna': 3330,
 'go': 3303,
 'get': 3259,
 'tacos': 7029,
 'thanks': 7154,
 'temales': 7113,
 'thank': 7153,
 'great': 3388,
 'oh': 5200,
 'thats': 7168,
 'late': 4263,
 'well': 7844,
 'night': 5033,
 'give': 3287,
 'tomorrow': 7317,
 'iam': 3764,
 'going': 3319,
 'sleep': 6543,
 'ready': 5911,
 'moan': 4800,
 'scream': 6271,
 'a

In [16]:
# How are these vectors stored?
# Selecting the first test row shows the container type: the repr below reports
# a 1 x 8232 sparse matrix in Compressed Sparse Row format.
X_test_vect[0]

<1x8232 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [17]:
# Can we convert the vectors to array?
# toarray() expands the sparse row into a dense NumPy array (mostly zeros, as
# the output shows).
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RF Classifier On Top Of Vectors

In [22]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# Pin the seed: a random forest is stochastic, so without random_state the
# fitted model — and the precision/recall/accuracy reported from it — changes
# on every Restart & Run All.
rf = RandomForestClassifier(random_state=42)

# .values.ravel() flattens the label frame into a 1-D array before fitting.
# NOTE(review): this presumes y_train holds a single label column — confirm.
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [23]:
# Use the trained model to make predictions on the test data
# X_test_vect was produced by the same fitted vectorizer as the training matrix.
y_pred = rf_model.predict(X_test_vect)
# Show only the first five predictions rather than the whole array
y_pred[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [25]:
# Evaluate the prediction of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Score the held-out predictions with each metric in turn; every scorer takes
# the true labels first and the predicted labels second.
precision, recall, acc = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, accuracy_score)
)

# One-line summary, rounded to three decimals
print(f"Precision: {round(precision, 3)} / Recall: {round(recall, 3)} / Accuracy: {round(acc, 3)}")

Precision: 1.0 / Recall: 0.774 / Accuracy: 0.969
