## Building ML Classifiers: Model Selection

#### Read & clean text

In [1]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', 100)
%matplotlib inline

In [7]:
stopword = stopwords.words('english')
ps = PorterStemmer()

data = pd.read_csv('data/SMSSpamCollection.tsv', sep='\t', header=None)
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    
    return round(count/(len(text) - text.count(" ")), 3) * 100

data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))

def clean_text(text):
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopword]
    
    return text

In [8]:
data.head()

Unnamed: 0,label,body_text,punct%,body_len
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,2.5,160
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,4.7,128
2,ham,"Nah I don't think he goes to usf, he lives around here though",4.1,49
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,3.2,62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,7.1,28


#### Split into train/test

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

#### Vectorize text

In [10]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7353,7354,7355,7356,7357,7358,7359,7360,7361,7362
0,21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,81,1.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,116,10.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,73,5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,48,8.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Final Evaluation of models

In [11]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [12]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print(f"Fit Time: {round(fit_time, 3)} / Predict Time: {round(pred_time, 3)} ---- Precision: {round(precision, 3)}, Recall: {round(recall, 3)}, Accuracy: {round(sum(y_pred==y_test)/len(y_pred), 3)}")

Fit Time: 9.881 / Predict Time: 0.388 ---- Precision: 1.0, Recall: 0.833, Accuracy: 0.978


In [13]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print(f"Fit Time: {round(fit_time, 3)} / Predict Time: {round(pred_time, 3)} ---- Precision: {round(precision, 3)}, Recall: {round(recall, 3)}, Accuracy: {round(sum(y_pred==y_test)/len(y_pred), 3)}")

Fit Time: 321.684 / Predict Time: 0.227 ---- Precision: 0.947, Recall: 0.84, Accuracy: 0.972
