In [1]:
import nltk
import re
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd

# English stopwords kept as a set: clean_text() tests every token for
# membership, and a set makes that lookup O(1) instead of a scan over the list.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Porter stemmer instance reused by clean_text() for every token.
ps = nltk.PorterStemmer()

In [2]:
# Load the tab-separated SMS corpus into a two-column frame.
# header=None keeps the first line as data; without it pandas would consume the
# first message as the header row and it would be lost once the column names
# are assigned.
# NOTE(review): assumes 'SMSSpamCollection' has no header row - confirm against the file.
data = pd.read_csv('SMSSpamCollection', sep='\t', header=None,
                   names=['label', 'body_text'])

In [3]:
def count_punct(text):
    """Return the share of punctuation among the non-space characters of ``text``.

    The ratio is rounded to three decimals and then multiplied by 100, so the
    result is a percentage (e.g. ``"ab!!"`` -> ``50.0``).

    Parameters
    ----------
    text : str
        Message to inspect.

    Returns
    -------
    float
        Punctuation percentage; ``0.0`` when ``text`` has no non-space
        characters (empty or all spaces), which previously raised
        ``ZeroDivisionError``.
    """
    non_space_len = len(text) - text.count(" ")
    if non_space_len == 0:
        # Nothing to measure: avoid dividing by zero.
        return 0.0
    punct_count = sum(1 for char in text if char in string.punctuation)
    return round(punct_count / non_space_len, 3) * 100

# Hand-crafted features derived from the raw message text:
#   body_len - number of characters excluding spaces
#   punct%   - punctuation share as computed by count_punct()
data['body_len'] = data['body_text'].apply(lambda msg: len(msg) - msg.count(" "))
data['punct%'] = data['body_text'].apply(count_punct)

In [4]:
def clean_text(text):
    """Tokenize a message into lower-cased, stemmed, non-stopword tokens.

    Steps: drop every character found in ``string.punctuation``, lower-case
    the rest, split on runs of non-word characters, discard stopwords and
    empty strings, then stem each remaining token with ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    no_punct = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' at leading/trailing separators; skip those so the
    # empty string does not end up as a vocabulary term.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation. random_state pins the shuffle so
# the split - and every metric computed from it - is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    data[['body_text', 'body_len', 'punct%']],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build the TF-IDF vectorizer with clean_text() as its analyzer and fit it on
# the training messages only.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train.loc[:, 'body_text'])

In [9]:
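# The train and test matrices below have the same number of columns because both
# are produced by the vectorizer that was fitted on the training set only;
# words that occur only in the test set are not represented in either matrix.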

# Vectorize both splits with the vectorizer fitted on the training messages.
tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

# Put the hand-crafted features next to the TF-IDF columns. reset_index(drop=True)
# realigns the rows positionally with the freshly built TF-IDF frame.
X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True),
                          pd.DataFrame(tfidf_train.toarray())], axis=1)

X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True),
                         pd.DataFrame(tfidf_test.toarray())], axis=1)

# The concat leaves a mix of str ('body_len', 'punct%') and int (0..N) column
# labels; recent scikit-learn versions reject mixed label types when fitting or
# predicting, so make every label a string.
X_train_vect.columns = X_train_vect.columns.astype(str)
X_test_vect.columns = X_test_vect.columns.astype(str)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7167,7168,7169,7170,7171,7172,7173,7174,7175,7176
0,34,2.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,32,18.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,133,10.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Note that, compared to the previous vectorizer with over 8k words, this one has only about 7k words.
# This is because this vectorizer is built only on the training data, while the previous one was built on the combined train and test data.

In [11]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [17]:
# Random forest baseline. random_state makes the fitted forest (and the metrics
# printed below) repeatable for a given train/test split.
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=2, random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
rf_model = rf.fit(X_train_vect, y_train)
end = time.perf_counter()
fit_time = end - start

start = time.perf_counter()
y_pred = rf_model.predict(X_test_vect)
end = time.perf_counter()
predict_time = end - start

# Scores are computed on the held-out test labels, with 'spam' as the positive class.
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
accuracy = (y_pred == y_test).sum() / len(y_pred)
print('Fit Time: {} / Predict Time: {} ---- Precision: {} / Recall: {} / Accuracy: {} '.format(
    round(fit_time, 3), round(predict_time, 3), round(precision, 3), round(recall, 3), round(accuracy, 3)))

Fit Time: 7.616 / Predict Time: 0.162 ---- Precision: 1.0 / Recall: 0.778 / Accuracy: 0.968 


In [18]:
# Gradient boosting model evaluated on the same features. random_state makes
# the fitted ensemble (and the metrics printed below) repeatable for a given
# train/test split.
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11, random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
gb_model = gb.fit(X_train_vect, y_train)
end = time.perf_counter()
fit_time = end - start

start = time.perf_counter()
y_pred = gb_model.predict(X_test_vect)
end = time.perf_counter()
predict_time = end - start

# Scores are computed on the held-out test labels, with 'spam' as the positive class.
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
accuracy = (y_pred == y_test).sum() / len(y_pred)
print('Fit Time: {} / Predict Time: {} -------- Precision: {} / Recall: {} / Accuracy: {} '.format(
    round(fit_time, 3), round(predict_time, 3), round(precision, 3), round(recall, 3), round(accuracy, 3)))

Fit Time: 281.135 / Predict Time: 0.164 -------- Precision: 0.918 / Recall: 0.827 / Accuracy: 0.964 
