In [10]:
import pandas as pd
import re
import string
import time
import nltk
from nltk import corpus
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as score
ps = nltk.PorterStemmer()
nw = nltk.WordNetLemmatizer()
nltk.download('stopwords')
pd.set_option('display.max_colwidth',150)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
rawData = pd.read_csv('/content/drive/MyDrive/Jul_2023/SMSSpamCollection.tsv',header=None,sep='\t')
rawData.columns = ['label','text_body']
rawData.head()

Unnamed: 0,label,text_body
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. Yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [None]:
# feature eng
rawData['body_len'] = rawData['text_body'].apply(lambda x: len(x)-x.count(' '))
rawData['%punct'] = rawData['text_body'].apply(lambda x: round((sum([1 for i in x if i in string.punctuation])) / (len(x)-x.count(' ')),3)*100)

In [None]:
rawData.head()

Unnamed: 0,label,text_body,body_len,%punct
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. Yo...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives around here though",49,4.1
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [None]:
# pre-processing
def clean(text):
  punct = " ".join([text.lower() for i in text if text not in string.punctuation])
  tokens = re.split('\W+',punct)
  text = [ps.stem(w) for w in tokens if w not in corpus.stopwords.words('english')]
  return text

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(rawData[['text_body','body_len','%punct']],rawData['label'], test_size=0.2, random_state=100)

In [None]:
# vectorize the text
# tf-idf Vectorizer
tf_idf = TfidfVectorizer(analyzer =  clean)
tf_idf_fit = tf_idf.fit(X_train['text_body'])

tf_idf_xtrain = tf_idf_fit.transform(X_train['text_body'])
tf_idf_xtest = tf_idf_fit.transform(X_test['text_body'])

x_train_vector = pd.concat([X_train[['body_len','%punct']].reset_index(drop=True),pd.DataFrame(tf_idf_xtrain.toarray())], axis=1)
x_test_vector = pd.concat([X_test[['body_len','%punct']].reset_index(drop=True),pd.DataFrame(tf_idf_xtest.toarray())], axis=1)

In [8]:
x_train_vector.head()

Unnamed: 0,body_len,%punct,0,1,2,3,4,5,6,7,...,6447,6448,6449,6450,6451,6452,6453,6454,6455,6456
0,21,4.8,0.008703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,35,2.9,0.002896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16,12.5,0.005502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26,11.5,0.005778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60,13.3,0.00141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
x_train_vector.columns = x_train_vector.columns.astype(str)
x_test_vector.columns = x_test_vector.columns.astype(str)

In [19]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None,n_jobs=-1)
start = time.time()
model = rf.fit(x_train_vector,y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
ypred = rf.predict(x_test_vector)
end = time.time()
predict_time = (end - start)

precision, recall, fscore, train_support = score(y_test, ypred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(predict_time, 3), round(precision, 3), round(recall, 3), round((ypred==y_test).sum()/len(ypred), 3)))

Fit time: 7.155 / Predict time: 0.166 ---- Precision: 1.0 / Recall: 0.869 / Accuracy: 0.985


In [22]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(x_train_vector, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = gb_model.predict(x_test_vector)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 187.298 / Predict time: 0.113 ---- Precision: 0.917 / Recall: 0.931 / Accuracy: 0.982
