In [53]:
import sys
import nltk
import sklearn
import pandas
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
NLTK: 3.4.5
Scikit-learn: 0.22.1
Pandas: 1.0.1
Numpy: 1.18.1


<h3>1. Load the Dataset</h3>

In [54]:
import pandas as pd
import numpy as np

In [55]:
df = pd.read_table('SMSSpamCollection', header = None, encoding = 'utf-8')

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [57]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [58]:
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


<h3>2. Preprocess the Data</h3>

In [59]:
# Convert class labels to binary values, 0 = ham, 1 = spam

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [60]:
#store the SMS message data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [61]:
# use regular expressions to replace email adresses, urls, phone numbors and symbols

# replace email addresses with 'emailaddr'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')

# replace urls with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\,]+\.[a-zA-Z]{2,3}(/\$*)?$', 'webaddress')

#  replace money symbols with 'moneysymb'
processed = processed.str.replace(r'£|\$', 'moneysymb')

# replace 10 digit phone numbers with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}[-\.\s]\d{3}[-\.\s]\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]\d{4}|\d{3}[-\.\s]\d{4}$', 'phonenumber')

# replace normal numbers with 'number'
processed = processed.str.replace(r'\d+(\.\d+)?', 'number')

In [62]:
# remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')


In [63]:
# change words to lowercase
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [64]:
# remove stop words from text messages

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [26]:
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [65]:
# remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [66]:
print(processed)

0       go until jurong point crazi avail onli in bugi...
1                                   ok lar joke wif u oni
2       free entri in number a wkli comp to win fa cup...
3             u dun say so earli hor u c alreadi then say
4       nah i don t think he goe to usf he live around...
                              ...                        
5567    thi is the numbernd time we have tri number co...
5568                      will ü b go to esplanad fr home
5569        piti wa in mood for that so ani other suggest
5570    the guy did some bitch but i act like i d be i...
5571                              rofl it true to it name
Name: 1, Length: 5572, dtype: object


In [67]:
from nltk.tokenize import word_tokenize

In [68]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nimasha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [69]:
# creating a bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [70]:
# print the local number words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 6680
Most common words: [('i', 3020), ('number', 2757), ('to', 2251), ('you', 2245), ('a', 1448), ('the', 1339), ('u', 1207), ('and', 979), ('it', 978), ('in', 903), ('is', 896), ('me', 807), ('my', 767), ('for', 709), ('your', 705)]


In [88]:
# use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [89]:
word_features

['go',
 'until',
 'jurong',
 'point',
 'crazi',
 'avail',
 'onli',
 'in',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amor',
 'wat',
 'ok',
 'lar',
 'joke',
 'wif',
 'u',
 'oni',
 'free',
 'entri',
 'number',
 'a',
 'wkli',
 'comp',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tkt',
 'numberst',
 'may',
 'text',
 'receiv',
 'question',
 'std',
 'txt',
 'rate',
 't',
 'c',
 's',
 'appli',
 'numberovernumb',
 'dun',
 'say',
 'so',
 'earli',
 'hor',
 'alreadi',
 'then',
 'nah',
 'i',
 'don',
 'think',
 'he',
 'goe',
 'usf',
 'live',
 'around',
 'here',
 'though',
 'freemsg',
 'hey',
 'darl',
 'it',
 'been',
 'week',
 'now',
 'and',
 'no',
 'word',
 'back',
 'd',
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'still',
 'tb',
 'xxx',
 'chg',
 'send',
 'moneysymbnumb',
 'rcv',
 'even',
 'my',
 'brother',
 'is',
 'not',
 'speak',
 'with',
 'me',
 'they',
 'treat',
 'aid',
 'patent',
 'as',
 'per',
 'your',
 'request',
 'mell',
 'oru',
 'minnaminungi

In [102]:
# define a find_features function

def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features

#An example
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
until
jurong
point
crazi
avail
onli
in
bugi
n
great
world
la
e
buffet
cine
there
got
amor
wat


In [87]:
processed[0]

'go until jurong point crazi avail onli in bugi n great world la e buffet cine there got amor wat'

In [108]:
# find features for all messages
messages = list(zip(processed, Y))

# define a seed for responsibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS mesages
featuresets = [(find_features(text), label) for (text, label) in messages]

In [117]:
# split training and testing data sets using sklearn
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [118]:
print('Training: {}'.format(len(training)))
print('Testing: {}'.format(len(testing)))

Training: 4179
Testing: 1393


<h3>3. Scikit-Learn Classifier with NLTK</h3>

In [119]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [124]:
# Define models to train 
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifiers = {
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel='linear')
}

models = list(zip(names, classifiers))

In [126]:
# wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print('{}: Accuracy: {}'.format(name, accuracy))

K Nearest Neighbors: Accuracy: 98.42067480258436
Decision Tree: Accuracy: 97.20028715003589
Random Forest: Accuracy: 98.56424982053123
Logistic Regression: Accuracy: 98.20531227566404
SGD Classifier: Accuracy: 93.46733668341709
Naive Bayes: Accuracy: 97.84637473079684
SVM Linear: Accuracy: 98.1335247666906


In [130]:
# ensemble method- Voting classifier
from sklearn.ensemble import VotingClassifier

# Define models to train 
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifiers = {
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel='linear')
}

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print('Ensemble Method Accuracy: {}'.format(accuracy))

Ensemble Method Accuracy: 98.56424982053123


In [131]:
# make class label prediction for testing set
text_features, labels = list(zip(*testing))

prediction = nltk_ensemble.classify_many(text_features)

In [136]:
# print a cofusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1203
           1       0.99      0.91      0.95       190

    accuracy                           0.99      1393
   macro avg       0.99      0.95      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1201,2
actual,spam,18,172
