In [1]:
import pandas as pd
import numpy as np

# load the dataset of SMS message
df = pd.read_table('SMSSPamCollection', header=None, encoding='utf-8')

In [2]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.info()
df.size

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


11144

In [4]:
# Check class distributiion
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [5]:
from sklearn.preprocessing import LabelEncoder

# convert class labels to binary values, 0= ham and spam = 1
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(Y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [6]:
# store the SMS message data
text_message = df[1]
print(text_message[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [7]:
# Use regular expression to replace email address, URLs, phone number, other numbers

# Replace email address with "email"
processed = text_message.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress')

# Replace URLs with "webaddress"

processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

#Replace money symbol with "moneysymb"

processed = processed.str.replace(r'£|\$', 'moneysymb')

#Replace 10 digit phone numbers
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumbr')

# Replace number with 'number'

processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')


In [8]:
# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace btw terms with a single space
processed = processed.str.replace(r'\s+', ' ')

#Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', ' ')

In [9]:
processed = processed.str.lower()

In [10]:
print(processed.head())

0    go until jurong point crazy available only in ...
1                             ok lar joking wif u oni 
2    free entry in numbr a wkly comp to win fa cup ...
3         u dun say so early hor u c already then say 
4    nah i don t think he goes to usf he lives arou...
Name: 1, dtype: object


In [11]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

ps = PorterStemmer()
#remove stop words from text message
#remove word stems using a Porter stemmer

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split() if term not in stop_words))

## Generating Features

In [12]:
from nltk.tokenize import word_tokenize
import nltk
# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)

In [13]:
print(f'Number of words: {len(all_words)}')
print(f'Most common words: {all_words.most_common(15)}')

Number of words: 6579
Most common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [14]:
# use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [15]:
# The find_features fucntion will determine which of the 1500 word features are contained in the review

def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

# Lets see an example
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [16]:
# Now lets do it for all the messages
messages = list(zip(processed, Y))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call split the featuresets into training and testing datasets using sklearn
featuresets = [(find_features(text), label) for (text, label) in messages]


from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [17]:
print(len(training))
print(len(testing))

4179
1393


## Scikit-Learn Classifiers with NLTK

In [19]:
# we can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test in the testing dataset
accuracy = nltk.classify.accuracy(model, testing)*100
print(f"SVC Accuracy: {accuracy}")

SVC Accuracy: 98.27709978463747


In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model  import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [21]:
# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print(f"{name} Accuray: {accuracy}")

K Nearest Neighbors Accuray: 93.96984924623115
Decision Tree Accuray: 97.20028715003589




Random Forest Accuray: 97.91816223977028




Logistic Regression Accuray: 98.49246231155779




SGD Classifier Accuray: 98.06173725771716
Naive Bayes Accuray: 98.42067480258436
SVM Linear Accuray: 98.27709978463747


In [31]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 98.27709978463747


In [32]:
# make class label prediction for testing set
txt_features, labels = list(zip(*testing))

prediction = nltk_ensemble.classify_many(txt_features)

In [33]:
print(classification_report(labels, prediction))

pd.DataFrame(
            confusion_matrix(labels, prediction),
            index = [['actual', 'actual'], ['ham', 'spam']],
            columns = [['predicted', 'predicted'], ['ham', 'spam']]
)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1195
           1       0.99      0.91      0.95       198

   micro avg       0.99      0.99      0.99      1393
   macro avg       0.99      0.95      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1193,2
actual,spam,18,180
