In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy


In [2]:
import pandas as pd
import numpy as np

#load the dataset
data = pd.read_table('SMSSpamCollection', header = None, encoding = 'utf-8')

In [3]:
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
#check class distribution
classes = data[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [5]:
#Now preprocess the data
#convert class label to binary values/numeric values
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])


0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [6]:
# now store the text messages data
text_messages = data[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [7]:
#now we need to get rid of insignificant data in the msgs that wont help in classification
#like emailaddress, weburl, phone numbers, or any numbers or money symbols

#processed = [re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr', x) for x in text_messages]

processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')

In [8]:

#processed = [re.sub(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'weburl', x) for x in processed]
#processed = [re.sub(r'£|\$', 'moneysymb', x) for x in processed]
#processed = [re.sub(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber', x) for x in processed]
#processed = [re.sub(r'\d+(\.\d+)?', 'number', x) for x in processed]

processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'weburl')
processed = processed.str.replace(r'£|\$', 'moneysymb')
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')
processed = processed.str.replace(r'\d+(\.\d+)?', 'number')



In [9]:
#get rid of punctuation
#processed = [re.sub(r'[^\w\d\s]', '', x) for x in processed]

processed= processed.str.replace(r'[^\w\d\s]', ' ')

#replace whitespaces between words with single space
#processed = [re.sub(r'\s+', ' ', x) for x in processed]
processed= processed.str.replace(r'\s+', ' ')

#leadng and training white space\
#processed = [re.sub(r'^\s+|\s+?$', ' ', x) for x in processed]
processed= processed.str.replace(r'^\s+|\s+?$', '')

In [10]:
#changing the words to lower case
processed= processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been number wee...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile number months or more u r enti...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from number to number ...
12      urgent you have won a number week free members...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [11]:
#remove stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

[nltk_data] Downloading package stopwords to /home/nbuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
#stemming
ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [13]:

print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri number wkli comp win fa cup final t...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl number week word back like fu...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea mon...
9       mobil number month u r entitl updat latest col...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash number number number pound ...
12      urgent number week free membership moneysymbnu...
13      search right word thank breather promis wont t...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [14]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

#creating bag of words model
all_words= []
for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
all_words=nltk.FreqDist(all_words)

print(len(all_words))
print(all_words.most_common(15))
        

[nltk_data] Downloading package punkt to /home/nbuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
6574
[('number', 2759), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumb', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [15]:
#use 1500 most common words as features
word_features = list(all_words.keys())[:1500]
print (word_features)

['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat', 'ok', 'lar', 'joke', 'wif', 'u', 'oni', 'free', 'entri', 'number', 'wkli', 'comp', 'win', 'fa', 'cup', 'final', 'tkt', 'numberst', 'may', 'text', 'receiv', 'question', 'std', 'txt', 'rate', 'c', 'appli', 'numberovernumb', 'dun', 'say', 'earli', 'hor', 'alreadi', 'nah', 'think', 'goe', 'usf', 'live', 'around', 'though', 'freemsg', 'hey', 'darl', 'week', 'word', 'back', 'like', 'fun', 'still', 'tb', 'xxx', 'chg', 'send', 'moneysymbnumb', 'rcv', 'even', 'brother', 'speak', 'treat', 'aid', 'patent', 'per', 'request', 'mell', 'oru', 'minnaminungint', 'nurungu', 'vettam', 'set', 'callertun', 'caller', 'press', 'copi', 'friend', 'winner', 'valu', 'network', 'custom', 'select', 'receivea', 'prize', 'reward', 'claim', 'call', 'code', 'klnumber', 'valid', 'hour', 'mobil', 'month', 'r', 'entitl', 'updat', 'latest', 'colour', 'camera', 'co', 'gon', 'na', 'home', 'soon', 'wa

In [16]:
#define a find feature method

print(len(word_features))
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = word in words
    return features

#lets observe in a example
features = find_features(processed[0])

for key,value in features.items():
    if value == True:
        print(key)
        

    

1500
go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [17]:
processed[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [18]:
#now find features of all the messages
messages = zip(processed, Y)

#define a seed for reproducibility
seed = 1
#np.random.seed = seed
#np.random.shuffle(messages)

#call find_features method for all the msgs
featuresets = [(find_features(text), label) for (text, label) in messages]

In [19]:
#now split trainign and testing data
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

print(len(training))
print(len(testing))

4179
1393


In [20]:
#scikit learn classfiers with nltk

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [21]:
#defining names of the models

names = ['K nearest neighbors', 'Decision tree', 'Random Forest', 'Logistic regression', 'SGD Classifier', 'Naive bayes', 'linear svm']

classifier = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(max_iter = 100),
    SGDClassifier(),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifier)
print(models)

<zip object at 0x7f8319a62d88>


In [22]:
#wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)
    print('{}:{}'.format(name, accuracy))

K nearest neighbors:0.9440057430007178
Decision tree:0.9770279971284996




Random Forest:0.9842067480258435




Logistic regression:0.9892318736539842




SGD Classifier:0.9863603732950467
Naive bayes:0.9856424982053122
linear svm:0.9870782483847811


In [24]:
#ensemble method - voting classifier

from sklearn.ensemble import VotingClassifier

names = ['K nearest neighbors', 'Decision tree', 'Random Forest', 'Logistic regression', 'SGD Classifier', 'Naive bayes', 'linear svm']

classifier = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(max_iter = 100),
    SGDClassifier(),
    MultinomialNB(),
    SVC(kernel = 'linear')
]
models = zip(names, classifier)
models = list(models)

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators= models, voting = 'hard', n_jobs = -1 ))#-1 staes use all availble cores
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing)
print(accuracy)


0.9885139985642498


In [25]:
#make class labels and prediction for testing set
txt_features, labels = zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)


In [26]:
#print a confusion matrix and classification  report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1208
           1       0.98      0.93      0.96       185

   micro avg       0.99      0.99      0.99      1393
   macro avg       0.99      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1205,3
actual,spam,13,172
