### SPAM Ham Detection

In [1]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
## Reading the given dataset
spam = pd.read_csv("../SMSSpamCollection.txt", sep = "\t", names=["label", "message"], encoding="utf-8")

In [3]:
print(spam.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
## Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label
data_set = []
for index,row in spam.iterrows():
    data_set.append((row['message'], row['label']))

In [5]:
print(data_set[:5])

[(u'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', u'ham'), (u'Ok lar... Joking wif u oni...', u'ham'), (u"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", u'spam'), (u'U dun say so early hor... U c already then say...', u'ham'), (u"Nah I don't think he goes to usf, he lives around here though", u'ham')]


In [6]:
print(len(data_set))

5572


### Preprocessing

In [29]:
## initialise the inbuilt Stemmer and the Lemmatizer
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer.lemmatize("happily", pos='v')

'happily'

In [8]:
def preprocess(document, stem=True):
    'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)

    return document

In [9]:
## - Performing the preprocessing steps on all messages
messages_set = []
for (message, label) in data_set:
    words_filtered = [e.lower() for e in preprocess(message, stem=False).split() if len(e) >= 3]
    messages_set.append((words_filtered, label))

In [10]:
print(messages_set[:5])

[([u'jurong', u'point', u'crazy..', u'available', u'bugis', u'great', u'world', u'buffet', u'...', u'cine', u'get', u'amore', u'wat', u'...'], u'ham'), ([u'lar', u'...', u'joke', u'wif', u'oni', u'...'], u'ham'), ([u'free', u'entry', u'wkly', u'comp', u'win', u'cup', u'final', u'tkts', u'21st', u'may', u'2005.', u'text', u'87121', u'receive', u'entry', u'question', u'std', u'txt', u'rate', u'apply', u'08452810075over18'], u'spam'), ([u'dun', u'say', u'early', u'hor', u'...', u'already', u'say', u'...'], u'ham'), ([u'nah', u"n't", u'think', u'usf', u'live', u'around', u'though'], u'ham')]


### Preparing to create features

In [11]:
## - creating a single list of all words in the entire dataset for feature list creation

def get_words_in_messages(messages):
    all_words = []
    for (message, label) in messages:
      all_words.extend(message)
    return all_words

In [12]:
## - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words
## Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.

def get_word_features(wordlist):

    #print(wordlist[:10])
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [13]:
## - creating the word features for the entire dataset
word_features = get_word_features(get_words_in_messages(messages_set))
print(len(word_features))

8395


### Preparing to create a train and test set

In [14]:
## - creating slicing index at 80% threshold
sliceIndex = int((len(messages_set)*.8))

In [15]:
## - shuffle the pack to create a random and unbiased split of the dataset
random.shuffle(messages_set)

In [16]:
train_messages, test_messages = messages_set[:sliceIndex], messages_set[sliceIndex:]

In [17]:
len(train_messages)
len(test_messages)

1115

### Preparing to create feature maps for train and test data

In [18]:
## creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [19]:
## - creating the feature map of train and test data

training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [20]:
print(training_set[:5])

[({u'contains(ffffuuuuuuu)': False, u'contains(waste)': False, u'contains(andre)': False, u'contains(my-tone.com/enjoy)': False, u'contains(dha)': False, u'contains(upd8)': False, u'contains(2channel)': False, u'contains(rent)': False, u'contains(tue)': False, u'contains(oops)': False, u'contains(8007)': False, u'contains(sender)': False, u'contains(everywhere)': False, u'contains(09061749602)': False, u'contains(case)': False, u'contains(542)': False, u'contains(s..antha)': False, u'contains(mon.l8rs.x)': False, u'contains(slap)': False, u'contains(ros)': False, u'contains(alright.okay)': False, u'contains(ready)': False, u'contains(wiv)': False, u'contains(withdraw)': False, u'contains(stone..)': False, u'contains(venugopal)': False, u'contains(kiefer.com)': False, u'contains(rodds1)': False, u'contains(-xx)': False, u'contains(elections)': False, u'contains(inmind)': False, u'contains(gnt)': False, u'contains(proove)': False, u'contains(postpone)': False, u'contains(*adoring)': Fals

In [21]:
print('Training set size : ', len(training_set))
print('Test set size : ', len(testing_set))

('Training set size : ', 4457)
('Test set size : ', 1115)


### Training

In [22]:
## Training the classifier with NaiveBayes algorithm
spamClassifier = nltk.NaiveBayesClassifier.train(training_set)

### Evaluation

In [23]:
## - Analyzing the accuracy of the test set
print(nltk.classify.accuracy(spamClassifier, training_set))

0.993044648867


In [24]:
## Analyzing the accuracy of the test set
print(nltk.classify.accuracy(spamClassifier, testing_set))

0.980269058296


In [25]:
## Testing a example message with our newly trained classifier
m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
print('Classification result : ', spamClassifier.classify(extract_features(m.split())))

('Classification result : ', u'spam')


In [26]:
## Priting the most informative features in the classifier
print(spamClassifier.show_most_informative_features(50))

Most Informative Features
         contains(award) = True             spam : ham    =    202.9 : 1.0
          contains(code) = True             spam : ham    =    119.2 : 1.0
         contains(nokia) = True             spam : ham    =    101.6 : 1.0
         contains(await) = True             spam : ham    =     98.3 : 1.0
        contains(urgent) = True             spam : ham    =     95.9 : 1.0
      contains(delivery) = True             spam : ham    =     81.6 : 1.0
           contains(txt) = True             spam : ham    =     80.2 : 1.0
      contains(landline) = True             spam : ham    =     74.0 : 1.0
       contains(private) = True             spam : ham    =     73.2 : 1.0
     contains(statement) = True             spam : ham    =     64.8 : 1.0
       contains(service) = True             spam : ham    =     62.2 : 1.0
         contains(final) = True             spam : ham    =     60.6 : 1.0
         contains(video) = True             spam : ham    =     59.0 : 1.0

In [27]:
## storing the classifier on disk for later usage
import pickle
f = open('nb_spam_classifier.pickle', 'wb')
pickle.dump(spamClassifier,f)
print('Classifier stored at ', f.name)
f.close()

('Classifier stored at ', 'nb_spam_classifier.pickle')
