In [1]:
import pandas as pd
msg = pd.read_csv("spam-msg.txt", sep = "\t", names=["label", "msg"])

In [2]:
print(msg.head())


  label                                                msg
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
dataset = []
for index,row in msg.iterrows():
    dataset.append((row['msg'], row['label']))

In [4]:
print(len(dataset))


5572


In [5]:
import nltk

In [6]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [7]:
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [8]:

def preprocess(document, stem=True):

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)

    return document

In [9]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [10]:
msg_set = []
for (msg, label) in dataset:
    words_filtered = [e.lower() for e in preprocess(msg, stem=False).split() if len(e) >= 3]
    msg_set.append((words_filtered, label))

In [12]:
print(msg_set[:10])


[(['jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world', 'buffet', '...', 'cine', 'get', 'amore', 'wat', '...'], 'ham'), (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'), (['free', 'entry', 'wkly', 'comp', 'win', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', '87121', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply', '08452810075over18'], 'spam'), (['dun', 'say', 'early', 'hor', '...', 'already', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'usf', 'live', 'around', 'though'], 'ham'), (['freemsg', 'hey', 'darling', 'week', 'word', 'back', 'like', 'fun', 'still', 'xxx', 'std', 'chgs', 'send', '£1.50', 'rcv'], 'spam'), (['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'], 'ham'), (['per', 'request', "'melle", 'melle', 'oru', 'minnaminunginte', 'nurungu', 'vettam', 'set', 'callertune', 'callers', 'press', 'copy', 'friends', 'callertune'], 'ham'), (['winner', 'value', 'network', 'customer', 'select', 'receivea', '£900', 'prize', 

In [13]:
def get_words(msg):
    all_words = []
    for (msg, label) in msg:
      all_words.extend(msg)
    return all_words

In [14]:
def get_word_features(wordlist):

    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [15]:
word_features = get_word_features(get_words(msg_set))
print(len(word_features))

7995


In [18]:
size = int((len(msg_set)*.8))

In [19]:
import random
random.shuffle(msg_set)

In [21]:
train_set, test_set = msg_set[:size], msg_set[size:]

In [22]:
len(train_set)
len(test_set)

1115

In [23]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [24]:
training_set = nltk.classify.apply_features(extract_features, train_set)
testing_set = nltk.classify.apply_features(extract_features, test_set)

In [25]:

print(len(training_set))
print(len(testing_set))

4457
1115


In [26]:
spamClassifier = nltk.NaiveBayesClassifier.train(training_set)

In [27]:
print(nltk.classify.accuracy(spamClassifier, training_set))

0.9916984518734575


In [28]:
print(nltk.classify.accuracy(spamClassifier, testing_set))

0.9856502242152466


In [29]:
my_msg = 'My name is Ahmed, how can I buy your python course?'
print('Classification result : ', spamClassifier.classify(extract_features(my_msg.split())))

Classification result :  ham


In [36]:
my_msg = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
print('Classification result : ', spamClassifier.classify(extract_features(my_msg.split())))

Classification result :  spam


In [37]:
print(spamClassifier.show_most_informative_features(50))


Most Informative Features
       contains(service) = True             spam : ham    =    248.0 : 1.0
         contains(award) = True             spam : ham    =    192.9 : 1.0
         contains(await) = True             spam : ham    =    108.1 : 1.0
        contains(urgent) = True             spam : ham    =     89.9 : 1.0
           contains(txt) = True             spam : ham    =     80.0 : 1.0
      contains(delivery) = True             spam : ham    =     74.2 : 1.0
      contains(landline) = True             spam : ham    =     72.5 : 1.0
        contains(latest) = True             spam : ham    =     72.5 : 1.0
         contains(nokia) = True             spam : ham    =     71.8 : 1.0
       contains(private) = True             spam : ham    =     65.7 : 1.0
          contains(draw) = True             spam : ham    =     60.9 : 1.0
     contains(statement) = True             spam : ham    =     57.2 : 1.0
        contains(camera) = True             spam : ham    =     54.7 : 1.0