In [36]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier

In [6]:
# Reading the data: each line of the tab-separated file is "<label>\t<message>",
# so the first column is the ham/spam label and the second is the SMS text.
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['label', 'message'])

# Converting the rows to a list of (label, message) tuples.
# Zipping the two columns avoids building a Series per row as iterrows() does.
data_spam = list(zip(df['label'], df['message']))

In [7]:
# Peek at the first three entries; each tuple holds the ham/spam label followed by the message text
data_spam[:3]

[('ham',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'),
 ('ham', 'Ok lar... Joking wif u oni...'),
 ('spam',
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's")]

### Text Preprocessing

In [8]:
# Creating a function for text preprocessing
# Converting to lower case, tokenizing, removing stopwords, lemmatization/stemming

stemmer = PorterStemmer()
lemmentizer = WordNetLemmatizer()

# Built once up front: looking the stopword list up inside the token filter
# would call stopwords.words() again for every single token, and membership
# tests against a frozenset are O(1) instead of a linear list scan.
english_stopwords = frozenset(stopwords.words('english'))


def preprocess(document, stem=True):
    """Normalize one document and return it as a space-joined token string.

    The document is lower-cased, tokenized with ``word_tokenize``, stripped of
    English stopwords, and then each remaining token is either stemmed or
    lemmatized.

    Parameters
    ----------
    document : str
        Raw text to normalize.
    stem : bool, default True
        If truthy, apply the Porter stemmer; otherwise lemmatize each token
        with WordNet, treating it as a verb (``pos='v'``).

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    tokens = [
        token
        for token in word_tokenize(document.lower())
        if token not in english_stopwords
    ]

    if stem:
        tokens = [stemmer.stem(token) for token in tokens]
    else:
        tokens = [lemmentizer.lemmatize(token, pos='v') for token in tokens]

    return ' '.join(tokens)

In [11]:
# Stem every message and keep only the tokens that are at least three characters long;
# each entry of the result is a (token_list, label) pair.

data_processed = [
    (
        [token for token in preprocess(message, stem=True).split() if len(token) >= 3],
        label,
    )
    for label, message in data_spam
]

data_processed[0]

(['jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'great',
  'world',
  'buffet',
  '...',
  'cine',
  'got',
  'amor',
  'wat',
  '...'],
 'ham')

### Feature Extraction

In [14]:
# Functions to collect every word into one list and then build a frequency distribution over those words

# Flatten the token lists of every document into one list
def all_words(data):
    """Return every token from ``data`` as a single flat list.

    ``data`` is an iterable of ``(tokens, label)`` pairs; the labels are
    ignored and the tokens are concatenated in document order.
    """
    return [token for tokens, _label in data for token in tokens]

# Vocabulary = the distinct words seen by the frequency distribution
def get_features(words):
    """Return the distinct words in ``words`` as the keys view of a ``FreqDist``."""
    return FreqDist(words).keys()

In [24]:
# Vocabulary of every distinct token across all processed messages
all_features = get_features(all_words(data_processed))
# Report the vocabulary size (the name `features` is not defined at this point in the notebook)
print(len(all_features))

7626


### Train-Test Split and Training

In [16]:
import random

# Seed the generator so the shuffle, and therefore the train/test split and the
# reported accuracies, are the same on every Restart & Run All.
# Note: the shuffle is in place, so re-running this cell alone reorders the
# already-shuffled list again.
random.seed(42)
random.shuffle(data_processed)

In [26]:
def extract_features(document):
    """Map every vocabulary word to whether it occurs in ``document``.

    Returns a dict with one key per entry of the module-level ``all_features``
    and a boolean value indicating presence in the document's tokens.
    """
    present = set(document)  # set gives constant-time membership checks
    return {word: (word in present) for word in all_features}

In [33]:
# The first 80% of the documents go to training, the remainder to testing
split_at = int(len(data_processed) * 0.8)
train = data_processed[:split_at]
test = data_processed[split_at:]

In [34]:
# Apply extract_features to both splits, train first and test second
train_set, test_set = (
    classify.apply_features(extract_features, subset) for subset in (train, test)
)

In [37]:
# Training the model: fit NLTK's Naive Bayes classifier on the training feature set
clf = NaiveBayesClassifier.train(train_set)

### Model Evaluation

In [38]:
# Score the classifier on each split first, then report both numbers
train_accuracy = classify.accuracy(clf, train_set)
test_accuracy = classify.accuracy(clf, test_set)

print("Accuracy on train set = {}".format(train_accuracy))
print("Accuracy on test set = {}".format(test_accuracy))

Accuracy on train set = 0.9921471842046219
Accuracy on test set = 0.979372197309417


In [39]:
# Getting feature importances
# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() would add a stray "None" line after the table.
clf.show_most_informative_features(50)

Most Informative Features
                   nokia = True             spam : ham    =    198.2 : 1.0
                   award = True             spam : ham    =    194.0 : 1.0
                 voucher = True             spam : ham    =    134.3 : 1.0
                    code = True             spam : ham    =    108.7 : 1.0
                  camera = True             spam : ham    =    100.2 : 1.0
                   await = True             spam : ham    =     91.7 : 1.0
                  latest = True             spam : ham    =     83.1 : 1.0
                  servic = True             spam : ham    =     80.3 : 1.0
                 attempt = True             spam : ham    =     74.6 : 1.0
                  urgent = True             spam : ham    =     71.8 : 1.0
                 landlin = True             spam : ham    =     70.3 : 1.0
                  privat = True             spam : ham    =     70.3 : 1.0
                     txt = True             spam : ham    =     68.8 : 1.0

In [40]:
## storing the classifier on disk for later usage
import pickle

# The context manager closes the file even if pickling raises.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open('nb_spam_classifier.pickle', 'wb') as f:
    pickle.dump(clf, f)
print('Classifier stored at ', f.name)

Classifier stored at  nb_spam_classifier.pickle
