In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import PorterStemmer
from collections import Counter
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, classification_report
from itertools import chain

[nltk_data] Downloading package stopwords to /home/fiona/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fiona/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the raw SMS collection. Each line is "<label>;<text>"; only the FIRST
# ';' separates the two fields, so semicolons inside a message are preserved.
# The header line is read as an ordinary row here (label == 'label').
# NOTE(review): the file is opened with the platform default encoding --
# confirm the file's encoding and pass `encoding=` explicitly if needed.
data = []
with open('SmsCollection.csv', 'r') as file:
    for line_number, line in enumerate(file, start=1):
        # Drop the line terminator so it does not end up inside the text.
        line = line.rstrip('\n')
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line at EOF).
            continue
        label, separator, text = line.partition(';')
        if not separator:
            # Fail with a precise location instead of an opaque unpack error.
            raise ValueError(
                'SmsCollection.csv line {}: expected "label;text", got {!r}'
                .format(line_number, line)
            )
        data.append({'label': label, 'text': text})

df = pd.DataFrame(data)
df.label.value_counts()

ham      4827
spam      747
label       1
Name: label, dtype: int64

In [3]:
# Remove the header row that was read in as a data row.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
df = df[df.label != 'label'].copy()

# This cell expects the string labels produced by the loading cell. If it is
# re-run after the labels were already encoded, fail loudly instead of
# silently mapping every label to NaN.
unexpected_labels = set(df.label.unique()) - {'ham', 'spam'}
if unexpected_labels:
    raise ValueError(
        'unexpected label values {!r}; re-run the loading cell before this one'
        .format(sorted(unexpected_labels, key=str))
    )

# Per-class views of the data (labels are still the original strings here).
df_spam = df[df.label == 'spam'].copy()
df_ham = df[df.label == 'ham'].copy()

# Numeric target for the classifier: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [4]:
# Built once instead of on every call: the stop-word list is loaded a single
# time and stored as a frozenset for O(1) membership tests, and one stemmer
# instance is shared by all calls.
_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess(text):
    """Turn a raw message into a list of stemmed tokens.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, drop tokens of length 1, drop English stop
    words, and stem the remaining tokens with the Porter stemmer.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order (duplicates kept).
    """
    tokens = nltk.word_tokenize(text.lower())
    return [
        _STEMMER.stem(token)
        for token in tokens
        if len(token) > 1 and token not in _STOP_WORDS
    ]
    
# Tokenize every message exactly once and derive all collections from that
# single pass (previously each message was preprocessed twice: once for the
# per-class word lists and once for the per-message counters).
tokens_by_message = df.text.map(preprocess)

# Flat token lists per class, in the row order of df_spam / df_ham.
spam_words = list(chain.from_iterable(tokens_by_message.loc[df_spam.index]))
ham_words = list(chain.from_iterable(tokens_by_message.loc[df_ham.index]))

# One token -> count mapping per message, in the row order of df; this is the
# input for the DictVectorizer.
all_word_count_dicts = [Counter(tokens) for tokens in tokens_by_message]

all_words = spam_words + ham_words

In [5]:
# Corpus-wide token frequencies and the vocabulary (as a live view of the
# counter's keys). The former mid-cell sorted(...) call was removed: its
# result was neither stored nor displayed, so it only cost time.
word_count_dict = Counter(all_words)
all_tokens = word_count_dict.keys()

In [6]:
# the ten most frequent stemmed tokens across all spam messages
Counter(spam_words).most_common(10)

[('call', 365),
 ('free', 217),
 ('txt', 169),
 ('ur', 144),
 ('text', 139),
 ('mobil', 135),
 ('stop', 118),
 ('claim', 115),
 ('repli', 112),
 ('prize', 94)]

In [7]:
# the ten most frequent stemmed tokens across all ham messages
Counter(ham_words).most_common(10)

[('...', 1251),
 ("'s", 422),
 ('go', 417),
 ("'m", 387),
 ('get', 359),
 ("n't", 349),
 ('gt', 318),
 ('lt', 316),
 ('come', 295),
 ('call', 290)]

In [8]:
# Turn the per-message token counts into a sparse document-term matrix.
# NOTE(review): the vectorizer is fitted on ALL messages before the split, so
# its vocabulary also contains tokens that only occur in the test set.
dv = DictVectorizer(sparse=True)
all_features = dv.fit_transform(all_word_count_dicts)

# 80 % / 20 % train/test split. The positional row indices are split alongside
# the features and labels so each sample can be traced back to its row.
indices = np.arange(len(df))
(features_train, features_test,
 target_train, target_test,
 i_train, i_test) = model_selection.train_test_split(
    all_features,
    df.label,
    indices,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)


In [17]:
# For each split: number of messages, number of spam messages, spam share.
for split_name, split_labels in (('training set: ', target_train),
                                 ('test set: ', target_test)):
    n_messages = len(split_labels)
    n_spam = sum(split_labels)
    print(split_name, n_messages, n_spam, n_spam / n_messages)

training set:  4459 582 0.130522538685804
test set:  1115 165 0.14798206278026907


In [10]:
# parameter tuning using grid search and cross validation
tuner = model_selection.GridSearchCV(MultinomialNB(), {'alpha' : np.linspace(0.1,1,10)}, scoring='precision', return_train_score=True, cv=10)
tuner.fit(features_train, target_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='precision', verbose=0)

In [11]:
# Show the raw cross-validation results of the grid search: one entry per
# candidate alpha, including fit/score timings and mean train/test scores.
tuner.cv_results_

{'mean_fit_time': array([0.01436942, 0.0126523 , 0.00661471, 0.00567594, 0.00530493,
        0.00524907, 0.00528431, 0.00525479, 0.00540802, 0.00515437]),
 'mean_score_time': array([0.0034375 , 0.00350308, 0.00213473, 0.00151591, 0.00142219,
        0.00141232, 0.0014492 , 0.00140638, 0.00141349, 0.00263715]),
 'mean_test_score': array([0.88556177, 0.88142192, 0.87941549, 0.88159663, 0.88428555,
        0.89389308, 0.89947361, 0.89781943, 0.90028726, 0.90312546]),
 'mean_train_score': array([0.97586923, 0.97542858, 0.97500626, 0.97662763, 0.97829863,
        0.97979108, 0.97996624, 0.97975262, 0.97972841, 0.98100418]),
 'param_alpha': masked_array(data=[0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6,
                    0.7000000000000001, 0.8, 0.9, 1.0],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0.1},
  {'alpha': 0.2},
  {'alpha': 0.3000000000000

In [12]:
# Train the final model with the hyper-parameters selected by the grid search
# instead of relying on the library defaults happening to match the best
# candidate.
clf = MultinomialNB(**tuner.best_params_)
clf.fit(features_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
# Evaluate on the held-out test split: per-class precision, recall and F1.
prediction = clf.predict(features_test)
print(classification_report(y_true=target_test, y_pred=prediction))

             precision    recall  f1-score   support

          0       0.99      0.98      0.98       950
          1       0.87      0.95      0.91       165

avg / total       0.97      0.97      0.97      1115

