In [11]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from collections import Counter
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, classification_report
from itertools import chain

[nltk_data] Downloading package stopwords to /home/fiona/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fiona/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Parse the SMS corpus by hand: every line has the form "<label>;<message>".
# Only the first ';' is treated as the separator, so semicolons inside the
# message body are preserved.
data = []
# Explicit encoding: the messages contain non-ASCII characters (e.g. '£'), so
# relying on the platform default would make the notebook non-portable.
# NOTE(review): UTF-8 is assumed here - confirm against the actual file.
with open('SmsCollection.csv', 'r', encoding='utf-8') as file:
    for line in file:
        # Drop the line terminator so it does not end up inside the message.
        line = line.rstrip('\r\n')
        if not line.strip():
            # A blank (e.g. trailing) line has no ';' and would otherwise
            # break the two-value unpacking below.
            continue
        label, text = line.split(';', 1)
        data.append({'label': label, 'text': text})

# The header row is still part of the frame at this point; it shows up as the
# single 'label' entry in the counts displayed below.
df = pd.DataFrame(data)
df.label.value_counts()

ham      4827
spam      747
label       1
Name: label, dtype: int64

In [3]:
# Drop the CSV header row, which the line-by-line parser ingested as a record.
# .copy() yields an independent frame, so the column assignment further down
# writes to this frame and not to a slice of the unfiltered one.
df = df[df.label != 'label'].copy()

# Per-class subsets, taken while the labels are still the original strings.
# They keep df's index labels.
df_spam = df[df.label == 'spam'].copy()
df_ham = df[df.label == 'ham'].copy()

# Encode the target numerically: ham -> 0, spam -> 1.
# Series.map turns every value missing from the mapping into NaN, so check
# explicitly instead of letting NaN targets reach the classifier.
label_codes = df['label'].map({'ham': 0, 'spam': 1})
if label_codes.isna().any():
    unexpected = sorted(df.loc[label_codes.isna(), 'label'].astype(str).unique())
    raise ValueError(
        'unexpected label values {} (was this cell run twice?)'.format(unexpected)
    )
df['label'] = label_codes

In [4]:
# Built once when the cell runs instead of on every call: the stop-word list
# would otherwise be rebuilt for each message, and set membership avoids a
# linear scan of that list for every token.
_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess(text):
    """Turn a raw message into a list of normalised tokens.

    Steps, in order: lower-case the text, tokenize it with
    ``nltk.word_tokenize``, drop tokens of length 1, drop English stop
    words, and Porter-stem whatever remains.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (duplicates are kept).
    """
    tokens = nltk.word_tokenize(text.lower())
    return [
        _STEMMER.stem(token)
        for token in tokens
        # Stop words are filtered before stemming, i.e. on the surface form.
        if len(token) > 1 and token not in _STOP_WORDS
    ]
    
# Tokenise every message exactly once; the per-message counters and the
# per-class word lists below are all derived from this single pass.
tokens_by_row = df.text.map(preprocess)

# One bag-of-words Counter per message, in the same row order as df.
all_word_count_dicts = [Counter(tokens) for tokens in tokens_by_row]

# df_spam / df_ham are row subsets of df that keep df's index labels, so
# .loc picks out exactly their messages, in their original order.
spam_words = list(chain.from_iterable(tokens_by_row.loc[df_spam.index]))
ham_words = list(chain.from_iterable(tokens_by_row.loc[df_ham.index]))

# Flat list of every token in the corpus: spam tokens first, then ham tokens.
all_words = spam_words + ham_words

In [5]:
# Corpus-wide token frequencies.
word_count_dict = Counter(all_words)
# (token, count) pairs, most frequent first. Bound to a name so the ranking
# can actually be inspected, e.g. sorted_word_counts[:20].
sorted_word_counts = word_count_dict.most_common()
# View over the distinct tokens (the vocabulary).
all_tokens = word_count_dict.keys()
all_tokens

dict_keys(['ac/smsreward', 'centuri', '18p/txt', 'ktv', 'hello-', '09050280520', 'kochi', 'eldest', 'crowd', 'juliana', '20', 'sparkl', 'pataistha', 'date', 'badli', 'dai', 'hooch', 'inch', 'sch', 'peopl', 'soladha', 'lunch', 'okie..', 'master', 'septemb', '89080', 'bluetooth', 'howdi', 'accid', 'girl..', 'inc', 'rakhesh', '*possess', 'yes.h', 'calls£1/minmoremobsemspobox45po139wa', 'bead', '£900', 'tiempo', 'aight', 'shelv', 'aiyar', 'risk', 'uwant', 'effect', 'swing', 'railway', '86021', 'kiefer.com', 'voicemail', '=d', 'boytoy', 'name..', 'is-', 'paranoid', 'ro', 'uawak', 'approx', 'interview', 'desert', 'employ', 'www.sms.ac/u/natalie2k9', 'obvious', 'there.goodnight', 'clock', 'furnitur', 'prove', 'henc', 'available..i', 'rr..', 'spin', 'dabook', 'wrote', 'oz', 'progress', '140', '£1.50/msg', 'csh11', 'lido', '150p/text', 'theyr', 'jen', 'elect', 'urgran', 'good.environ', '63mile', 'hmmm', '*teas', 'lavend', 'slove', 'plyr', 'durham', 'teletext', 'unjalur', 'somon', 'pete', 'publi

In [6]:
# Turn the per-message token counters into a sparse document-term matrix
# (one row per message, one column per distinct token).
dv = DictVectorizer(sparse=True)
all_features = dv.fit_transform(all_word_count_dicts)

# split dataset into train and test set (80% and 20%)
TEST_FRACTION = 0.2
SPLIT_SEED = 0  # fixed seed so the split is reproducible

# Row positions are split together with the data, so every train/test row
# can be traced back to its position in df via i_train / i_test.
indices = np.arange(df.shape[0])
(features_train, features_test,
 target_train, target_test,
 i_train, i_test) = model_selection.train_test_split(
    all_features, df.label, indices,
    test_size=TEST_FRACTION, random_state=SPLIT_SEED, shuffle=True)

# The split already returns the feature rows for i_train / i_test, so there
# is no need to slice all_features again.
assert features_train.shape[0] == len(target_train) == len(i_train)
assert features_test.shape[0] == len(target_test) == len(i_test)


In [7]:
# Multinomial Naive Bayes on the raw token counts, with the estimator's
# default hyper-parameters. fit() is left as the last expression so the
# fitted estimator is displayed as the cell output.
clf = MultinomialNB()
clf.fit(X=features_train, y=target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
# Predicted class (0 = ham, 1 = spam) for every held-out message.
prediction = clf.predict(X=features_test)

In [13]:
# Per-class precision / recall / F1 on the held-out set. The report is a
# pre-formatted string, so it is printed instead of displayed as a repr.
print(classification_report(y_true=target_test, y_pred=prediction))

             precision    recall  f1-score   support

          0       0.99      0.98      0.98       950
          1       0.87      0.95      0.91       165

avg / total       0.97      0.97      0.97      1115

