## Analysis of spam SMS messages (data from [here](https://archive.ics.uci.edu/ml/machine-learning-databases/00228/))

In [1]:
import pandas as pd
import sklearn

%matplotlib inline

In [2]:
df=pd.read_csv('SMSSpamCollection',sep='\t',header=None,names=['class','text'])

In [3]:
df.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Split into test data and training data

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], test_size=0.25)

## Some Pre-processing

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

In [7]:
count_vect = CountVectorizer()

In [8]:
X_train_counts = count_vect.fit_transform(X_train)

In [9]:
list(count_vect.vocabulary_.items())[0:3]

TypeError: 'dict_items' object is not subscriptable

In [36]:
len(count_vect.vocabulary_)

7402

In [56]:
lab_bin=LabelBinarizer()
y_train_bin=lab_bin.fit_transform(y_train)
y_test_bin=lab_bin.fit_transform(y_test)

## Train

In [58]:
from sklearn.naive_bayes import MultinomialNB

In [59]:
clf = MultinomialNB().fit(X_train_counts, y_train_bin)

In [60]:
len(clf.coef_[0])

7402

In [40]:
import collections

In [61]:
importanceCount=collections.Counter()

In [62]:
for word,imp in zip(count_vect.vocabulary_.keys(),clf.coef_[0]):
    importanceCount[word]=imp

In [81]:
importanceCount.most_common()[-10:]

[(u'1yf', -9.9016859305894993),
 (u'incredible', -9.9016859305894993),
 (u'other', -9.9016859305894993),
 (u'sick', -9.9016859305894993),
 (u'conclusion', -9.9016859305894993),
 (u'june', -9.9016859305894993),
 (u'margaret', -9.9016859305894993),
 (u'kinda', -9.9016859305894993),
 (u'itcould', -9.9016859305894993),
 (u'downstem', -9.9016859305894993)]

## Now test

In [64]:
X_test_counts = count_vect.transform(X_test)

In [65]:
pred=clf.predict(X_test_counts)

In [66]:
from sklearn.metrics import average_precision_score

In [68]:
average_precision_score(y_test_bin,pred)

0.92152096592172206

## Sanity check

In [71]:
clf.predict(count_vect.transform(['win big on this offer']))

array([1])

In [72]:
clf.predict(count_vect.transform(['hi how are you? shall we meet up soon?']))

array([0])