In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
# Column labels to attach to the two tab-separated fields.
names = ['Label', "Message"]
data = pd.read_csv(
    r"SMSSpamCollection.csv",
    sep="\t",
    header=None,  # same as the default when `names` is given: first line is data
    names=names,
)
data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Per-column summary: row count, number of distinct values, the most frequent
# value and how often it occurs.
data.describe()

Unnamed: 0,Label,Message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
# Number of messages in each class (ham vs. spam) before de-duplication.
data.groupby("Label").count()

Unnamed: 0_level_0,Message
Label,Unnamed: 1_level_1
ham,4825
spam,747


In [7]:
# duplicated() yields one boolean per row; .sum() counts the True flags, i.e.
# the rows that repeat an earlier row. .count() only counted the flags
# themselves, so it always returned the total number of rows.
data.duplicated().sum()

403

In [8]:
# Working frame: exact duplicate rows removed, first occurrence kept.
X = data.drop_duplicates(keep="first")


In [9]:
# Peek at the de-duplicated frame; original row labels are retained.
X.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# (rows, columns) after dropping duplicate rows.
X.shape


(5169, 2)

In [11]:
# Per-class summary of the Message column (count / unique / top / freq).
X.groupby('Label').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4516,4516,Hello which the site to download songs its urg...,1
spam,653,653,Sorry! U can not unsubscribe yet. THE MOB offe...,1


In [12]:
# Add the message length (in characters) as a new column.
# .assign() builds a new frame rather than writing into the result of
# drop_duplicates(), which is what triggered SettingWithCopyWarning.
# .str.len() is the vectorised form of .apply(len).
X = X.assign(length=X['Message'].str.len())


In [13]:
# Confirm the new `length` column is present next to Label and Message.
X.head()

Unnamed: 0,Label,Message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [14]:
# Same row count as before, one extra column (`length`).
X.shape

(5169, 3)

In [15]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance shared by the text-cleaning code below.
ps = PorterStemmer()

In [16]:
# Scratch check on one message (row label 2): delete every character that
# appears in string.punctuation and show what is left.
temp = X['Message'][2].translate(str.maketrans("", "", string.punctuation))
temp

'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s'

In [17]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean(X):
    """Normalise one raw message into a space-separated string of stems.

    Steps, in order: drop every character found in ``string.punctuation``,
    lower-case, split on whitespace, discard English stop words, then
    Porter-stem each remaining word with the shared ``ps`` stemmer.

    Parameters
    ----------
    X : str
        A single message. The name shadows the notebook-level DataFrame ``X``
        inside this function only; it is kept so existing calls keep working.

    Returns
    -------
    str
        The surviving stems joined by single spaces (empty string if no word
        survives).
    """
    # The stop-word list used to be re-loaded and scanned linearly for every
    # word; a cached set gives the same answer with one load and O(1) lookups.
    stop_words = _english_stopwords()

    # Punctuation is stripped *before* the stop-word test, so a contraction
    # such as "don't" becomes "dont" and is kept (visible in the cleaned data).
    no_punct = "".join(char for char in X if char not in string.punctuation)
    words = no_punct.lower().split()
    return " ".join(ps.stem(word) for word in words if word not in stop_words)

In [18]:
# Replace each raw message with its cleaned, stemmed form.
# .assign() returns a new frame instead of writing into a frame derived from
# drop_duplicates(), which is what triggered SettingWithCopyWarning.
X = X.assign(Message=X['Message'].apply(clean))


In [19]:
# Messages are now lower-cased, punctuation-free and stemmed; `length` still
# holds the character count of the original text.
X.head()

Unnamed: 0,Label,Message,length
0,ham,go jurong point crazi avail bugi n great world...,111
1,ham,ok lar joke wif u oni,29
2,spam,free entri 2 wkli comp win fa cup final tkt 21...,155
3,ham,u dun say earli hor u c alreadi say,49
4,ham,nah dont think goe usf live around though,61


In [38]:
# Full cleaned text of the message with row label 2 (the first spam example).
X.loc[2, 'Message']

'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18'

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser with default settings.
cv=CountVectorizer()

In [22]:
# Learn the vocabulary from the cleaned messages and build the document-term
# count matrix: one row per message, one column per vocabulary token.
X_count=cv.fit_transform(X['Message'])
X_count.shape

(5169, 8098)

In [23]:
# Convert the count matrix to a dense ndarray.
# Guarded so the cell can be re-run: once X_count is already an ndarray it has
# no .toarray(), and the unguarded call raised AttributeError on a second run.
# NOTE(review): the dense copy may be unnecessary if the later steps accept
# sparse input — confirm before removing it.
if hasattr(X_count, "toarray"):
    X_count = X_count.toarray()

In [49]:
# Preview a few token -> column-index entries instead of printing the whole
# vocabulary (one entry per matrix column, i.e. thousands of items).
print(dict(list(cv.vocabulary_.items())[:5]))

{'go': 3315, 'jurong': 4097, 'point': 5587, 'crazi': 2234, 'avail': 1335}

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer

In [25]:
# TF-IDF re-weighting transformer with default settings.
tf=TfidfTransformer()

In [26]:
# Fit the transformer on the raw counts and convert them to TF-IDF weights;
# the shape matches the count matrix.
X_tf=tf.fit_transform(X_count)
X_tf.shape

(5169, 8098)

In [27]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on every TF-IDF row, using the
# Label column as the target.
clf = MultinomialNB().fit(X_tf,X['Label'])

In [28]:
# Mean accuracy on the same rows the classifier was fitted on.
# NOTE(review): this is training-set accuracy; no held-out split is visible
# here, so it does not estimate performance on unseen messages.
clf.score(X_tf,X['Label'])

0.975817372799381

In [29]:
# Predicted label for every training row.
prediction=clf.predict(X_tf)


In [30]:
# Fraction of predictions that match the true labels. Scoring the model
# against its own predictions (clf.score(X_tf, prediction)) is 1.0 by
# construction and says nothing about accuracy.
(prediction == X['Label']).mean()

0.975817372799381

In [82]:
# Hand-written message to classify. It is typed in already-stemmed form
# ("entri", "wkli") and is not passed through clean().
new_doc=['Free free entri 2 wkli comp win pizza ']
# transform (not fit_transform): reuse the vocabulary learned from the corpus.
new_cv=cv.transform(new_doc)

In [83]:
# Apply the already-fitted TF-IDF weighting, then classify the new message.
new_tf=tf.transform(new_cv)
# Note: this rebinds `prediction`, replacing the training-set predictions.
prediction=clf.predict(new_tf)

In [84]:
# Predicted label for the single new message.
prediction

array(['spam'], dtype='<U4')