In [36]:
#importing required libraries
import pandas as pd
import string
from nltk.corpus import stopwords

In [37]:
#load the tab-separated spam collection; column names are supplied explicitly,
#so the first line of the file is read as data rather than as a header
df = pd.read_csv(
    'SpamCollection',
    sep="\t",
    names=["Response", "Message"],
)

In [38]:
#preview the first rows of the loaded frame
df.head()

Unnamed: 0,Response,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [39]:
#summary statistics per column (count, unique, top, freq)
df.describe()

Unnamed: 0,Response,Message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [40]:
#summarise the messages separately for each Response class
df.groupby('Response').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [41]:
#store each message's character count in a new Length column
df['Length'] = df['Message'].map(len)

In [42]:
#confirm the Length column now appears alongside Response and Message
df.head()

Unnamed: 0,Response,Message,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [43]:
#define a function to get rid of stopwords present in the messages
_ENGLISH_STOPWORDS = None  #filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stopword list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(message):
    """Strip punctuation from ``message`` and drop English stopwords.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list of str
        The whitespace-separated tokens that remain, in their original order
        and original letter case.

    Notes
    -----
    The stopword list is fetched once and cached as a frozenset; previously it
    was re-read and scanned as a list for every single word, which dominated
    the run time when this function is used as a vectorizer analyzer.

    Matching is case-sensitive: each token is compared exactly as written, so
    a capitalised token such as "I" is kept. NOTE(review): lower-casing the
    token before the lookup would change the learned vocabulary - confirm
    before changing.
    """
    stop_words = _english_stopwords()
    #string.punctuation characters are deleted outright, so "don't" becomes "dont"
    no_punctuation = "".join(char for char in message if char not in string.punctuation)
    return [word for word in no_punctuation.split() if word not in stop_words]

In [44]:
#try the function on the first few messages as a quick sanity check
df["Message"].head().map(remove_stopwords)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, I, dont, think, goes, usf, lives, around...
Name: Message, dtype: object

In [45]:
#build a count vectorizer that uses remove_stopwords as its analyzer,
#i.e. the function is called on each raw message to produce its tokens
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(analyzer=remove_stopwords)

In [46]:
#fit the vectorizer on every message so it learns the vocabulary
#NOTE: per the scikit-learn API, fit() returns the estimator itself, so bag_of_words
#refers to the same object as vectorizer rather than to a separate bag-of-words matrix
bag_of_words=vectorizer.fit(df['Message'])

In [47]:
#print the number of distinct tokens learned, read from the fitted vocabulary_ mapping
print(len(bag_of_words.vocabulary_))

11617


In [48]:
#convert every message into its token-count vector using the fitted vocabulary
BOW_transformed=bag_of_words.transform(df['Message'])

In [49]:
#fit a TF-IDF transformer on the token-count matrix
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer=TfidfTransformer()
#NOTE: fit() returns the transformer itself (scikit-learn API), so tfidf_tranformed
#presumably aliases tfidf_transformer; the misspelt name is kept because a later cell reads it
tfidf_tranformed=tfidf_transformer.fit(BOW_transformed)

In [54]:
#apply the fitted TF-IDF weighting to the count matrix and show the result's shape
message_tfidfTransformed=tfidf_tranformed.transform(BOW_transformed)
message_tfidfTransformed.shape

(5572, 11617)

In [55]:
#train a multinomial naive Bayes classifier on the TF-IDF features, with Response as the label
#NOTE(review): the model is fitted on the full data set; no hold-out split is made in this section
from sklearn.naive_bayes import MultinomialNB
spam_model=MultinomialNB().fit(message_tfidfTransformed,df['Response'])

In [60]:
#run one known message (row 2) through the already-fitted vectorizer and TF-IDF transformer
message = df.loc[2, 'Message']
message_vectorized = bag_of_words.transform([message])
message_tfidf = tfidf_transformer.transform(message_vectorized)

In [61]:
#predicted label for the selected message
spam_model.predict(message_tfidf)[0]

'spam'

In [63]:
#actual label recorded for the same row
df.loc[2, 'Response']

'spam'

In [65]:
#vectorize the message with the vocabulary already learned from the whole corpus;
#refitting here would only repeat the expensive fit on the same data
vectorized_message=vectorizer.transform([message])

In [73]:
#weight the counts with the IDF values learned from the whole corpus.
#Do not call fit() here: refitting on this single message would overwrite those
#IDF values, so the features would no longer match what spam_model was trained on
tfd_message=tfidf_transformer.transform(vectorized_message)

In [76]:
#predict the label from the TF-IDF features held in tfd_message
spam_model.predict(tfd_message)[0]

'spam'