## Spam Detection using Naive Bayes:

#### Loading the data:

In [30]:
import pandas as pd

In [31]:
# Read the labelled messages (columns: Category, Message). The path is
# relative, so spam.csv has to be in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='spam.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [32]:
# Summary of the text columns: row count, number of distinct values, and the
# most frequent value with its frequency.
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [33]:
# Same summary, broken out per label, to compare how many ham and spam
# messages there are and how many of them are duplicates.
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


#### Preprocessing the data:

In [34]:
# Binary target: 1 when Category is exactly 'spam', 0 for anything else.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the column dtype identical to what .apply produced.
df['spam'] = (df['Category'] == 'spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [35]:
# Features are the raw message text; the target is the 0/1 spam flag.
# Bracket indexing is used so the lookup cannot be confused with a
# DataFrame attribute or method.
X, y = df['Message'], df['spam']

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation. Stratifying on y keeps the
# ham/spam ratio the same in both splits, and the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

#### Vectorizing the text data:

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings; it is fitted on the training messages
# below and then reused, unchanged, for the test set and the sample emails.
v = CountVectorizer()

In [38]:
# Learn the vocabulary from the training messages only and encode them in
# the same step. The Series is passed directly (the vectorizer just iterates
# over the documents), matching how X_test and the pipeline are fed.
X_train_vectorized = v.fit_transform(X_train)

In [39]:
# Peek at the first two encoded messages. Slice the sparse matrix first and
# densify only those two rows, instead of materialising the full dense
# training matrix just to display a preview.
X_train_vectorized[:2].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### Model Building:

In [40]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [41]:
# Multinomial Naive Bayes with default hyperparameters; it is trained on the
# count features produced by the vectorizer.
model = MultinomialNB()

In [42]:
# Train on the vectorized training messages and their 0/1 spam labels.
model.fit(X=X_train_vectorized, y=y_train)

In [43]:
# Encode the held-out messages with the already-fitted vectorizer:
# transform (not fit_transform), so the test set never influences the
# vocabulary learned from the training data.
X_test_vectorized = v.transform(raw_documents=X_test)

In [46]:
# Score the trained model on the held-out split.
# NOTE(review): for classifiers `score` reports accuracy per the scikit-learn
# docs; with far more ham than spam in this dataset, precision/recall on the
# spam class would be worth checking as well.
model.score(X=X_test_vectorized, y=y_test)

0.9775784753363229

In [44]:
# Two hand-written messages for a quick sanity check of the classifier:
# the first is an ordinary personal message, the second a promotional offer.
emails = [
    "Hey John, can we watch football today?",
    "Upto 30% discount on store purchase just for you. Dont miss this reward!",
]

In [45]:
# Encode the sample messages with the vocabulary fitted on the training set.
emails_vectorized = v.transform(raw_documents=emails)

In [47]:
# Predicted labels for the sample messages, in input order (1 = spam, 0 = ham).
model.predict(X=emails_vectorized)

array([0, 1])

#### Pipeline:

In [48]:
from sklearn.pipeline import Pipeline

In [49]:
# Chain the vectorizer and the classifier into a single estimator so raw
# text can be passed straight to fit / score / predict without a separate,
# manual transform step.
pipeline_model = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [51]:
# Fit both steps on the raw training text and its labels.
pipeline_model.fit(X_train, y=y_train)

In [52]:
# Score the pipeline on the raw held-out text; the result should match the
# score obtained from the manual vectorize-then-classify steps.
pipeline_model.score(X_test, y=y_test)

0.9775784753363229

In [53]:
# Classify the raw sample messages directly; the pipeline vectorizes them
# internally before predicting (1 = spam, 0 = ham).
pipeline_model.predict(X=emails)

array([0, 1])