##### Find spam emails using Naive Bayes

In [2]:
import pandas as pd

In [3]:
# Read the labelled messages from the CSV that sits next to this notebook,
# then preview the first three rows to check the columns loaded as expected.
df = pd.read_csv("./spam.csv")
df.head(n=3)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [4]:
# Encode the label as a binary target: 1 where Category is exactly 'spam',
# 0 for every other value. The vectorised comparison replaces the per-row
# Python lambda; the explicit int64 keeps the same dtype `.apply` produced,
# regardless of platform.
df['spam'] = (df['Category'] == 'spam').astype('int64')
df.head(4)

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0


In [5]:
# The text label is redundant once the numeric `spam` column exists.
# Rebind instead of mutating in place, and ignore an already-missing column,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['Category'], errors='ignore')
df.head(4)

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0


In [6]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. A fixed random_state makes the
# split (and therefore every score reported below) reproducible across
# kernel restarts; without it each run shuffles differently.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.Message, df.spam, random_state=42
)

#### The messages are text, but a model only understands numbers

So we will convert the messages to numbers using CountVectorizer(),
which stores the number of occurrences of each word in each message

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only and encode each
# message as a row of word counts (returned as a sparse matrix).
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)

# Preview just the first few rows: converting the whole sparse matrix to a
# dense array only for display allocates rows x vocabulary integers.
X_train_count[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

There are multiple types of Naive Bayes classifiers.
We will use the Multinomial one, because our features are word counts
![image](naivebyes.jpg)

In [8]:
from sklearn.naive_bayes import MultinomialNB

# Create the classifier with the library defaults and display the
# hyper-parameters it will use.
model = MultinomialNB()
model.get_params(deep=True)

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}

In [9]:
# Train on the word-count matrix built from the training messages.
model.fit(X=X_train_count, y=Y_train)

In [10]:
# Two hand-written examples: the first reads like a normal message, the
# second like a promotion.
emails = [
    "Hey mohan, can we get together to watch footbal game tomorrow?",
    "Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!",
]

# Encode with the already-fitted vectorizer (transform only, no refit) so the
# columns line up with what the model was trained on.
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [12]:
# Encode the held-out messages with the vocabulary learned from the training
# set, then score the model on them.
X_test_count = v.transform(X_test)
model.score(X=X_test_count, y=Y_test)

0.9856424982053122

### Here we see the model is working fine
But we have to transform the text with CountVectorizer again and again before every call.
We can fix that using a scikit-learn Pipeline

In [13]:
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so raw text can be passed straight
# to fit / predict / score without a manual transform step.
clf = Pipeline(
    steps=[
        ('Vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [14]:
# Fit both pipeline steps on the raw training messages.
clf.fit(X=X_train, y=Y_train)

In [15]:
# Score the pipeline on the raw held-out messages; vectorization happens
# inside the pipeline.
clf.score(X=X_test, y=Y_test)

0.9856424982053122