# Email Spam Detector using Naive Bayes Algorithm 

In [1]:
import pandas as pd
## Location of the dataset, relative to the notebook's working directory.
DATA_PATH = 'spam.csv'

## Read the messages into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
## Summary statistics per label: how many messages each category has, how many are
## distinct, and which message is the most frequent.
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


So here we have 4825 legitimate (ham) messages and 747 spam messages.

### The Category column stores `spam` and `ham` as strings, so we need to convert it to integers: 1 for spam and 0 for ham

In [4]:
## Encode the label as an integer column: 1 where Category is 'spam', 0 otherwise (ham).
## A vectorized comparison does this for the whole column at once instead of calling a
## Python lambda per row with apply(); dummy variables (pd.get_dummies) would also work.
df['spam'] = (df['Category'] == 'spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Now split the dataset into training and test sets

In [5]:
from sklearn.model_selection import train_test_split
## Hold out 20% of the messages for testing.
## random_state pins the shuffle so the split, and the scores computed from it, are the
## same on every Restart-and-Run-All. stratify keeps the ham/spam ratio equal in the
## train and test sets, which matters because spam is the much smaller class.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.2,
    random_state=42,
    stratify=df.spam,
)

### Using CountVectorizer to convert the messages into numeric word counts

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
## Learn the vocabulary from the training messages only and encode them as a sparse
## document-term count matrix (one row per message, one column per vocabulary term).
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
## Preview two rows. Slice the sparse matrix first and densify only those rows:
## calling .toarray() on the full matrix would allocate a dense
## (messages x vocabulary) array just to display two of its rows.
X_train_count[:2].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(2, 7792))

**Training our model**

In [7]:
from sklearn.naive_bayes import MultinomialNB
## Multinomial naive Bayes classifier, created with scikit-learn's default settings.
model = MultinomialNB()

In [9]:
## Fit the classifier on the training count matrix and the matching 0/1 spam labels.
model.fit(X_train_count, y_train)

In [12]:
## Two hand-written messages for a quick sanity check: a casual note and a promotional offer.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
]

## Encode with the already-fitted vectorizer (transform only, no refit) so the columns
## match the vocabulary the model was trained on.
emails_count = v.transform(emails)

## Predicted labels: 0 = ham, 1 = spam.
model.predict(emails_count)

array([0, 1])

In [13]:
## Encode the held-out messages with the fitted vectorizer (no refit on test data),
## then measure the model's accuracy on them.
X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

0.9838565022421525

### Using the sklearn pipeline to simplify the code

# sklearn Pipeline

In [15]:
from sklearn.pipeline import Pipeline
## Each step is a (name, estimator) pair, run in order: text -> token counts -> classifier.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

## The pipeline converts each message into word counts and passes them to the multinomial naive Bayes classifier

In [17]:
## The pipeline vectorizes the raw text itself, so the unencoded X_train is passed directly.
clf.fit(X_train, y_train)

In [18]:
## Check the accuracy on the held-out test messages (raw text in; the pipeline
## applies its own fitted vectorizer before scoring).
clf.score(X_test, y_test)

0.9838565022421525

In [19]:
## Raw strings go straight in; the pipeline vectorizes them before predicting (0 = ham, 1 = spam).
clf.predict(emails)

array([0, 1])