<center>
    <h1><b>Naive Bayes - Email Spam Prediction</b></h1>
    -------------------------------------
</center>

In [47]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

%matplotlib inline

In [15]:
# Location of the spam dataset. The original local path stays as the default;
# set the SPAM_CSV_PATH environment variable to run the notebook on another machine.
DATA_PATH = os.environ.get(
    'SPAM_CSV_PATH',
    r'D:\AI Engineering\Python\My_Projects\Datasets\spam.csv',
)

# Load only the label column (v1) and the message text column (v2),
# decoding the file as ISO-8859-1.
data = pd.read_csv(DATA_PATH, usecols=['v1', 'v2'], encoding='ISO-8859-1')
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
# Give the raw v1/v2 columns descriptive names
column_names = {'v1': 'category', 'v2': 'message'}
df = data.rename(columns=column_names)
df.head()

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
# Summarise the messages per category (count, unique, top, freq)
by_category = df.groupby('category')
by_category.describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [23]:
# Encode the label as a binary target: 1 for 'spam', 0 for anything else.
# A vectorized comparison avoids calling a Python function once per row;
# int64 is requested explicitly so the target dtype does not depend on the
# platform's default integer width.
df['spam'] = df['category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,category,message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [24]:
# Hold out 25% of the messages for evaluation; the fixed random_state
# makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['spam'],
    test_size=0.25,
    random_state=0,
)

### Using vectorization to convert the words in the message column to numeric counts

In [27]:
# Learn the vocabulary from the training messages and encode each message
# as a row of token counts.
vector = CountVectorizer()
X_train_count = vector.fit_transform(X_train.values)

# Preview the first three rows. Slicing before toarray() means only those
# three rows are converted to a dense array instead of the whole matrix.
X_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [39]:
# Peek at ten of the tokens the vectorizer learned (positions 1000-1009)
feature_names = vector.get_feature_names_out()
corpus = feature_names[1000:1010]
corpus

array(['arnt', 'around', 'aroundn', 'arrange', 'arranging', 'arrested',
       'arrival', 'arrive', 'arrived', 'arrow'], dtype=object)

### Building the Model

In [41]:
# Train a multinomial Naive Bayes classifier on the token counts.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_count, y_train)
model

In [43]:
# Two hand-written sample messages for a quick sanity check of the classifier
emails = [
    "Hey mohan, can we get together to watch football game tomorrow?",
    "Upto 20% discount on parking, exclusive offer just for you. Don't miss this reward!",
]

# Encode the samples with the already-fitted vectorizer, then predict
# (1 = spam, 0 = ham)
emails_count = vector.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [46]:
# Evaluating the model accuracy on the held-out messages.
# The test text goes through the already-fitted vectorizer (transform only).
X_test_count = vector.transform(X_test)

# Built-in round() works for numpy scalars and plain Python floats alike,
# whereas the .round() method exists only on numpy scalars.
f'{round(model.score(X_test_count, y_test) * 100, 2)}%'

'98.64%'

## Using the Pipeline API to Build the Model

In [48]:
# Chain the vectorizer and the classifier into a single estimator so raw
# text can be passed straight to fit / predict / score.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(pipeline_steps)

In [49]:
# Fit the whole pipeline on the raw training messages
clf.fit(X=X_train, y=y_train)

In [56]:
# Model evaluation: the pipeline is scored directly on the raw test messages.
# Built-in round() accepts numpy scalars and plain Python floats alike,
# whereas the .round() method exists only on numpy scalars.
f'Model Accuracy: {round(clf.score(X_test, y_test) * 100, 2)}%'

'Model Accuracy: 98.64%'

In [57]:
# Classify the sample messages directly from raw text
clf.predict(X=emails)

array([0, 1], dtype=int64)