# Spam Classification Using Bernoulli Naive Bayes Classifier

In [93]:
# Importing Necessary Packages:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection  import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB

In [14]:
# Importing Dataset
data = pd.read_csv("spambase.data", header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [26]:
# The documentation says that only first 48 columns are continuous real [0,100] attributes of type word_freq_WORD
# Hence we will be only keeping those columns for training our model and will be discarding the rest.
features = data.iloc[:, 0:48]
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Also, as per the documentation, the last column denotes denotes whether the e-mail was considered spam (1) or not (0).
# Creating a y_class variable for the same by slicing just the last column of all the rows.
# Please Note that integer value of Spam is 1 and Ham is 0; as mentioned above.

labels = data.iloc[:, -1]
labels.head()

0    1
1    1
2    1
3    1
4    1
Name: 57, dtype: int64

In [45]:
# Calculating the percentage of Spam in the dataset:
ham_spam=labels.value_counts()
print(ham_spam, "\n")
print("Spam % is ",(ham_spam[1]/float(ham_spam[0]+ham_spam[1]))*100)

0    2788
1    1813
Name: 57, dtype: int64 

Spam % is  39.404477287546186


### Bernoulli Naive Bayes Classifier


In [190]:
## Performing k-fold(7 folds) classification using Bernoulli Naive Bayes Classifier
kf = KFold(n_splits=7) 

## Creating an empty list for storing the perfromance metrics for each fold
lst = []

## For Each fold, we will train the model, predict the results and will record the performance metrics.
for train_index, test_index in kf.split(data):
    X_train = features.iloc[train_index, :]
    Y_train = labels[train_index]
    X_test = features.iloc[test_index, :]
    Y_test = labels[test_index]
    
    #Initialising Bernoulli NB Classifier
    bnb = BernoulliNB()
    
    # fit
    bnb.fit(X_train,Y_train)

    # predict class
    y_pred_class = bnb.predict(X_test)

    # Accuracy
    accuracy = metrics.accuracy_score(Y_test, y_pred_class)
    accuracy = round(accuracy*100,2)
    
    # Error
    error = metrics.mean_absolute_error(Y_test, y_pred_class)
    error = round(error*100,2)
    
    confusion = metrics.confusion_matrix(Y_test, y_pred_class)
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    TP = confusion[1, 1]
    
    lst.append([TP,FP,TN,FN,accuracy, error])
    
## Column headers for each performance metrics
cols = ["True Positive", "False Positive", "True Negative", "False Negative", "Accuracy", "Error"]

## Creating the kfold data table which will have all the necessary performance metrics information for each fold.
kfold_table = pd.DataFrame(lst, columns=cols)
kfold_table

Unnamed: 0,True Positive,False Positive,True Negative,False Negative,Accuracy,Error
0,509,0,0,149,77.36,22.64
1,554,0,0,104,84.19,15.81
2,369,7,153,128,79.45,20.55
3,0,32,625,0,95.13,4.87
4,0,43,614,0,93.46,6.54
5,0,29,628,0,95.59,4.41
6,0,151,506,0,77.02,22.98


In [188]:
averageAccuracy = kfold_table.loc[:, "Accuracy"].mean()
print("Average Accuracy of the Model is:", averageAccuracy)

Average Accuracy of the Model is: 86.02857142857142


In [189]:
averageErrors = kfold_table.loc[:, "Error"].mean()
print("Average Error of the Model is:", averageErrors)

Average Error of the Model is: 13.971428571428572


Thus, the metrics indicates that model can precisely classify spam emails with maximum accuracy of around 95% and an average accuracy of around 86%.

## THANK YOU.