In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np


In [103]:
data_frame = pd.read_csv(r"C:\Users\DIU\Desktop\spambase.csv") # read the .csv file from PC

In [104]:
data_frame.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,;,(,[,!,$,#,length_average,length_longest,lenth_total,spam or not
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [105]:
# according to the documentation the first 48 columns are continuous real [0,100] attributes of type word_freq_WORD 
# so we only use first 48 columns amongs all rows 
# this 48 rows are the features 


# first argument denotes rows range, and second for the columns range
# here i select all the rows by providing nothing, and select 0 to 48 columns

features = data_frame.iloc[: , 0:48] 

> #### Read the Documentation from [here](https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.DOCUMENTATION)

In [107]:
features.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,pm,direct,cs,meeting,original,project,re,edu,table,conference
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
# according to the documentation the last columns is for the label contains whether a email spam or not
# represent as binary format, 0 denotes 'not spam', 1 denotes 'spam'

labels = data_frame.iloc[:, -1] # selecting only the last column among all the rows

In [116]:
labels.head()

0    1
1    1
2    1
3    1
4    1
Name: spam or not, dtype: int64

In [117]:
labels.tail()

4596    0
4597    0
4598    0
4599    0
4600    0
Name: spam or not, dtype: int64

In [233]:
split_size = .33 # means 33% test data and 67% will be train data

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = split_size, random_state=17)

In [234]:
print ("Number of trainning data : ", len(features_train))

print("Percentage of training data: ",len(features_train) * 100 / len(features), "%")

print ("Number of trainning data : ", len(labels_test))

print("Percentage of training data: ",len(labels_test) * 100 / len(features),"%")

Number of trainning data :  3082
Percentage of training data:  66.98543794827212 %
Number of trainning data :  1519
Percentage of training data:  33.014562051727886 %


## Multinomial Naive bayes Classifier

In [235]:
mlnNB = MultinomialNB()

mlnNB.fit(features_train ,labels_train)

pred_on_test_data = mlnNB.predict(features_test)

acc_score = accuracy_score(pred_on_test_data, labels_test)

print ("Accuracy score in percentage : " , acc_score, "%")


Accuracy score in percentage :  0.8736010533245556 %


### Quite good accuracy ! But it needs to improve

>  split_size = .10

In [236]:
# firstly trying to change the split size into 10, so lets see what happens

split_size = .10 # means 33% test data and 67% will be train data

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = split_size, random_state=17)

mlnNB.fit(features_train ,labels_train)

pred_on_test_data = mlnNB.predict(features_test)

mln_pred = pred_on_test_data # 'mln_pred' will be use letter for performance measure

acc_score = accuracy_score(pred_on_test_data, labels_test)

print ("Accuracy score in percentage : " , acc_score, "%")

print (len(labels_test))

Accuracy score in percentage :  0.89587852494577 %
461


### Great result! Lets try another classifier:

## Bernoulli Naive Bayes Classifier

In [242]:
# Now lets try with another classifier which is Bernoulli Naive Bayes

bnlNB = BernoulliNB(alpha = 1.0) # alpha '1' means smooth '0' means no smoothing

bnlNB.fit(features_train, labels_train)

pred_from_test_data = bnlNB.predict(features_test)

bln_pred = pred_from_test_data # 'bln_pred' will be use letter for performance measure

score = accuracy_score(pred_from_test_data, labels_test)


In [243]:
print ("Accuracy score in Bernoulli Classifier : ", score, "%")

Accuracy score in Bernoulli Classifier :  0.911062906724512 %


### Wow ! That's Cool..

In [246]:
# Now lets try with another classifier which is Gaussian Naive Bayes

gNB = GaussianNB() # alpha '1' means smooth '0' means no smoothing

gNB.fit(features_train, labels_train)

pred_from_test_data = gNB.predict(features_test)

g_pred = pred_from_test_data # 'g_pred' will be use letter for performance measure

score = accuracy_score(pred_from_test_data, labels_test)

In [247]:
print ("Accuracy score in Gaussian Classifier : ", score, "%")

Accuracy score in Gaussian Classifier :  0.8459869848156182 %


## Accuracy Summary:

* Multinomial Naive Bayes Classifier Accuracy: **0.89587852494577%**
* Bernoulli Naive Bayes Classifier Accuracy: **0.911062906724512%**
* Gaussian Naive Bayes Classifier Accuracy: **0.8459869848156182%**

## Perfromance Testing of Multinomial Naive Bayes Classifier

In [240]:
# now i need to test the performance of this Model

# first lets check the confusion metrix 

from sklearn.metrics import confusion_matrix
print ("Total test records: ", len(labels_test))
confusion_matrix(mln_pred, labels_test, labels=[1,0])

Total test records:  461


array([[193,  43],
       [  5, 220]], dtype=int64)

### Confusion Matrix result:
* True Positive : **193**
* False Positive : **43**
* False Negative: **5**
* True Negative : **220**



In [241]:
from sklearn.metrics import classification_report

print (classification_report(mln_pred, labels_test, labels=[1,0]))

             precision    recall  f1-score   support

          1       0.97      0.82      0.89       236
          0       0.84      0.98      0.90       225

avg / total       0.91      0.90      0.90       461



## Classification Report:

* precision : **0.91**
* recall : **0.90**

## Perfromance Testing of Bernoulli Naive Bayes Classifier

In [244]:
from sklearn.metrics import confusion_matrix
print (len(labels_test))

confusion_matrix(bln_pred, labels_test, labels=[1,0])

461


array([[172,  15],
       [ 26, 248]], dtype=int64)

### Confusion Matrix result:
* True Positive : **172**
* False Positive : **15**
* False Negative: **26**
* True Negative : **248**

In [245]:
from sklearn.metrics import classification_report

print (classification_report(bln_pred, labels_test, labels=[1,0]))

             precision    recall  f1-score   support

          1       0.87      0.92      0.89       187
          0       0.94      0.91      0.92       274

avg / total       0.91      0.91      0.91       461



## Classification Report:

* precision : **0.91**
* recall : **0.91**

## Perfromance Testing of Gaussian Naive Bayes Classifier

In [248]:
from sklearn.metrics import confusion_matrix
print (len(labels_test))

confusion_matrix(g_pred, labels_test, labels=[1,0])

461


array([[195,  68],
       [  3, 195]], dtype=int64)

### Confusion Matrix result:
* True Positive : **195**
* False Positive : **68**
* False Negative: **3**
* True Negative : **195**

In [249]:
from sklearn.metrics import classification_report

print (classification_report(g_pred, labels_test, labels=[1,0]))

             precision    recall  f1-score   support

          1       0.98      0.74      0.85       263
          0       0.74      0.98      0.85       198

avg / total       0.88      0.85      0.85       461



## Classification Report:

* precision : **0.88**
* recall : **0.85**

# Accuracy and performance(Confusion Matrix and Classification Report) Summary

### Accuracy:

* Multinomial Naive Bayes Classifier Accuracy: **0.89587852494577%**
* Bernoulli Naive Bayes Classifier Accuracy: **0.911062906724512%**
* Gaussian Naive Bayes Classifier Accuracy: **0.8459869848156182%**

### Performance:
##### Multinomial Naive Bayes
* Total truthy result : **413**
* Total falsy result : **48** 

##### Burnoulli Naive Bayes
* Total truthy result : **420**
* Total falsy result : **41**

##### Gaussian Naive Bayes
* Total truthy result : **390**
* Total falsy result : **71**


### Classification Report:
##### Multinomial Naive Bayes
* precision : **0.91**
* recall : **0.90**

##### Burnoulli Naive Bayes
* precision : **0.91**
* recall : **0.91**

##### Gaussian Naive Bayes
* precision : **0.88**
* recall : **0.85**






### So according to the result summary i can conclude that Burnoulli Naive Bayes Classifier will gives the optimal and best result