# NAIVE BAYES

**File:** NaiveBayes.ipynb

**Course:** Data Science Foundations: Data Mining in Python

# IMPORT LIBRARIES

In [None]:
import matplotlib.pyplot as plt                     # For plotting data
import pandas as pd                                 # For dataframes
import seaborn as sns                               # For plotting data
from sklearn.metrics import ConfusionMatrixDisplay  # Evaluation measures
from sklearn.naive_bayes import GaussianNB          # For naive Bayes classifier

# plot_confusion_matrix was removed in scikit-learn 1.2; keep the name
# available on older releases without breaking the import cell on newer ones.
try:
    from sklearn.metrics import plot_confusion_matrix  # Legacy evaluation helper
except ImportError:
    pass

# LOAD AND PREPARE DATA
Load the training data `trn` and testing data `tst` from the CSV files in the data directory. Separate the data matrix from the class variable.

In [None]:
# Imports the training data
trn = pd.read_csv('data/spambase_trn.csv')

# Separates the attributes X0-X56 into X_trn by keeping every column whose
# name contains a digit. The pattern is a raw string so the backslash reaches
# the regex engine untouched instead of being an invalid string escape.
X_trn = trn.filter(regex=r'\d')

# Separates the class variable into y_trn
y_trn = trn.y

# Imports the testing data
tst = pd.read_csv('data/spambase_tst.csv')

# Separates the attributes X0-X56 into X_tst (same column rule as training)
X_tst = tst.filter(regex=r'\d')

# Separates the class variable into y_tst
y_tst = tst.y

# Class labels used when displaying results
# NOTE(review): presumably ordered as class 0 = 'Not Spam', 1 = 'Spam' --
# confirm against the encoding of the `y` column.
spam = ['Not Spam','Spam']

Look at the first few rows of the training data.

In [None]:
# Preview the first five rows of the training data (five is head's default)
trn.head(n=5)

# NAIVE BAYES: TRAIN MODEL
The code below creates a `GaussianNB` object and fits it to the training data so that it can classify spam vs. not spam using the naive Bayes algorithm.

In [None]:
# Build the Gaussian naive Bayes classifier and fit it on the training
# attributes and labels; fit() returns the fitted estimator, which is kept as nb.
nb = (
    GaussianNB()
    .fit(X_trn, y_trn)
)

## Calculate Mean Accuracy on Training Data

In [None]:
# Report the model's score on the training data, formatted as a percentage
# with two decimal places
print('Accuracy on training data: {:.2%}'.format(nb.score(X_trn, y_trn)))

# TEST MODEL
In this phase, we test the naive Bayes model on the test set `tst`. A good evaluation measure is the confusion matrix, which gives the fraction of true positives, true negatives, false positives, and false negatives.

## Visualize the Confusion Matrix

Normalize the scores to display as proportions across rows.

In [None]:
# Draw the confusion matrix for the fitted model on the test data.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was deprecated in scikit-learn 1.0 and removed in 1.2.
# normalize='true' scales each row (true class) so its cells sum to 1.
ConfusionMatrixDisplay.from_estimator(
    nb, X_tst, y_tst,
    display_labels=spam,
    normalize='true')

## Calculate Mean Accuracy on Testing Data

In [None]:
# Report the model's score on the held-out testing data, formatted as a
# percentage with two decimal places
print('Accuracy on testing data: {:.2%}'.format(nb.score(X_tst, y_tst)))

# CLEAN UP

- If desired, clear the results with Cell > All Output > Clear. 
- Save your work by selecting File > Save and Checkpoint.
- Shut down the Python kernel and close the file by selecting File > Close and Halt.