# Text Classification
*Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. Please check the pdf file for more details.*

In this exercise you will:
    
- implement a spam classifier with the **Naive Bayes method** for real-world email messages
- learn the **training and testing phase** for Naive Bayes classifier  
- get an idea of the **precision-recall** tradeoff

In [1]:
# some basic imports
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# ham_train holds the occurrences of each vocabulary word in the ham training emails (length-N vector).
# The builtin `int` is used because the `np.int` alias was removed in NumPy 1.24.
ham_train = np.loadtxt('ham_train.csv', delimiter=',').astype(int)
# spam_train holds the occurrences of each vocabulary word in the spam training emails (length-N vector).
spam_train = np.loadtxt('spam_train.csv', delimiter=',').astype(int)
# N is the size of the vocabulary.
N = ham_train.shape[0]
# There are 9034 ham emails and 3372 spam emails in the training samples.
num_ham_train = 9034
num_spam_train = 3372
# Add-one smoothing of the stacked counts: row 0 is ham, row 1 is spam.
x = np.vstack([ham_train, spam_train]) + 1


def load_test_counts(path, vocab_size):
    """Load a test-set file of (row, column, count) triplets as a sparse matrix.

    Each line of the file is read as three whitespace-separated numbers with
    1-based row and column indices (they are shifted to 0-based here).

    Parameters
    ----------
    path : str
        Text file readable by ``np.loadtxt``.
    vocab_size : int
        Number of columns of the returned matrix; columns beyond the largest
        index present in the file are left as zeros.

    Returns
    -------
    scipy.sparse.csr_matrix
        float64 matrix of shape ``(largest row index, vocab_size)``.

    Raises
    ------
    ValueError
        If the file references a column index larger than ``vocab_size``.
    """
    rows, cols, counts = np.loadtxt(path).T.astype(int)
    # Building the matrix at its final shape avoids the slow slice assignment
    # into an empty CSR matrix (which triggers SparseEfficiencyWarning).
    return scipy.sparse.csr_matrix(
        (counts, (rows - 1, cols - 1)),
        shape=(rows.max(), vocab_size),
        dtype=np.float64,
    )


# ham_test holds the occurrences of each word in each ham test email: P-by-N, P = number of ham test emails.
ham_test = load_test_counts('ham_test.txt', N)
# spam_test holds the occurrences of each word in each spam test email: Q-by-N, Q = number of spam test emails.
spam_test = load_test_counts('spam_test.txt', N)




## Now let's implement a ham/spam email classifier. Please refer to the PDF file for details

In [18]:
# question 1
# Each row of p_spam / p_ham is [smoothed word count, smoothed denominator]
# for one vocabulary word. The denominators reuse num_spam_train /
# num_ham_train instead of repeating the literals 3372 and 9034.
p_spam = np.empty((len(spam_train), 2))
p_spam[:, 0] = spam_train + 1
p_spam[:, 1] = num_spam_train + 2

p_ham = np.empty((len(ham_train), 2))
p_ham[:, 0] = ham_train + 1
p_ham[:, 1] = num_ham_train + 2

# Per-word log ratio of the spam fraction to the ham fraction:
#   log(p_spam[:, 0] / p_spam[:, 1]) - log(p_ham[:, 0] / p_ham[:, 1])
# The two denominators are the same for every word, so they only shift ask1
# by a constant and do not affect the ranking below.
ask1 = (
    np.log(p_spam[:, 0])
    + np.log(p_ham[:, 1])
    - np.log(p_spam[:, 1])
    - np.log(p_ham[:, 0])
)

# argsort is ascending, so this shows the indices of the 10 words with the
# SMALLEST spam/ham log ratio.
# NOTE(review): if the question asks for the words most indicative of spam,
# the tail of the ordering (ask1.argsort()[::-1][:10]) is needed instead -
# confirm against the assignment PDF.
ask1.argsort()[:10]

array([52913, 27709,  2571, 23471, 23631,  8058, 22602, 33609, 35796,
       60120])

In [31]:
# question 2 question 4
from likelihood import likelihood

# Per-class word likelihoods computed from the smoothed training counts.
# NOTE(review): the scores below add log(prior) to counts @ l.T, which
# presumes likelihood() returns log-likelihoods with rows in the same
# [ham, spam] order as x - confirm in likelihood.py.
l = likelihood(x)

# Class priors in [ham, spam] order, estimated from the training set sizes.
total_train = num_ham_train + num_spam_train
prior = np.array([num_ham_train, num_spam_train]) / total_train
log_prior = np.log(prior)

# Score every ham test email against both classes; broadcasting adds the
# log prior to each row. An email is flagged as spam when column 1 wins.
result = ham_test @ l.T + log_prior
ham_result = result[:, 0] < result[:, 1]

# Same scoring for the spam test emails.
result2 = spam_test @ l.T + log_prior
spam_result = result2[:, 0] < result2[:, 1]

# Confusion-matrix counts, treating spam as the positive class.
fp = np.sum(ham_result)
tp = np.sum(spam_result)
fn = spam_test.shape[0] - tp
tn = ham_test.shape[0] - fp


In [33]:
# Report, one value per line and in this order: accuracy, precision, recall
# (spam is the positive class).
total = tp + tn + fp + fn
for metric in ((tp + tn) / total, tp / (tp + fp), tp / (tp + fn)):
    print(metric)

0.9857315598548972
0.9750223015165032
0.9724199288256228
