# Text Classification
*Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. Please check the pdf file for more details.*

In this exercise you will:
    
- implement a of spam classifier with **Naive Bayes method** for real world email messages
- learn the **training and testing phase** for Naive Bayes classifier  
- get an idea of the **precision-recall** tradeoff

In [3]:
# some basic imports
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [39]:
# ham_train contains the occurrences of each word in ham emails. 1-by-N vector
ham_train = np.loadtxt('ham_train.csv', delimiter=',')
# spam_train contains the occurrences of each word in spam emails. 1-by-N vector
spam_train = np.loadtxt('spam_train.csv', delimiter=',')

print(ham_train, ham_train.shape)
# N is the size of vocabulary.
N = ham_train.shape[0]
# There 9034 ham emails and 3372 spam emails in the training samples
num_ham_train = 9034
num_spam_train = 3372
# Do smoothing
x = np.vstack([ham_train, spam_train]) + 1

# ham_test contains the occurences of each word in each ham test email. P-by-N vector, with P is number of ham test emails.
i,j,ham_test = np.loadtxt('ham_test.txt').T
i = i.astype(np.int)
j = j.astype(np.int)
print(ham_test.shape)
# 按照(i-1, j-1)的坐标来填充，(i-1,j-1)代表第i个邮件的词库第j个单词的记录位置
ham_test_tight = scipy.sparse.coo_matrix((ham_test, (i - 1, j - 1)))

# 比如训练邮件里只有前300-5000个词而词库包含8000个词，后3000个0也需要加到array中
ham_test = scipy.sparse.csr_matrix((ham_test_tight.shape[0], ham_train.shape[0]))
ham_test[:, 0:ham_test_tight.shape[1]] = ham_test_tight


# spam_test contains the occurences of each word in each spam test email. Q-by-N vector, with Q is number of spam test emails.
i,j,spam_test = np.loadtxt('spam_test.txt').T
i = i.astype(np.int)
j = j.astype(np.int)
spam_test_tight = scipy.sparse.csr_matrix((spam_test, (i - 1, j - 1)))
spam_test = scipy.sparse.csr_matrix((spam_test_tight.shape[0], spam_train.shape[0]))
spam_test[:, 0:spam_test_tight.shape[1]] = spam_test_tight


[2. 1. 0. ... 1. 9. 5.] (77386,)
(353368,)




## Now let's implement a ham/spam email classifier. Please refer to the PDF file for details

In [5]:
from likelihood import likelihood
# TODO
# Implement a ham/spam email classifier, and calculate the accuracy of your classifier

# Hint: you can directly do matrix multiply between scipy.sparse.coo_matrix and numpy.array.
# Specifically, you can use sparse_matrix * np_array to do this. Note that when you use "*" operator
# between numpy array, this is typically an elementwise multiply.

# begin answer
# end answer

(array([2., 1., 0., ..., 1., 9., 5.]), array([0., 0., 1., ..., 0., 6., 0.]))

In [87]:
spam_word_frequency = spam_train / np.sum(spam_train)
ham_word_frequency = ham_train / np.sum(ham_train)

ration = spam_word_frequency/ (ham_word_frequency + 0.001)
top_10_spam_word_index = ration.argsort()[::-1][:10]

In [30]:
import re
f = open("all_word_map.txt", "r", encoding='utf-8')
dic = {}
for line in f.readlines():
    word, index = re.split(re.compile(r'\s+'), line.strip())
    dic[int(index)] = word

for index in top_10_spam_word_index:
    print(dic[index])

yous
xjynw
consulting
sunspot
yov
ous
hajia
apices
retreated
convoluted


In [85]:
ham_prior = num_ham_train / (num_ham_train + num_spam_train)
spam_prior = num_spam_train / (num_ham_train + num_spam_train)

In [77]:
ham_log_likelihood = np.log(ham_word_frequency)
spam_log_likelihood = np.log(spam_word_frequency)

In [81]:
ham_log_likelihood = ham_log_likelihood.reshape((-1,1))
spam_log_likelihood = spam_log_likelihood.reshape((-1,1))
# ham_test.shape, ham_log_likelihood.shape, spam_test.shape, spam_log_likelihood.shape
ham_test_post1 = np.matmul(ham_test.todense(), ham_log_likelihood) + np.log(ham_prior)
ham_test_post2 = np.matmul(ham_test.todense(), spam_log_likelihood) + np.log(spam_prior)
spam_test_post1 = np.matmul(spam_test.todense(), spam_log_likelihood) + np.log(spam_prior)
spam_test_post2 = np.matmul(spam_test.todense(), ham_log_likelihood) + np.log(ham_prior)

In [86]:
correct_ham_sum = ((ham_test_post1 > ham_test_post2) == True).sum()
correct_spam_sum = ((spam_test_post1 > spam_test_post2) == True).sum()

accuracy = (correct_ham_sum + correct_spam_sum) / (ham_test.shape[0] + spam_test.shape[0])
print(accuracy)

0.9869407496977025


In [91]:
confus_matrix = np.zeros((2,2))
confus_matrix[0,0] = tp = correct_spam_sum
confus_matrix[1,0] = fn = spam_test.shape[0] - correct_spam_sum

confus_matrix[0,1] = fp = ham_test.shape[0] - correct_ham_sum
confus_matrix[1,1] = tn = correct_ham_sum
print(confus_matrix)

[[1089.   19.]
 [  35. 2992.]]


In [92]:
precision = tp / (tp + fp)
recall = tp / (tp + fn)

print(precision, recall)

0.9828519855595668 0.9688612099644128
