# Text Classification
*Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. Please check the pdf file for more details.*

In this exercise you will:
    
- implement a of spam classifier with **Naive Bayes method** for real world email messages
- learn the **training and testing phase** for Naive Bayes classifier  
- get an idea of the **precision-recall** tradeoff

In [40]:
# some basic imports
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
import numpy as np

# ham_train contains the occurrences of each word in ham emails. 1-by-N vector
ham_train = np.loadtxt('ham_train.csv', delimiter=',')
# spam_train contains the occurrences of each word in spam emails. 1-by-N vector
spam_train = np.loadtxt('spam_train.csv', delimiter=',')
# N is the size of vocabulary.
N = ham_train.shape[0]
# There 9034 ham emails and 3372 spam emails in the training samples
num_ham_train = 9034
num_spam_train = 3372
# Do smoothing
x = np.vstack([ham_train, spam_train]) + 1

# ham_test contains the occurences of each word in each ham test email. P-by-N vector, with P is number of ham test emails.
i,j,ham_test = np.loadtxt('ham_test.txt').T
i = i.astype(np.int)
j = j.astype(np.int)
ham_test_tight = scipy.sparse.coo_matrix((ham_test, (i - 1, j - 1)))
ham_test = scipy.sparse.csr_matrix((ham_test_tight.shape[0], ham_train.shape[0]))
ham_test[:, 0:ham_test_tight.shape[1]] = ham_test_tight
# spam_test contains the occurences of each word in each spam test email. Q-by-N vector, with Q is number of spam test emails.
i,j,spam_test = np.loadtxt('spam_test.txt').T
i = i.astype(np.int)
j = j.astype(np.int)
spam_test_tight = scipy.sparse.csr_matrix((spam_test, (i - 1, j - 1)))
spam_test = scipy.sparse.csr_matrix((spam_test_tight.shape[0], spam_train.shape[0]))
spam_test[:, 0:spam_test_tight.shape[1]] = spam_test_tight


## Now let's implement a ham/spam email classifier. Please refer to the PDF file for details

In [42]:
from likelihood import likelihood
# TODO
# Implement a ham/spam email classifier, and calculate the accuracy of your classifier

# Hint: you can directly do matrix multiply between scipy.sparse.coo_matrix and numpy.array.
# Specifically, you can use sparse_matrix * np_array to do this. Note that when you use "*" operator
# between numpy array, this is typically an elementwise multiply.

# begin answer
# 打开word_map并读取所有的word和数字对应的映射
word_map = {}
for line in open("all_word_map.txt", "r"):
    word, num = line.split('\t')
    word_map[int(num) - 1] = word

In [43]:
# 计算ham和spam的四然，然后对spam中出现频率最高的进行排序并有argsort获取其对应的下标
l = likelihood(x)
ratio = l[1] / l[0]
sorted_ratio = np.argsort(ratio)[::-1][0: 10]
top_ten_word = []
for i in sorted_ratio:
    top_ten_word.append(word_map[i])
print(top_ten_word)

['nbsp', 'viagra', 'pills', 'cialis', 'voip', 'php', 'meds', 'computron', 'sex', 'ooking']


In [47]:
# 计算先验概率和似然的对数值
total = num_ham_train + num_spam_train
prior_ham = num_ham_train / total
prior_spam = num_spam_train / total
prior = np.log(np.array([prior_ham, prior_spam]))
l_log = np.log(l)
print(prior)
print(l_log)

[-0.31718499 -1.30267419]
[[-13.53418866 -13.93965377 -14.63280095 ... -13.93965377 -12.33021586
  -12.84104148]
 [-13.39939493 -13.39939493 -12.70624775 ... -13.39939493 -11.45348478
  -13.39939493]]


In [57]:
# 计算后验概率
# 这里的ham_test和spam_test是存储了测试集中每个邮件的单词出现频率的稀疏矩阵
ham_post = ham_test * l_log.T + prior
ham_miss = np.sum(ham_post[:, 0] < ham_post[:, 1])
ham_right = ham_post.shape[0] - ham_miss
print(ham_miss)
spam_post = spam_test * l_log.T + prior
spam_miss = np.sum(spam_post[:, 1] < spam_post[:, 0])
spam_right = spam_post.shape[0] - spam_miss
print(spam_miss)
TP = spam_right
FN = ham_miss
FP = spam_miss
TN = ham_right
P = TP / (TP + FP)
R = TP / (TP + FN)
print(P)
print(R)

28
31
0.9724199288256228
0.9750223015165032
