# 算法实现

In [18]:
import numpy as np


In [1]:
def load_dataset():
    """Return the hard-coded toy corpus together with its labels.

    Returns:
        A ``(posting_list, class_vector)`` pair. ``posting_list`` holds six
        tokenized posts (each a list of word strings) and ``class_vector``
        holds the parallel labels, where 1 marks an abusive post and 0 a
        non-abusive one.
    """
    # Each entry pairs a label (1 = abusive, 0 = not) with the post's tokens.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    posting_list = [tokens for _, tokens in labelled_posts]
    class_vector = [label for label, _ in labelled_posts]
    return posting_list, class_vector

In [12]:
def create_vocabulary_list(dataset):
    """Collect every distinct word that appears in any document.

    Args:
        dataset: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing each distinct word exactly once. The list is built
        from a set, so its ordering is unspecified.
    """
    distinct_words = set().union(*dataset)
    return list(distinct_words)

In [15]:
def word_to_vector(vocab_list, input_set):
    """Encode a document as a 0/1 word-presence vector over a vocabulary.

    Args:
        vocab_list: Sequence of vocabulary words; position ``i`` of the
            result corresponds to ``vocab_list[i]``. Words must be hashable.
        input_set: Iterable of words making up the document.

    Returns:
        A list of ints with ``len(vocab_list)`` entries: 1 where the
        vocabulary word occurs in ``input_set``, 0 elsewhere. Words that are
        not in the vocabulary are ignored. If a word is repeated in
        ``vocab_list`` only its first position is set, matching
        ``list.index``.
    """
    # Build the word -> position lookup once instead of scanning the list
    # twice (`in` + `.index`) for every input word. setdefault keeps the
    # first position of a repeated vocabulary word.
    first_position = {}
    for position, word in enumerate(vocab_list):
        first_position.setdefault(word, position)

    return_vector = [0] * len(vocab_list)
    for word in input_set:
        position = first_position.get(word)
        if position is not None:
            return_vector[position] = 1
    return return_vector

In [16]:
# Load the toy corpus, derive its vocabulary, and display the 0/1
# word-presence vector of the first post over that vocabulary.
list_of_posts, list_classes = load_dataset()
vocabulary = create_vocabulary_list(list_of_posts)
word_to_vector(vocabulary, list_of_posts[0])

[1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0]

In [31]:
def train_naive_bayes(train_matrix, train_category):
    """Estimate per-class log word probabilities and the abusive-class prior.

    Args:
        train_matrix: Non-empty sequence of equal-length numeric rows, one
            per document (e.g. the 0/1 vectors from ``word_to_vector``).
        train_category: Label for each row; rows labelled 1 are counted as
            abusive, every other label is counted as class 0.

    Returns:
        A tuple ``(p0_vec, p1_vec, p_abusive)``: for class 0 and class 1 the
        array of log(smoothed word count / smoothed class total), and the
        fraction of documents labelled abusive (sum of labels / doc count).
    """
    doc_count = len(train_matrix)
    vocab_size = len(train_matrix[0])
    prior_abusive = np.sum(train_category) / doc_count

    # Index 0 accumulates class 0, index 1 accumulates class 1.
    # Word counts start at one so that a word never seen in a class does not
    # end up with a zero count; the class totals start at 2.
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    class_totals = [2, 2]

    for doc_index, row in enumerate(train_matrix):
        bucket = 1 if train_category[doc_index] == 1 else 0
        word_counts[bucket] += row
        class_totals[bucket] += sum(row)

    # Work in log space so the classifier can add scores instead of
    # multiplying many small probabilities.
    log_p0 = np.log(word_counts[0] / class_totals[0])
    log_p1 = np.log(word_counts[1] / class_totals[1])
    return log_p0, log_p1, prior_abusive

In [26]:
# Encode every post as a word-presence vector, one row per post, in the
# same order as list_of_posts (and therefore as list_classes).
train_matrix = []
for docs in list_of_posts:
    train_matrix.append(word_to_vector(vocabulary, docs))

In [27]:
# Fit the model: per-class log word probabilities plus the abusive prior.
p0_vec, p1_vec, p_abusive = train_naive_bayes(train_matrix, list_classes)

In [28]:
# Display the estimated fraction of training posts labelled abusive.
p_abusive

0.5

In [32]:
def classify_naive_bayes(vector, p0_vector, p1_vector, p_abusive):
    """Classify a document vector by comparing the two class log-scores.

    Args:
        vector: Word vector of the document; multiplied element-wise with the
            log-probability vectors, so it must broadcast against them
            (e.g. a NumPy array of the same length).
        p0_vector: Log word probabilities for class 0 (non-abusive).
        p1_vector: Log word probabilities for class 1 (abusive).
        p_abusive: Prior probability of class 1.

    Returns:
        0 if the class-0 score is strictly greater than the class-1 score,
        otherwise 1 (ties go to class 1).
    """
    # Each class score is its log-likelihood plus the log of ITS OWN prior:
    # class 1 (abusive) uses p_abusive, class 0 uses 1 - p_abusive. The
    # previous code had the two priors swapped, which only went unnoticed
    # because the toy data has p_abusive == 0.5.
    log_score_0 = np.sum(vector * p0_vector) + np.log(1 - p_abusive)
    log_score_1 = np.sum(vector * p1_vector) + np.log(p_abusive)
    return 0 if log_score_0 > log_score_1 else 1

In [37]:
def test():
    """Train on the toy corpus and classify the post ['stupid', 'garbage'].

    Returns:
        The label produced by ``classify_naive_bayes`` for that post
        (1 = abusive, 0 = not abusive).
    """
    posts, labels = load_dataset()
    vocab = create_vocabulary_list(posts)
    feature_rows = [word_to_vector(vocab, post) for post in posts]
    log_p0, log_p1, prior_abusive = train_naive_bayes(feature_rows, labels)
    # The classifier multiplies the query element-wise with the
    # log-probability arrays, so it is converted to an array first.
    query = np.array(word_to_vector(vocab, ['stupid', 'garbage']))
    return classify_naive_bayes(query, log_p0, log_p1, prior_abusive)


In [38]:
# Run the end-to-end check; the result is the predicted label (1 = abusive).
test()

1