In [8]:
from numpy import *

In [39]:
def classify(vec_to_classify, p0vec, p1vec, pclass1):
    p1 = sum(vec_to_classify*p1vec) + log(pclass1)
    p0 = sum(vec_to_classify*p0vec) + log(1.0 - pclass1)
    if p1 > p0:
        return 1
    return 0

In [40]:
def trainNaiveBayes(training_mat, labels):
    num_docs = len(training_mat)
    num_words = len(training_mat[0])
    p_abusive = sum(labels)/float(num_docs)
    p0_num = ones(num_words)
    p1_num = ones(num_words)
    p0denom = 2.0
    p1denom = 2.0
    for i in range(num_docs):
        if labels[i] == 1:
            p1_num += training_mat[i]
            p1denom += sum(training_mat[i])
        else:
            p0_num += training_mat[i]
            p0denom += sum(training_mat[i])
    p1vect = log(p1_num/p1denom)
    p0vect = log(p0_num/p0denom)
    return p0vect, p1vect, p_abusive
            

In [1]:
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems','help', 'please'],
                  ['maybe','not','take','him','to','park','stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    labels = [0,1,0,1,0,1]
    return postingList, labels

In [2]:
def createVocabList(dataset):
    vocab = set([])
    for document in dataset:
        vocab = vocab | set(document)
    return list(vocab)

In [3]:
def wordsToVec(vocab, inputset):
    vector = [0]*len(vocab)
    for word in inputset:
        if word in vocab:
            vector[vocab.index(word)] = 1
        else:
            print("Word not fount in vocabulary: ", word)
    return vector

In [4]:
posts, labels = loadDataSet()

In [5]:
my_vocab = createVocabList(posts)

In [6]:
my_vocab

['help',
 'my',
 'mr',
 'food',
 'buying',
 'stupid',
 'posting',
 'licks',
 'problems',
 'park',
 'has',
 'maybe',
 'garbage',
 'how',
 'worthless',
 'cute',
 'ate',
 'steak',
 'take',
 'dog',
 'so',
 'I',
 'to',
 'love',
 'please',
 'is',
 'quit',
 'stop',
 'dalmation',
 'flea',
 'not',
 'him']

In [7]:
wordsToVec(my_vocab,posts[0])

[1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0]

In [10]:
train_mat = []
for i in posts:
    train_mat.append(wordsToVec(my_vocab,i))

In [11]:
train_mat

[[1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1],
 [0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0],
 [0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0]]

In [35]:
p0V, p1V, pAb = trainNaiveBayes(train_mat,labels)

In [36]:
p0V

array([0.07692308, 0.15384615, 0.07692308, 0.03846154, 0.03846154,
       0.03846154, 0.03846154, 0.07692308, 0.07692308, 0.03846154,
       0.07692308, 0.03846154, 0.03846154, 0.07692308, 0.03846154,
       0.07692308, 0.07692308, 0.07692308, 0.03846154, 0.07692308,
       0.07692308, 0.07692308, 0.07692308, 0.07692308, 0.07692308,
       0.07692308, 0.03846154, 0.07692308, 0.07692308, 0.07692308,
       0.03846154, 0.11538462])

In [37]:
pAb

0.5

In [21]:
labels

[0, 1, 0, 1, 0, 1]

In [38]:
p1V

array([0.05, 0.05, 0.05, 0.1 , 0.1 , 0.2 , 0.1 , 0.05, 0.05, 0.1 , 0.05,
       0.1 , 0.1 , 0.05, 0.15, 0.05, 0.05, 0.05, 0.1 , 0.1 , 0.05, 0.05,
       0.1 , 0.05, 0.05, 0.05, 0.1 , 0.1 , 0.05, 0.05, 0.1 , 0.1 ])