<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/yykim/Naive_Bayes_Classifier_%EA%B5%AC%ED%98%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes Classifier 구현
##### Spam/Normal 분류 예시



In [None]:
import numpy as np

In [None]:
# Toy corpus: six short messages, with labels listed in the same order.
m1, m2, m3 = 'me free lottery', 'free get free you', 'you free scholarhip'
m4, m5, m6 = 'free to contact me', 'you won award', 'you ticket lottery'
mails = [m1, m2, m3, m4, m5, m6]
labels = [
    'spam',    # m1
    'spam',    # m2
    'normal',  # m3
    'normal',  # m4
    'normal',  # m5
    'spam',    # m6
]

In [None]:
# Tokenize: split every mail on single spaces, replacing the contents of the
# existing `mails` list in place.
mails[:] = [mail.split(" ") for mail in mails]
mails

[['me', 'free', 'lottery'],
 ['free', 'get', 'free', 'you'],
 ['you', 'free', 'scholarhip'],
 ['free', 'to', 'contact', 'me'],
 ['you', 'won', 'award'],
 ['you', 'ticket', 'lottery']]

In [None]:
# Prior probabilities P(spam) and P(normal), taken from the label frequencies.
p_spam = sum(label == 'spam' for label in labels) / len(labels)
p_normal = 1 - p_spam
print(p_spam, p_normal)

0.5 0.5


In [None]:
# Partition the tokenized mails by label: 'spam' goes to one list, everything
# else to the other.
spam_mails = [mail for mail, label in zip(mails, labels) if label == 'spam']
normal_mails = [mail for mail, label in zip(mails, labels) if label != 'spam']

print(spam_mails)
print(normal_mails)

[['me', 'free', 'lottery'], ['free', 'get', 'free', 'you'], ['you', 'ticket', 'lottery']]
[['you', 'free', 'scholarhip'], ['free', 'to', 'contact', 'me'], ['you', 'won', 'award']]


In [None]:
# Flatten the tokenized mails into flat token lists: all mails, spam only,
# and normal only. The vocabulary is the set of distinct tokens overall.
total_token = [token for mail in mails for token in mail]
unique_token = np.unique(total_token)

spam_total_token = [token for mail in spam_mails for token in mail]
normal_total_token = [token for mail in normal_mails for token in mail]

print(unique_token)
print(spam_total_token)
print(normal_total_token)

['award' 'contact' 'free' 'get' 'lottery' 'me' 'scholarhip' 'ticket' 'to'
 'won' 'you']
['me', 'free', 'lottery', 'free', 'get', 'free', 'you', 'you', 'ticket', 'lottery']
['you', 'free', 'scholarhip', 'free', 'to', 'contact', 'me', 'you', 'won', 'award']


In [None]:
# For every vocabulary token, count how often it occurs in each class.
# Both lists follow the order of `unique_token`.
spam_n_unique_token = [spam_total_token.count(token) for token in unique_token]
normal_n_unique_token = [normal_total_token.count(token) for token in unique_token]


In [None]:
# Additive smoothing: add 0.5 to every token count and 1 to the class's total
# token count, so tokens never seen in a class still get a non-zero probability.
p_token_in_spam = (np.array(spam_n_unique_token) + 0.5) / (sum(spam_n_unique_token) + 1)
p_token_in_normal = (np.array(normal_n_unique_token) + 0.5) / (sum(normal_n_unique_token) + 1)
p_token_in_spam


array([0.04545455, 0.04545455, 0.31818182, 0.13636364, 0.22727273,
       0.13636364, 0.04545455, 0.13636364, 0.04545455, 0.04545455,
       0.22727273])

In [None]:
# Map each vocabulary token to its log-likelihood under each class.
spam_dict_unique_token = {
    token: log_p for token, log_p in zip(unique_token, np.log(p_token_in_spam))
}
normal_dict_unique_token = {
    token: log_p for token, log_p in zip(unique_token, np.log(p_token_in_normal))
}
spam_dict_unique_token

{'award': -3.0910424533583156,
 'contact': -3.0910424533583156,
 'free': -1.1451323043030026,
 'get': -1.9924301646902063,
 'lottery': -1.4816045409242156,
 'me': -1.9924301646902063,
 'scholarhip': -3.0910424533583156,
 'ticket': -1.9924301646902063,
 'to': -3.0910424533583156,
 'won': -3.0910424533583156,
 'you': -1.4816045409242156}

In [None]:
# Joint probability P(spam) * P(free|spam) * P(lottery|spam): summed in log
# space, then mapped back to a probability with exp.
word_is_spam = np.exp(
    np.log(p_spam)
    + spam_dict_unique_token['free']
    + spam_dict_unique_token['lottery']
)
word_is_spam

0.03615702479338842

In [None]:
# Joint probability P(normal) * P(free|normal) * P(lottery|normal): summed in
# log space, then mapped back to a probability with exp.
word_is_normal = np.exp(
    np.log(p_normal)
    + normal_dict_unique_token['free']
    + normal_dict_unique_token['lottery']
)
word_is_normal

0.00516528925619835

In [None]:
# Posterior P(spam | words): the spam joint probability divided by the sum of
# both joint probabilities.
word_is_spam/(word_is_spam + word_is_normal)

0.8749999999999999

In [None]:
# Posterior P(normal | words): the normal joint probability divided by the sum
# of both joint probabilities.
word_is_normal/(word_is_spam + word_is_normal)

0.12500000000000008

### Class 만들기
- 이중분류

In [147]:
import numpy as np

class NaiveBayesClassifier:
    """Binary Naive Bayes classifier over pre-tokenized documents.

    The two categories are the unique values of ``labels`` in the order
    returned by ``np.unique``; "category 1" is the first of them and
    "category 2" the second.

    Args:
        docs: List of documents, each document being a list of tokens.
        labels: One label per document, in the same order as ``docs``.
        words: Tokens of the query whose category is to be predicted.
        k: Additive (Laplace-style) smoothing constant.
    """

    def __init__(self, docs=None, labels=None, words=None, k=1):
        # None sentinels instead of ``[]`` defaults: a literal list default
        # would be one object shared by every instance.
        self.docs = [] if docs is None else docs
        self.labels = [] if labels is None else labels
        self.k = k
        self.words = [] if words is None else words
        self.p_category1 = 0
        self.p_category2 = 0
        self.p_c1_post = 0
        self.p_c2_post = 0
        self.category_name = []

    @staticmethod
    def _count_tokens(docs):
        """Return a ``{token: occurrences}`` tally over every token in *docs*."""
        counts = {}
        for doc in docs:
            for token in doc:
                counts[token] = counts.get(token, 0) + 1
        return counts

    def _log_likelihood(self, counts, total, word):
        """Return the smoothed log P(word | category).

        *counts* is the category's token tally and *total* its token count.
        Tokens missing from *counts* are treated as having count 0, so query
        words outside the training vocabulary do not raise ``KeyError``.
        """
        # NOTE: the denominator adds 2 * k, not k * vocabulary size, so the
        # per-category token probabilities do not sum to 1. The formula is
        # kept as-is so previously computed results stay the same.
        numerator = np.float64(counts.get(word, 0) + self.k)
        denominator = np.float64(total + 2 * self.k)
        return np.log(numerator / denominator)

    def _cal_prior(self):
        """Compute and store the prior probability of each category.

        Returns:
            ``{category: prior}`` for the two categories.

        Raises:
            ValueError: If ``labels`` does not contain exactly two distinct
                values.
        """
        # Use the instance's own labels; list() lets array-like labels
        # support ``.count``.
        labels = list(self.labels)
        self.category_name = np.unique(labels)
        if len(self.category_name) != 2:
            raise ValueError(
                "NaiveBayesClassifier needs exactly two distinct labels, got "
                f"{len(self.category_name)}"
            )
        self.p_category1 = labels.count(self.category_name[0]) / len(labels)
        self.p_category2 = 1 - self.p_category1
        return {
            self.category_name[0]: self.p_category1,
            self.category_name[1]: self.p_category2,
        }

    def _cal_posterior(self):
        """Compute and store the posterior of each category given ``words``.

        Returns:
            ``{category: posterior}`` for the two categories.

        Raises:
            ValueError: If ``docs`` and ``labels`` differ in length, or the
                labels do not contain exactly two distinct values.
        """
        self._cal_prior()
        labels = list(self.labels)
        if len(self.docs) != len(labels):
            raise ValueError(
                f"docs and labels must have the same length "
                f"({len(self.docs)} != {len(labels)})"
            )

        # Split the documents by category, then tally tokens per category.
        first_category = self.category_name[0]
        docs_1 = [doc for doc, label in zip(self.docs, labels) if label == first_category]
        docs_2 = [doc for doc, label in zip(self.docs, labels) if label != first_category]
        c1_counts = self._count_tokens(docs_1)
        c2_counts = self._count_tokens(docs_2)
        c1_total = sum(c1_counts.values())
        c2_total = sum(c2_counts.values())

        # Accumulate log prior + log likelihoods; working in log space avoids
        # multiplying many small probabilities directly.
        log_joint_c1 = np.log(self.p_category1)
        log_joint_c2 = np.log(self.p_category2)
        for word in self.words:
            log_joint_c1 += self._log_likelihood(c1_counts, c1_total, word)
            log_joint_c2 += self._log_likelihood(c2_counts, c2_total, word)

        # Normalise the two joint probabilities so they sum to 1.
        joint_c1 = np.exp(log_joint_c1)
        joint_c2 = np.exp(log_joint_c2)
        self.p_c1_post = joint_c1 / (joint_c1 + joint_c2)
        self.p_c2_post = joint_c2 / (joint_c1 + joint_c2)
        return {
            self.category_name[0]: self.p_c1_post,
            self.category_name[1]: self.p_c2_post,
        }

    def classify(self):
        """Return the category with the larger posterior (ties go to category 1)."""
        # _cal_posterior already refreshes the priors.
        self._cal_posterior()
        if self.p_c1_post >= self.p_c2_post:
            return self.category_name[0]
        return self.category_name[1]


In [148]:
# Build a classifier over the tokenized mails and their labels, querying the
# words 'lottery' and 'free' with smoothing constant k=0.5.
nbc = NaiveBayesClassifier(docs=mails, labels=labels, words=['lottery', 'free'], k=0.5)

In [149]:
# Predicted category for the query words (the one with the larger posterior).
nbc.classify()

'spam'

In [150]:
# Posterior probability of each category for the query words, as a dict.
nbc._cal_posterior()

{'normal': 0.12500000000000008, 'spam': 0.8749999999999999}