<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/yykim/Naive_Bayes_Classifier_%EA%B5%AC%ED%98%84_sklearn%EC%B6%94%EA%B0%80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes Classifier 구현
##### Spam/Normal 분류 예시



In [90]:
import numpy as np

In [166]:
# Toy corpus: six short mails, each paired with a spam/normal label.
m1 = 'me free lottery'
m2 = 'free get free you'
m3 = 'you free scholarhip'
m4 = 'free to contact me'
m5 = 'you won award'
m6 = 'you ticket lottery'
labels = ['spam', 'spam', 'normal', 'normal', 'normal', 'spam']

# Tokenize every mail on single spaces.
mails = [raw_mail.split(" ") for raw_mail in (m1, m2, m3, m4, m5, m6)]
mails

In [167]:
# Tokenize: split each mail on single spaces.
# Entries that are already token lists are left untouched, so running this
# cell after the mails have been tokenized no longer fails with
# "'list' object has no attribute 'split'".
for i, mail in enumerate(mails):
    if isinstance(mail, str):
        mails[i] = mail.split(" ")
mails

[['me', 'free', 'lottery'],
 ['free', 'get', 'free', 'you'],
 ['you', 'free', 'scholarhip'],
 ['free', 'to', 'contact', 'me'],
 ['you', 'won', 'award'],
 ['you', 'ticket', 'lottery']]

In [93]:
# Flatten the tokenized mails into one token list, then collect the
# distinct tokens (the vocabulary).
total_token = [token for mail in mails for token in mail]
unique_token = np.unique(total_token)


In [94]:
# Split the tokens into one list per class, following each mail's label.
spam_token = []
normal_token = []
for idx, mail in enumerate(mails):
    target = spam_token if labels[idx] == 'spam' else normal_token
    target.extend(mail)
print(spam_token)
print(normal_token)

['me', 'free', 'lottery', 'free', 'get', 'free', 'you', 'you', 'ticket', 'lottery']
['you', 'free', 'scholarhip', 'free', 'to', 'contact', 'me', 'you', 'won', 'award']


In [95]:
# Prior probabilities P(spam) and P(normal): the share of mails carrying each label.
# A class prior is a per-document quantity, so it is taken from the labels rather
# than from token totals (which would favor whichever class has longer mails).
p_spam = labels.count('spam') / len(labels)
p_normal = 1 - p_spam
print(p_spam, p_normal)

0.5 0.5


In [96]:
# For each vocabulary token (in vocabulary order), count how often it
# occurs among the spam tokens and among the normal tokens.
spam_n_unique_token = [spam_token.count(token) for token in unique_token]

normal_n_unique_token = [normal_token.count(token) for token in unique_token]


In [97]:
# Smoothed token likelihoods per class: add 0.5 to every count and 1 to the
# class's total token count (Laplace-style smoothing).
spam_counts = np.asarray(spam_n_unique_token)
normal_counts = np.asarray(normal_n_unique_token)
p_token_in_spam = (spam_counts + 0.5) / (spam_counts.sum() + 1)
p_token_in_normal = (normal_counts + 0.5) / (normal_counts.sum() + 1)
p_token_in_spam


array([0.04545455, 0.04545455, 0.31818182, 0.13636364, 0.22727273,
       0.13636364, 0.04545455, 0.13636364, 0.04545455, 0.04545455,
       0.22727273])

In [98]:
# Map every vocabulary token to its log-likelihood under each class.
spam_dict_unique_token = {
    token: log_p for token, log_p in zip(unique_token, np.log(p_token_in_spam))
}
normal_dict_unique_token = {
    token: log_p for token, log_p in zip(unique_token, np.log(p_token_in_normal))
}
spam_dict_unique_token

{'award': -3.0910424533583156,
 'contact': -3.0910424533583156,
 'free': -1.1451323043030026,
 'get': -1.9924301646902063,
 'lottery': -1.4816045409242156,
 'me': -1.9924301646902063,
 'scholarhip': -3.0910424533583156,
 'ticket': -1.9924301646902063,
 'to': -3.0910424533583156,
 'won': -3.0910424533583156,
 'you': -1.4816045409242156}

In [99]:
# Joint score P(spam) * P(free|spam) * P(lottery|spam): summed in log space,
# then mapped back with exp.
word_is_spam = np.exp(
    np.log(p_spam) + spam_dict_unique_token['free'] + spam_dict_unique_token['lottery']
)
word_is_spam

0.03615702479338842

In [100]:
# Joint score P(normal) * P(free|normal) * P(lottery|normal): summed in log
# space, then mapped back with exp.
word_is_normal = np.exp(
    np.log(p_normal) + normal_dict_unique_token['free'] + normal_dict_unique_token['lottery']
)
word_is_normal

0.00516528925619835

In [101]:
# Posterior P(spam|words): the spam joint score divided by the sum of both
# class joint scores.
word_is_spam/(word_is_spam + word_is_normal)

0.8749999999999999

In [102]:
# Posterior P(normal|words): the normal joint score divided by the sum of
# both class joint scores.
word_is_normal/(word_is_spam + word_is_normal)

0.12500000000000008

### Class 만들기
- 이중분류

In [163]:
import numpy as np

class NaiveBayesClassifier:
    """Binary Naive Bayes text classifier with optional scikit-learn back ends.

    The ``manual`` back end scores the query ``words`` against the classes
    found in ``labels``. The first label in sorted order is "category 1";
    every other label is treated as "category 2".

    Args:
        docs: Tokenized training documents (a list of token lists).
        labels: One class label per document, aligned with ``docs``.
        words: Tokens of the query to classify.
        k: Additive smoothing constant; each token likelihood is
            ``(count + k) / (class_token_total + 2 * k)``.
        package: Back end to use: ``'manual'``, ``'sklearn'`` or
            ``'sklearn_gs'``.
    """

    def __init__(self, docs=None, labels=None, words=None, k=1, package='manual'):
        # None sentinels instead of mutable defaults, so instances never
        # share the same list objects.
        self.docs = [] if docs is None else docs
        self.labels = [] if labels is None else labels
        self.k = k  # smoothing constant
        self.words = [] if words is None else words
        self.package = package
        self.p_category1 = 0
        self.p_category2 = 0
        self.p_c1_post = 0
        self.p_c2_post = 0
        self.category_name = []

    def _cal_prior(self):
        """Compute the class priors from the training labels.

        The prior of category 1 is the fraction of documents labelled with
        it; category 2 receives the remainder.

        Returns:
            dict mapping the two category names to their prior probability.

        Raises:
            ValueError: if ``docs`` and ``labels`` differ in length, or if
                fewer than two distinct labels are present.
        """
        if len(self.docs) != len(self.labels):
            raise ValueError("docs and labels must have the same length")

        # Use the instance's own labels, never a module-level variable.
        self.category_name = np.unique(self.labels)
        if len(self.category_name) < 2:
            raise ValueError("labels must contain at least two distinct classes")

        first = self.category_name[0]
        n_first = sum(1 for label in self.labels if label == first)
        self.p_category1 = n_first / len(self.labels)
        self.p_category2 = 1 - self.p_category1
        return {self.category_name[0]: self.p_category1,
                self.category_name[1]: self.p_category2}

    def _count_class_tokens(self):
        """Return token occurrence counters for category 1 and category 2."""
        from collections import Counter

        first = self.category_name[0]
        c1_counts, c2_counts = Counter(), Counter()
        for doc, label in zip(self.docs, self.labels):
            (c1_counts if label == first else c2_counts).update(doc)
        return c1_counts, c2_counts

    def _log_joint(self, prior, counts):
        """Return log(prior) plus the smoothed log-likelihood of each query word."""
        # np.float64 keeps NumPy semantics (inf/nan plus a warning) instead of
        # a ZeroDivisionError when k == 0 and the class has no tokens.
        denominator = np.float64(sum(counts.values()) + 2 * self.k)
        score = np.log(prior)
        for word in self.words:
            # Counter yields 0 for tokens never seen in training, so unseen
            # query words get the smoothed floor instead of raising KeyError.
            score += np.log((counts[word] + self.k) / denominator)
        return score

    def _cal_posterior(self):
        """Compute the posterior of both categories for the query words.

        Returns:
            dict mapping the two category names to their posterior
            probability given ``self.words``.

        Raises:
            ValueError: propagated from ``_cal_prior`` for invalid training data.
        """
        self._cal_prior()
        c1_counts, c2_counts = self._count_class_tokens()

        log_c1 = self._log_joint(self.p_category1, c1_counts)
        log_c2 = self._log_joint(self.p_category2, c2_counts)

        # Subtract the larger log score before exponentiating so long queries
        # do not underflow both scores to 0 (which would give 0/0 = nan).
        shift = max(log_c1, log_c2)
        weight_c1 = np.exp(log_c1 - shift)
        weight_c2 = np.exp(log_c2 - shift)

        self.p_c1_post = weight_c1 / (weight_c1 + weight_c2)
        self.p_c2_post = weight_c2 / (weight_c1 + weight_c2)
        return {self.category_name[0]: self.p_c1_post,
                self.category_name[1]: self.p_c2_post}

    @staticmethod
    def _build_pipeline():
        """Return an unfitted CountVectorizer -> TfidfTransformer -> MultinomialNB pipeline."""
        from sklearn.pipeline import Pipeline
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.naive_bayes import MultinomialNB

        return Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()), ])

    def _joined_docs(self):
        """Return the training documents as space-joined strings."""
        # Local result only: self.docs stays tokenized so repeated calls and
        # the manual back end keep working on the original data.
        return [doc if isinstance(doc, str) else " ".join(doc) for doc in self.docs]

    def _joined_query(self):
        """Return the query words as a single-element list holding one string."""
        if isinstance(self.words, str):
            return [self.words]
        return [" ".join(self.words)]

    def _use_plain_sklearn(self):
        """Fit the scikit-learn pipeline and predict the label of the query.

        Returns:
            The array returned by the pipeline's ``predict`` for the query.
        """
        text_clf = self._build_pipeline()
        text_clf = text_clf.fit(self._joined_docs(), self.labels)
        return text_clf.predict(self._joined_query())

    def _use_gs_sklearn(self, parameters_dict):
        """Grid-search the scikit-learn pipeline and predict the query label.

        Args:
            parameters_dict: Parameter grid passed to ``GridSearchCV``, e.g.
                ``{'vect__ngram_range': [(1, 1), (1, 2)],
                'tfidf__use_idf': (True, False)}``.

        Returns:
            Tuple of (parameters of the best estimator, predicted label array).
        """
        from sklearn.model_selection import GridSearchCV

        # No separate pre-fit of the pipeline: the search fits its own
        # candidates and the prediction comes from the fitted search object.
        gs_clf = GridSearchCV(self._build_pipeline(), parameters_dict,
                              n_jobs=-1, verbose=2)
        gs_clf = gs_clf.fit(self._joined_docs(), self.labels)
        return gs_clf.best_estimator_.get_params(), gs_clf.predict(self._joined_query())

    def classify(self):
        """Classify the query words with the configured back end.

        Returns:
            ``'manual'``: the name of the category with the larger posterior
            (category 1 wins ties). ``'sklearn'``: the predicted label array.
            ``'sklearn_gs'``: tuple of (best parameters, predicted label
            array); the parameter grid is read interactively via ``input``.
            Any other ``package`` value prints a message and returns None.
        """
        if self.package == 'manual':
            # _cal_posterior already refreshes the priors.
            self._cal_posterior()
            if self.p_c1_post >= self.p_c2_post:
                return self.category_name[0]
            return self.category_name[1]

        elif self.package == 'sklearn':
            return self._use_plain_sklearn()

        elif self.package == 'sklearn_gs':
            from ast import literal_eval
            parameters_dict = input("조정할 파라미터 딕셔너리를 입력해주세요: ")
            # literal_eval only accepts Python literals, so the typed text
            # cannot execute arbitrary code.
            parameters_dict = literal_eval(parameters_dict)
            return self._use_gs_sklearn(parameters_dict)

        else:
            print("지원하지 않는 방식입니다.")


In [121]:
# Manual back end (the default `package`) with smoothing constant k=0.5,
# querying the tokens 'lottery' and 'free'.
nbc = NaiveBayesClassifier(docs=mails, labels=labels, words=['lottery', 'free'], k=0.5)

In [122]:
# Per-class posterior probabilities for the query words.
nbc._cal_posterior()

{'normal': 0.12500000000000008, 'spam': 0.8749999999999999}

In [123]:
# Name of the class with the larger posterior.
nbc.classify()

'spam'

In [124]:
# Same inputs, routed through the plain scikit-learn pipeline back end.
nbc1 = NaiveBayesClassifier(docs=mails, labels=labels, words=['lottery', 'free'], k=0.5, package='sklearn')

In [125]:
# Fit the pipeline on the mails and predict the label of the query.
nbc1.classify()

array(['spam'], dtype='<U6')

In [168]:
# Same inputs, routed through the grid-search scikit-learn back end.
nbc2 = NaiveBayesClassifier(docs=mails, labels=labels, words=['lottery', 'free'], k=0.5, package='sklearn_gs')

In [169]:
nbc2.classify()  # raises an error because the dataset is too small

조정할 파라미터 딕셔너리를 입력해주세요: {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False)}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


ValueError: ignored

In [129]:
# Second attempt: a fresh grid-search instance built from the same inputs.
nbc2 = NaiveBayesClassifier(docs=mails, labels=labels, words=['lottery', 'free'], k=0.5, package='sklearn_gs')

In [130]:
# NOTE(review): the recorded output of this run is a TypeError; the cause
# is not visible in the saved output.
nbc2.classify()

조정할 파라미터 딕셔너리를 입력해주세요: {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False)}


TypeError: ignored

In [171]:
# Enlarged toy corpus: the same six mails and labels, repeated twice.
m1 = 'me free lottery'
m2 = 'free get free you'
m3 = 'you free scholarhip'
m4 = 'free to contact me'
m5 = 'you won award'
m6 = 'you ticket lottery'
labels = ['spam', 'spam', 'normal', 'normal', 'normal', 'spam', 'spam', 'spam', 'normal', 'normal', 'normal', 'spam']

# Tokenize every mail on single spaces.
mails = [raw_mail.split(" ") for raw_mail in (m1, m2, m3, m4, m5, m6) * 2]
mails

[['me', 'free', 'lottery'],
 ['free', 'get', 'free', 'you'],
 ['you', 'free', 'scholarhip'],
 ['free', 'to', 'contact', 'me'],
 ['you', 'won', 'award'],
 ['you', 'ticket', 'lottery'],
 ['me', 'free', 'lottery'],
 ['free', 'get', 'free', 'you'],
 ['you', 'free', 'scholarhip'],
 ['free', 'to', 'contact', 'me'],
 ['you', 'won', 'award'],
 ['you', 'ticket', 'lottery']]

In [172]:
# Retry the grid-search back end after enlarging the dataset.
nbc2 = NaiveBayesClassifier(docs=mails, labels=labels, words=['lottery', 'free'], k=0.5, package='sklearn_gs')

In [173]:
# Run the grid search; the result is a (best parameters, prediction) tuple.
nbc2.classify()

조정할 파라미터 딕셔너리를 입력해주세요: {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False)}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.1s finished


({'clf': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'clf__alpha': 1.0,
  'clf__class_prior': None,
  'clf__fit_prior': True,
  'memory': None,
  'steps': [('vect',
    CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                    dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                    lowercase=True, max_df=1.0, max_features=None, min_df=1,
                    ngram_range=(1, 1), preprocessor=None, stop_words=None,
                    strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                    tokenizer=None, vocabulary=None)),
   ('tfidf',
    TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
   ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
  'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True),
  'tfidf__norm': 'l2',
  'tfidf__smooth_idf': True,
  'tfidf__sublinear_tf': False,
  'tfidf__use_idf': True,
  'v

In [137]:
# Second toy corpus: six short sentences, each paired with one of two
# Korean labels ('긍정' / '부정') instead of spam/normal.
l1 = "I love you"
l2 = "love happy weekend"
l3 = "bore work job"
l4 = "I hate you"
l5 = "bore weekend"
l6 = "happy together"
labels_s = ['긍정', '긍정', '부정', '부정', '부정', '긍정']
bunch = [l1, l2, l3, l4, l5, l6]

In [158]:
# A corpus whose labels are not spam/normal: tokenize each sentence on
# single spaces.
ls = [sentence.split(" ") for sentence in bunch]
ls


[['I', 'love', 'you'],
 ['love', 'happy', 'weekend'],
 ['bore', 'work', 'job'],
 ['I', 'hate', 'you'],
 ['bore', 'weekend'],
 ['happy', 'together']]

In [159]:
# Manual back end on the sentiment corpus, querying 'happy' and 'weekend'.
nbc1 = NaiveBayesClassifier(docs=ls, labels=labels_s, words=['happy', 'weekend'], k=0.5)

In [160]:
# Prior probability of each of the two labels.
nbc1._cal_prior()

{'긍정': 0.5, '부정': 0.5}

In [161]:
# Per-class posterior probabilities for the query words.
nbc1._cal_posterior()

{'긍정': 0.8333333333333334, '부정': 0.1666666666666666}

In [162]:
# Name of the class with the larger posterior.
nbc1.classify()

'긍정'