<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/HTLim/NLP_Multi_NBC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

스팸 분류기

In [21]:
# import package
import pandas as pd
import numpy as np
from collections import defaultdict

In [273]:
class NBC():

    def __init__(self):
        self.mails = []
        self.category_len = 0
        self.tokens = set({})
        self.tokens_freq = pd.DataFrame([],columns=['token'], dtype='int32')


    # 딕셔너리 형태로 mail input
    def input_data(self, mail, mail_class):
        self.mails.append({mail_class: mail})

    # 모든 메일 토크나이징
    def tokenizer(self):
        # 토큰분리
        for mail in self.mails:
            mail_token = set(list(mail.values())[0].split())
            self.tokens = self.tokens.union(mail_token)
        
        # 분류별 토큰 빈도 수 계산
        for token in self.tokens:
            token_dict = defaultdict(int, token=token)
            for mail in self.mails:
                if token in list(mail.values())[0]:
                    mail_type = list(mail.keys())[0]
                    count = list(mail.values())[0].count(token)
                    token_dict[mail_type] += count

            self.tokens_freq = self.tokens_freq.append(token_dict, ignore_index=True)

        # NaN 값 0 으로 채우기  
        self.tokens_freq = self.tokens_freq.fillna(0)

    # 확률계산(k = Laplace smoothing)
    def fit(self, k):
        columns = list(self.tokens_freq.columns) 
        self.category_len = len(columns)
        for column in columns[1:]:
            self.tokens_freq['w'+column] = np.log((k + self.tokens_freq[column])/(2 * k + sum(self.tokens_freq[column])))
            # self.tokens_freq['w'+self.pos] = np.log((k + self.tokens_freq[self.pos])/(2 * k + sum(self.tokens_freq[self.pos])))
        
    # 스팸 분류기
    def classify(self, mail):
        mail_token = mail.split()
        classifier = defaultdict(float)
        columns = list(self.tokens_freq.columns)[1:self.category_len]

        for token in mail_token:
            if token in self.tokens:
                row = self.tokens_freq.loc[self.tokens_freq['token'] == token]
                for column in columns:
                    classifier[column] += float(row['w'+column])
        
        percent_dict = defaultdict(float)
        for column in columns:
            percent_dict[column] = np.exp(classifier[column])
        
        # 가장 높은 확률 카테고리
        classy = max(percent_dict.keys(), key=(lambda k:percent_dict[k]))
        sum_ = sum(np.asarray(list(percent_dict.values())))
        result = percent_dict[classy] / sum_
        
        return result, classy

In [274]:
m1 = 'I love you'
m2 = 'love happy weekend'
m3 = 'bore work job'
m4 = 'I hate you'
m5 = 'bore weekend'
m6 = 'happy together'

In [275]:
nbc = NBC()
nbc.input_data(m1, 'pos')
nbc.input_data(m2, 'neutral')
nbc.input_data(m3, 'neutral')
nbc.input_data(m4, 'neg')
nbc.input_data(m5, 'neg')
nbc.input_data(m6, 'pos')

In [276]:
nbc.mails

[{'pos': 'I love you'},
 {'neutral': 'love happy weekend'},
 {'neutral': 'bore work job'},
 {'neg': 'I hate you'},
 {'neg': 'bore weekend'},
 {'pos': 'happy together'}]

In [277]:
nbc.tokenizer()
nbc.tokens

{'I',
 'bore',
 'happy',
 'hate',
 'job',
 'love',
 'together',
 'weekend',
 'work',
 'you'}

In [278]:
# print(sum(nbc.tokens_freq['spam']))
nbc.tokens_freq

Unnamed: 0,token,neutral,pos,neg
0,love,1.0,1.0,0.0
1,weekend,1.0,0.0,1.0
2,hate,0.0,0.0,1.0
3,job,1.0,0.0,0.0
4,work,1.0,0.0,0.0
5,bore,1.0,0.0,1.0
6,together,0.0,1.0,0.0
7,you,0.0,1.0,1.0
8,happy,1.0,1.0,0.0
9,I,0.0,1.0,1.0


In [279]:
nbc.fit(0.5)
nbc.tokens_freq

Unnamed: 0,token,neutral,pos,neg,wneutral,wpos,wneg
0,love,1.0,1.0,0.0,-1.540445,-1.386294,-2.484907
1,weekend,1.0,0.0,1.0,-1.540445,-2.484907,-1.386294
2,hate,0.0,0.0,1.0,-2.639057,-2.484907,-1.386294
3,job,1.0,0.0,0.0,-1.540445,-2.484907,-2.484907
4,work,1.0,0.0,0.0,-1.540445,-2.484907,-2.484907
5,bore,1.0,0.0,1.0,-1.540445,-2.484907,-1.386294
6,together,0.0,1.0,0.0,-2.639057,-1.386294,-2.484907
7,you,0.0,1.0,1.0,-2.639057,-1.386294,-1.386294
8,happy,1.0,1.0,0.0,-1.540445,-1.386294,-2.484907
9,I,0.0,1.0,1.0,-2.639057,-1.386294,-1.386294


In [282]:
mail = 'happy weekend'
result, category = nbc.classify(mail)
print("해당 메일은 {}% 로 {} 입니다.".format(round(result*100,2), category))

해당 메일은 52.43% 로 neutral 입니다.
