<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/HTLim/NLP_NBC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

스팸 분류기

In [210]:
# import package
import pandas as pd
import numpy as np

In [297]:
class NBC():

    def __init__(self):
        self.mails = []
        self.tokens = set({})
        self.tokens_freq = pd.DataFrame([],columns=['token', 'spam', 'ham'], dtype='int32')


    # 딕셔너리 형태로 mail input
    def input_data(self, mail, mail_class):
        self.mails.append({mail_class: mail})

    # 모든 메일 토크나이징
    def tokenizer(self):
        # 토큰분리
        for mail in self.mails:
            mail_token = set(list(mail.values())[0].split())
            self.tokens = self.tokens.union(mail_token)
        
        # 분류별 토큰 빈도 수 계산
        for token in self.tokens:
            token_dict = {"token": token, 'spam':0, 'ham':0}
            for mail in self.mails:
                if token in list(mail.values())[0]:
                    mail_type = list(mail.keys())[0]
                    count = list(mail.values())[0].count(token)
                    token_dict[mail_type] += count

            self.tokens_freq = self.tokens_freq.append(token_dict, ignore_index=True)
        
    # 확률계산(k = Laplace smoothing)
    def fit(self, k):
        self.tokens_freq['wspam'] = np.log((k + self.tokens_freq['spam'])/(2 * k + sum(self.tokens_freq['spam'])))
        self.tokens_freq['wham'] = np.log((k + self.tokens_freq['ham'])/(2 * k + sum(self.tokens_freq['ham'])))
        
    # 스팸 분류기
    def classify(self, mail):
        mail_token = mail.split()
        p_ham = 0
        p_spam = 0
        for token in mail_token:
            if token in self.tokens:
                row = self.tokens_freq.loc[self.tokens_freq['token'] == token]
                
                p_ham += float(row.wham)
                p_spam += float(row.wspam)
        
        p_ham = np.exp(p_ham)
        p_spam = np.exp(p_spam)
        
        result = p_spam / (p_ham + p_spam)
        
        return result

In [298]:
m1 = 'me free lottery'
m2 = 'free get free you'
m3 = 'you free scholarship'
m4 = 'free to contact me'
m5 = 'you won award'
m6 = 'you ticket lottery'

In [299]:
nbc = NBC()
nbc.input_data(m1, 'spam')
nbc.input_data(m2, 'spam')
nbc.input_data(m3, 'ham')
nbc.input_data(m4, 'ham')
nbc.input_data(m5, 'ham')
nbc.input_data(m6, 'spam')

In [300]:
nbc.mails

[{'spam': 'me free lottery'},
 {'spam': 'free get free you'},
 {'ham': 'you free scholarship'},
 {'ham': 'free to contact me'},
 {'ham': 'you won award'},
 {'spam': 'you ticket lottery'}]

In [301]:
nbc.tokenizer()
nbc.tokens

{'award',
 'contact',
 'free',
 'get',
 'lottery',
 'me',
 'scholarship',
 'ticket',
 'to',
 'won',
 'you'}

In [302]:
print(sum(nbc.tokens_freq['spam']))
nbc.tokens_freq

10


Unnamed: 0,token,spam,ham
0,me,1,1
1,won,0,1
2,you,2,2
3,ticket,1,0
4,award,0,1
5,contact,0,1
6,scholarship,0,1
7,lottery,2,0
8,to,0,1
9,free,3,2


In [303]:
nbc.fit(0.5)
nbc.tokens_freq

Unnamed: 0,token,spam,ham,wspam,wham
0,me,1,1,-1.99243,-1.99243
1,won,0,1,-3.091042,-1.99243
2,you,2,2,-1.481605,-1.481605
3,ticket,1,0,-1.99243,-3.091042
4,award,0,1,-3.091042,-1.99243
5,contact,0,1,-3.091042,-1.99243
6,scholarship,0,1,-3.091042,-1.99243
7,lottery,2,0,-1.481605,-3.091042
8,to,0,1,-3.091042,-1.99243
9,free,3,2,-1.145132,-1.481605


In [305]:
mail = 'me free lottery'
result = nbc.classify(mail)
print("해당 메일은 {}% 로 Spam 입니다.".format(result*100))

해당 메일은 87.5% 로 Spam 입니다.
