<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/HyeonminNam/200724_NaiveBayesClassifier_for_spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes Classifier

## 스팸/정상 메일 표 만들기

In [None]:
import pandas as pd

In [None]:
# Toy training corpus: each message is paired with its label so the two
# parallel lists below cannot drift out of alignment.
_labelled_messages = [
    ('me free lottery', 'spam'),
    ('free get free you', 'spam'),
    ('you free scholarship', 'normal'),
    ('free to contact me', 'normal'),
    ('you won award', 'normal'),
    ('you ticket lottery', 'spam'),
]
text = [message for message, _ in _labelled_messages]
spam = [label for _, label in _labelled_messages]

In [None]:
# One row per message: the raw text in 'tokens', its label in 'class'.
df = pd.DataFrame(
    data={
        'tokens': text,
        'class': spam,
    },
)

In [None]:
# Display the labelled-message DataFrame (columns: tokens, class).
df

Unnamed: 0,tokens,class
0,me free lottery,spam
1,free get free you,spam
2,you free scholarship,normal
3,free to contact me,normal
4,you won award,normal
5,you ticket lottery,spam


## 데이터에서 단어, 클래스 빈도 정보 추출

In [None]:
# Flatten the corpus: every whitespace-separated word from every message,
# and every label (labels are split the same way, one token each here).
word_lst = [word for message in df['tokens'] for word in message.split()]
class_lst = [label for entry in df['class'] for label in entry.split()]

In [None]:
# Build the per-word frequency table: token_dic[word] == [spam_count, normal_count].
# spam_num / normal_num accumulate the total number of word occurrences seen
# in spam and non-spam messages respectively.
token_lst = set(word_lst)
token_dic = {token: [0, 0] for token in token_lst}
spam_num, normal_num = 0, 0
for message, label in zip(df['tokens'], df['class']):
    words = message.split()
    if label == 'spam':
        for word in words:
            token_dic[word][0] += 1
        spam_num += len(words)
    else:
        # Anything that is not labelled 'spam' is counted as normal.
        for word in words:
            token_dic[word][1] += 1
        normal_num += len(words)

In [None]:
# Per-word frequency dictionary: word -> [spam count, normal count]
print(token_dic)

# Total number of word occurrences in spam and in normal messages
print(spam_num, normal_num)

# Class priors p(spam), p(normal).
# They are computed here because this cell is the first place they are used;
# printing them without defining them first raises NameError when the
# notebook is run from top to bottom.
p_spam = class_lst.count('spam') / len(class_lst)
p_normal = class_lst.count('normal') / len(class_lst)
print(p_spam, p_normal)

{'me': [1, 1], 'free': [3, 2], 'to': [0, 1], 'contact': [0, 1], 'won': [0, 1], 'get': [1, 0], 'award': [0, 1], 'lottery': [2, 0], 'ticket': [1, 0], 'scholarship': [0, 1], 'you': [2, 2]}
10 10
0.5 0.5


## 단어별 확률 정보 데이터프레임

In [None]:
# Empty frame that will hold per-word counts and probabilities.
# The vocabulary is passed as a sorted list rather than the raw set: a set has
# no defined order (so the row order would vary between runs), and recent
# pandas versions refuse a set as `index`.
df2 = pd.DataFrame(index=sorted(token_lst), columns=['spam', 'normal', 'p_w_spam', 'p_w_normal'])

In [None]:
# Display df2 right after creation (no values filled in yet).
df2

Unnamed: 0,spam,normal,p_w_spam,p_w_normal
me,,,,
free,,,,
to,,,,
contact,,,,
won,,,,
get,,,,
award,,,,
lottery,,,,
ticket,,,,
scholarship,,,,


In [None]:
# Copy the counts from the frequency dictionary into the frame.
# A single .loc[row, column] call is used on purpose: the chained form
# df2.loc[word]['spam'] = ... assigns into an intermediate object that may be
# a copy, in which case df2 itself is left unchanged.
for word, cnt in token_dic.items():
    df2.loc[word, 'spam'] = cnt[0]
    df2.loc[word, 'normal'] = cnt[1]

In [None]:
# Display df2 with the spam / normal counts filled in.
df2

Unnamed: 0,spam,normal,p_w_spam,p_w_normal
me,1,1,,
free,3,2,,
to,0,1,,
contact,0,1,,
won,0,1,,
get,1,0,,
award,0,1,,
lottery,2,0,,
ticket,1,0,,
scholarship,0,1,,


In [None]:
# Additive (Laplace-style) smoothing so that a word with a zero count in one
# class still gets a non-zero probability.
# NOTE(review): the denominator adds 2*k to the total word count of the class;
# with word-level counts one would usually expect k * vocabulary size there.
# Confirm this is the intended formula.
k = 0.5
spam_denominator = 2 * k + spam_num
normal_denominator = 2 * k + normal_num
df2['p_w_spam'] = (df2['spam'] + k) / spam_denominator
df2['p_w_normal'] = (df2['normal'] + k) / normal_denominator

In [None]:
# Display df2 with the smoothed per-word probabilities.
df2

Unnamed: 0,spam,normal,p_w_spam,p_w_normal
me,1,1,0.136364,0.136364
free,3,2,0.318182,0.227273
to,0,1,0.0454545,0.136364
contact,0,1,0.0454545,0.136364
won,0,1,0.0454545,0.136364
get,1,0,0.136364,0.0454545
award,0,1,0.0454545,0.136364
lottery,2,0,0.227273,0.0454545
ticket,1,0,0.136364,0.0454545
scholarship,0,1,0.0454545,0.136364


In [None]:
# Store log-probabilities so that later steps can add them instead of
# multiplying many small numbers (avoids floating-point underflow).
import math
df2['log_pwspam'] = list(map(math.log, df2['p_w_spam']))
df2['log_pwnormal'] = list(map(math.log, df2['p_w_normal']))


In [None]:
# Display df2 including the log-probability columns.
df2

Unnamed: 0,spam,normal,p_w_spam,p_w_normal,log_pwspam,log_pwnormal
me,1,1,0.136364,0.136364,-1.99243,-1.99243
free,3,2,0.318182,0.227273,-1.145132,-1.481605
to,0,1,0.0454545,0.136364,-3.091042,-1.99243
contact,0,1,0.0454545,0.136364,-3.091042,-1.99243
won,0,1,0.0454545,0.136364,-3.091042,-1.99243
get,1,0,0.136364,0.0454545,-1.99243,-3.091042
award,0,1,0.0454545,0.136364,-3.091042,-1.99243
lottery,2,0,0.227273,0.0454545,-1.481605,-3.091042
ticket,1,0,0.136364,0.0454545,-1.99243,-3.091042
scholarship,0,1,0.0454545,0.136364,-3.091042,-1.99243


## 입력된 문구 활용해서 스팸 확률 계산

In [None]:
# Phrase to classify, and its whitespace-separated words.
input_token = 'free lottery'
input_token_lst = input_token.split(sep=None)

In [None]:
# Class priors p(spam) and p(normal) as label fractions, plus their logs,
# which seed the running log-scores spam_perc / normal_perc.
n_labels = len(class_lst)
p_spam = class_lst.count('spam') / n_labels
p_normal = class_lst.count('normal') / n_labels
spam_perc = math.log(p_spam)
normal_perc = math.log(p_normal)

In [None]:
# Add each input word's log-likelihood to the log prior of each class, then
# exponentiate to get the unnormalised class scores.
# The accumulators restart from the log priors here so that re-running this
# cell does not apply exp() to values that were already exponentiated.
spam_perc, normal_perc = math.log(p_spam), math.log(p_normal)
for x in input_token_lst:
    # A word that never appeared in training has no row in df2; skip it
    # rather than fail with KeyError.
    if x not in df2.index:
        continue
    spam_perc += df2.loc[x, 'log_pwspam']
    normal_perc += df2.loc[x, 'log_pwnormal']
spam_perc = math.exp(spam_perc)
normal_perc = math.exp(normal_perc)

In [None]:
# Final unnormalised score for the spam class
spam_perc

0.03615702479338842

In [None]:
# Final unnormalised score for the normal class
normal_perc

0.00516528925619835

In [None]:
# Normalise the two class scores so that they sum to 1.
total_score = spam_perc + normal_perc
word_spam_p = spam_perc / total_score
word_normal_p = normal_perc / total_score

In [None]:
# Report both posteriors as percentages (spam first, then normal).
for template, probability in (
    ('스팸 확률: {}%', word_spam_p),
    ('정상 확률: {}%', word_normal_p),
):
    print(template.format(probability * 100))

스팸 확률: 87.49999999999999%
정상 확률: 12.500000000000009%
