<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/jisang/10_Naive_Bayes_Multi_Classifier_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Naive Bayes Calssifier를 이용한 다중 문항 분류**

## **1. Naive Bayes Classifier 다중 분류 구현**
**메일 필터링**

### **3-1. 데이터 전처리**

In [31]:
# 메일과 메일의 종류 데이터
mail = ['me free lottery',
        'free get free you',
        'you free scholarship',
        'free to contact me',
        'you won award',
        'you ticket lottery']

mail_type = ["spam", "spam", "normal", "normal", "normal", "spam"]

In [32]:
# 각 문장 토큰화
lines = []
tokens = []

for i in range(len(mail)):
    lines.append(mail[i].split(" "))
    # 단어 뭉치 생성
    for word in lines[i]:
        tokens.append(word)
tokens = list(set(tokens))

# 메일의 종류
types = list(set(mail_type))

lines, tokens, types

([['me', 'free', 'lottery'],
  ['free', 'get', 'free', 'you'],
  ['you', 'free', 'scholarship'],
  ['free', 'to', 'contact', 'me'],
  ['you', 'won', 'award'],
  ['you', 'ticket', 'lottery']],
 ['me',
  'you',
  'scholarship',
  'ticket',
  'free',
  'to',
  'get',
  'lottery',
  'award',
  'contact',
  'won'],
 ['normal', 'spam'])

In [33]:
# 메일 분류 확인
import pandas as pd

mail_classify = {"메일" : lines, "분류" : mail_type}
df = pd.DataFrame(mail_classify)

df

Unnamed: 0,메일,분류
0,"[me, free, lottery]",spam
1,"[free, get, free, you]",spam
2,"[you, free, scholarship]",normal
3,"[free, to, contact, me]",normal
4,"[you, won, award]",normal
5,"[you, ticket, lottery]",spam


In [34]:
# 토큰별 분류 카운트
import numpy as np

type_matrix = []

for tokens_idx in range(len(tokens)):
    type_count = [0 for a in range(len(types))]
    for type_idx in range(len(types)):
        for line_idx, label in enumerate(mail_classify["분류"]):
            if label == types[type_idx]:
                if tokens[tokens_idx] in lines[line_idx]:
                    type_count[type_idx] += lines[line_idx].count(tokens[tokens_idx])
    type_matrix.append(type_count)

matrix_table = pd.DataFrame(type_matrix, index=tokens, columns=types)
matrix_table

Unnamed: 0,normal,spam
me,1,1
you,2,2
scholarship,1,0
ticket,0,1
free,2,3
to,1,0
get,0,1
lottery,0,2
award,1,0
contact,1,0


In [35]:
# 사전확률 계산
prior_prob = []

for type_idx in types:
    count_type = 0
    for i in range(len(lines)):
        if type_idx == mail_classify['분류'][i]:
            count_type += 1

    prior_prob.append(count_type/len(lines))

prior_prob, types

([0.5, 0.5], ['normal', 'spam'])

In [36]:
# Laplace Smoothing
laplace = []
k = 0.5

for i in range(len(tokens)):
    laplace_tmp = []
    for j in range(len(types)):
        laplace_tmp.append((k+type_matrix[i][j])/(2*k+matrix_table[types[j]].sum())*100)
    laplace.append(laplace_tmp)

laplace_type = ["P(w|{})".format(i) for i in types]
laplace_type, laplace

(['P(w|normal)', 'P(w|spam)'],
 [[13.636363636363635, 13.636363636363635],
  [22.727272727272727, 22.727272727272727],
  [13.636363636363635, 4.545454545454546],
  [4.545454545454546, 13.636363636363635],
  [22.727272727272727, 31.818181818181817],
  [13.636363636363635, 4.545454545454546],
  [4.545454545454546, 13.636363636363635],
  [4.545454545454546, 22.727272727272727],
  [13.636363636363635, 4.545454545454546],
  [13.636363636363635, 4.545454545454546],
  [13.636363636363635, 4.545454545454546]])

In [37]:
# 로그 이용 언더 플로우
log = []

for i in range(len(tokens)):
    log_tmp = []
    for j in range(len(types)):
        log_tmp.append(np.log(laplace[i][j]/100))
    log.append(log_tmp)
log_type = ["Log(P(w|{}))".format(i) for i in types]
log_type, log

(['Log(P(w|normal))', 'Log(P(w|spam))'],
 [[-1.9924301646902063, -1.9924301646902063],
  [-1.4816045409242156, -1.4816045409242156],
  [-1.9924301646902063, -3.0910424533583156],
  [-3.0910424533583156, -1.9924301646902063],
  [-1.4816045409242156, -1.1451323043030026],
  [-1.9924301646902063, -3.0910424533583156],
  [-3.0910424533583156, -1.9924301646902063],
  [-3.0910424533583156, -1.4816045409242156],
  [-1.9924301646902063, -3.0910424533583156],
  [-1.9924301646902063, -3.0910424533583156],
  [-1.9924301646902063, -3.0910424533583156]])

In [38]:
word_df = pd.DataFrame(type_matrix, index=tokens, columns=types)
laplace_df = pd.DataFrame(laplace, index=tokens, columns=laplace_type)
log_df = pd.DataFrame(log, index=tokens, columns=log_type)

df = pd.concat((word_df, laplace_df, log_df), axis=1)
df

Unnamed: 0,normal,spam,P(w|normal),P(w|spam),Log(P(w|normal)),Log(P(w|spam))
me,1,1,13.636364,13.636364,-1.99243,-1.99243
you,2,2,22.727273,22.727273,-1.481605,-1.481605
scholarship,1,0,13.636364,4.545455,-1.99243,-3.091042
ticket,0,1,4.545455,13.636364,-3.091042,-1.99243
free,2,3,22.727273,31.818182,-1.481605,-1.145132
to,1,0,13.636364,4.545455,-1.99243,-3.091042
get,0,1,4.545455,13.636364,-3.091042,-1.99243
lottery,0,2,4.545455,22.727273,-3.091042,-1.481605
award,1,0,13.636364,4.545455,-1.99243,-3.091042
contact,1,0,13.636364,4.545455,-1.99243,-3.091042


In [39]:
# 스팸 필터링 : 입력값
check_list = "free lottery"

check_token = []

check_token.append(check_list.split(" "))

check_token

[['free', 'lottery']]

In [110]:
# 입력 토큰 로그들의 합
import math

filter = []
tmp = []
for i in range(len(check_token[0])):
    tmp.append(df[df.index==check_token[0][i]][("Log(P(w|{}))".format(i) for i in types)].sum())

for i in range(len(types)):
    filter.append(math.exp(sum(tmp[i]) + np.log(prior_prob[i])))

prob = []

for i in range(len(types)):
    prob.append(filter[i] / sum(filter))

filter, prob

([0.03615702479338842, 0.00516528925619835],
 [0.8749999999999999, 0.12500000000000008])

In [117]:
for i in range(len(check_token[0])):
    print("{}라는 토큰이 있는 메일이 스팸일 확률 : {:.2f}%".format(check_token[0][i], prob[i]*100))

free라는 토큰이 있는 메일이 스팸일 확률 : 87.50%
lottery라는 토큰이 있는 메일이 스팸일 확률 : 12.50%
