<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/jisang/10_Naive_Bayes_Classifier_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Document Classification**

## **1. Naive Bayes Classifier 구현**
**스팸 메일 필터링**

### **1-1. 데이터 전처리**

In [24]:
# 메일과 메일의 종류 데이터
mail = ['me free lottery',
        'free get free you',
        'you free scholarship',
        'free to contact me',
        'you won award',
        'you ticket lottery']

mail_type = ["spam", "spam", "normal", "normal", "normal", "spam"]

In [80]:
# 각 문장 토큰화
lines = []
tokens = []

for i in range(len(mail)):
    lines.append(mail[i].split(" "))
    # 단어 뭉치 생성
    for word in lines[i]:
        tokens.append(word)
tokens = list(set(tokens))

# 메일의 종류
types = list(set(mail_type))

lines, tokens, types

([['me', 'free', 'lottery'],
  ['free', 'get', 'free', 'you'],
  ['you', 'free', 'scholarship'],
  ['free', 'to', 'contact', 'me'],
  ['you', 'won', 'award'],
  ['you', 'ticket', 'lottery']],
 ['award',
  'lottery',
  'free',
  'get',
  'you',
  'scholarship',
  'to',
  'won',
  'contact',
  'ticket',
  'me'],
 ['spam', 'normal'])

In [54]:
# 메일 분류 확인
import pandas as pd

mail_classify = {"메일" : lines, "분류": mail_type}
df = pd.DataFrame(mail_classify)

df

Unnamed: 0,메일,분류
0,"[me, free, lottery]",spam
1,"[free, get, free, you]",spam
2,"[you, free, scholarship]",normal
3,"[free, to, contact, me]",normal
4,"[you, won, award]",normal
5,"[you, ticket, lottery]",spam


In [55]:
# spam, normal 메일 개수
num_spam = 0
num_norm = 0

for label in mail_classify["분류"]:
    if label == "spam":
        num_spam += 1
    if label == "normal":
        num_norm += 1

# 총 메일 개수
num_total = num_spam + num_norm

# 사전 확률 계산
spam_prior = num_spam / num_total
norm_prior = num_norm / num_total

spam_prior, norm_prior

(0.5, 0.5)

In [56]:
import numpy as np

spam_word = []
norm_word = []
total_spam = 0
total_norm = 0

for i in range(len(tokens)):
    count_spam = 0
    count_norm = 0
    for j in range(len(lines)):
        if mail_classify["분류"][j] == "spam":
            if tokens[i] in lines[j]:
                count_spam += lines[j].count(tokens[i])
        if mail_classify["분류"][j] == "normal":
            if tokens[i] in lines[j]:
                count_norm += lines[j].count(tokens[i])

    spam_word.append(count_spam)
    norm_word.append(count_norm)
    total_spam += count_spam
    total_norm += count_norm

total_spam, total_norm

(10, 10)

### **1-2. Laplace Smoothing**

In [57]:
# Laplace Smoothing
laplace_spam = []
laplace_norm = []

for i in range(len(tokens)):
    laplace_spam.append((spam_prior+spam_word[i])/(2*spam_prior+total_spam)*100)

for i in range(len(tokens)):
    laplace_norm.append((norm_prior+norm_word[i])/(2*norm_prior+total_norm)*100)

In [58]:
word_table = np.array([spam_word, norm_word, laplace_spam, laplace_norm])

df = pd.DataFrame(word_table.T, index=tokens, columns=["spam", "normal", "P(w|spam)", "P(w|normal)"])
df.sort_index(axis=0)

Unnamed: 0,spam,normal,P(w|spam),P(w|normal)
award,0.0,1.0,4.545455,13.636364
contact,0.0,1.0,4.545455,13.636364
free,3.0,2.0,31.818182,22.727273
get,1.0,0.0,13.636364,4.545455
lottery,2.0,0.0,22.727273,4.545455
me,1.0,1.0,13.636364,13.636364
scholarship,0.0,1.0,4.545455,13.636364
ticket,1.0,0.0,13.636364,4.545455
to,0.0,1.0,4.545455,13.636364
won,0.0,1.0,4.545455,13.636364


### **1-3. Log 이용**
**Log의 성질을 활용. 곱셈을 덧셈으로 변환해 Underflow를 방지함**

In [59]:
# 로그 이용 언더 플로우
log_spam = []
log_norm = []

for i in range(len(tokens)):
    log_spam.append(np.log(laplace_spam[i]/100))
    log_norm.append(np.log(laplace_norm[i]/100))

In [60]:
word_table = np.array([spam_word, norm_word, laplace_spam, laplace_norm, log_spam, log_norm])

df = pd.DataFrame(word_table.T, index=tokens, columns=["spam", "normal", "P(w|spam)", "P(w|normal)", "Log(P(w|spam))", "Log(P(w|normal))"])
df.sort_index(axis=0)

Unnamed: 0,spam,normal,P(w|spam),P(w|normal),Log(P(w|spam)),Log(P(w|normal))
award,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243
contact,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243
free,3.0,2.0,31.818182,22.727273,-1.145132,-1.481605
get,1.0,0.0,13.636364,4.545455,-1.99243,-3.091042
lottery,2.0,0.0,22.727273,4.545455,-1.481605,-3.091042
me,1.0,1.0,13.636364,13.636364,-1.99243,-1.99243
scholarship,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243
ticket,1.0,0.0,13.636364,4.545455,-1.99243,-3.091042
to,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243
won,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243


### **1-4. 스팸 확률 구하기**

In [50]:
# 스팸 필터링 : 입력값
check_list = "free lottery"

check_token = []

check_token.append(check_list.split(" "))

check_token

[['free', 'lottery']]

In [65]:
# 입력 토큰 로그들의 합
import math

spam_filter = 0
norm_filter = 0

for i in range(len(check_token[0])):
    spam_filter += df['Log(P(w|spam))'][check_token[0][i]]
    norm_filter += df['Log(P(w|normal))'][check_token[0][i]]

spam_filter, norm_filter

(0.2727272727272727, -4.572646994282531)

In [52]:
# 입력 토큰 로그합과 사전확률 로그의 합
spam_filter = math.exp(spam_filter + np.log(spam_prior))
norm_filter = math.exp(norm_filter + np.log(norm_prior))

spam_filter, norm_filter

(0.5601723263736278, 0.00516528925619835)

In [None]:
# 스팸/정상 메일일 확률
spam_prob = spam_filter / (spam_filter + norm_filter)
norm_prob = norm_filter / (spam_filter + norm_filter)

spam_prob, norm_prob

(0.8749999999999999, 0.12500000000000008)

### **1-5. 최종 결과**

In [None]:
print("{}라는 토큰이 있는 메일이 스팸일 확률 : {:.2f}%".format(check_list, spam_prob*100))
print("{}라는 토큰이 있는 메일이 정상일 확률 : {:.2f}%".format(check_list, norm_prob*100))

free lottery라는 토큰이 있는 메일이 스팸일 확률 : 87.50%
free lottery라는 토큰이 있는 메일이 정상일 확률 : 12.50%


## **2. Class를 이용한 Naive Bayes Classifier 구현**

In [52]:
import pandas as pd
import numpy as np
import math

class NBC():
    def __init__(self):
        self.lines = []
        self.tokens = []
        self.mail_classify = {}
        self.spam_prior = 0
        self.norm_prior = 0
        self.spam_word = []
        self.norm_word = []
        self.total_spam = 0
        self.total_norm = 0
        self.df = []
        self.laplace_spam = []
        self.laplace_norm = []
        self.log_spam = []
        self.log_norm = []
        self.check_token = []
        self.spam_filter = 0
        self.norm_filter = 0
        self.spam_prob = 0
        self.nrom_prob = 0

    # 메일 분류
    def classify_mail(self, mail, mail_type):
        for i in range(len(mail)):
            self.lines.append(mail[i].split(" "))
            # 단어 뭉치 생성
            for word in self.lines[i]:
                self.tokens.append(word)
        self.tokens = list(set(self.tokens))
        # 메일 분류 확인
        self.mail_classify = {"메일" : self.lines, "분류": mail_type}

        return lines, self.tokens, self.mail_classify

    # 사전 확률과 토큰별 메일 개수 계산
    def count_mail(self):
        for i in range(len(self.tokens)):
            count_spam = 0
            count_norm = 0
            num_spam = 0
            num_norm = 0
            for j in range(len(self.lines)):
                if self.mail_classify["분류"][j] == "spam":
                    num_spam += 1
                    if self.tokens[i] in self.lines[j]:
                        count_spam += self.lines[j].count(self.tokens[i])
                if self.mail_classify["분류"][j] == "normal":
                    num_norm += 1
                    if self.tokens[i] in self.lines[j]:
                        count_norm += self.lines[j].count(self.tokens[i])
            # 단어별 스팸, 정상 메일 개수 계산
            self.spam_word.append(count_spam)
            self.norm_word.append(count_norm)
            # 총 스팸, 정상 단어 개수
            self.total_spam += count_spam
            self.total_norm += count_norm
        # 사전 확률 계산
        self.norm_prior = num_norm / (num_spam + num_norm)
        self.spam_prior = num_spam / (num_spam + num_norm)

        return self.spam_prior, self.norm_prior, self.total_spam, self.total_norm

    # Laplace Smoothing 계산
    def laplace_smoothing(self, k):
        for i in range(len(self.tokens)):
            self.laplace_spam.append((k+self.spam_word[i])/(2*self.spam_prior+self.total_spam)*100)
            self.laplace_norm.append((k+self.norm_word[i])/(2*self.norm_prior+self.total_norm)*100)
        
        return self.laplace_spam, self.laplace_norm

    # 로그 이용 언더 플로우
    def log_calculate(self):
        for i in range(len(self.tokens)):
            self.log_spam.append(np.log(self.laplace_spam[i]/100))
            self.log_norm.append(np.log(self.laplace_norm[i]/100))
        # 결과 출력
        word_table = np.array([self.spam_word, self.norm_word, self.laplace_spam, self.laplace_norm, self.log_spam, self.log_norm])
        self.df = pd.DataFrame(word_table.T, index=self.tokens, columns=["spam", "normal", "P(w|spam)", "P(w|normal)", "Log(P(w|spam))", "Log(P(w|normal))"])
        
        return self.df.sort_index(axis=0)

    # 스팸 필터링 : 입력값
    def list_check(self, check_list):
        self.check_token.append(check_list.split(" "))

        return self.check_token

    # 스팸 필터링
    def spam_filtering(self):
        for i in range(len(self.check_token[0])):
            self.spam_filter += self.df['Log(P(w|spam))'][self.check_token[0][i]]
            self.norm_filter += self.df['Log(P(w|normal))'][self.check_token[0][i]]
        # 입력 토큰 로그합과 사전확률 로그의 합
        self.spam_filter = math.exp(self.spam_filter + np.log(self.spam_prior))
        self.norm_filter = math.exp(self.norm_filter + np.log(self.norm_prior))

        return self.spam_filter, self.norm_filter
    
    # 스팸/정상 메일일 확률
    def spam_probability(self):
        self.spam_prob = self.spam_filter / (self.spam_filter + self.norm_filter)
        self.norm_prob = self.norm_filter / (self.spam_filter + self.norm_filter)

        return self.spam_prob, self.norm_prob

    # 결과값 출력
    def spam_result(self):
        print("{}라는 토큰이 있는 메일이 스팸일 확률 : {:.2f}%".format(self.check_token[0], self.spam_prob*100))
        print("{}라는 토큰이 있는 메일이 정상일 확률 : {:.2f}%".format(self.check_token[0], self.norm_prob*100))

    def run(self, mail, mail_type, check_list, k):
        self.classify_mail(mail, mail_type)
        self.count_mail()
        self.laplace_smoothing(k)
        self.log_calculate()
        self.list_check(check_list)
        self.spam_filtering()
        self.spam_probability()
        self.spam_result()

In [53]:
# 메일과 메일의 종류 데이터
mail = ['me free lottery',
        'free get free you',
        'you free scholarship',
        'free to contact me',
        'you won award',
        'you ticket lottery']

mail_type = ["spam", "spam", "normal", "normal", "normal", "spam"]

check = "get me"

In [54]:
nbc = NBC()

In [55]:
nbc.run(mail, mail_type, check, 0.5)

['get', 'me']라는 토큰이 있는 메일이 스팸일 확률 : 75.00%
['get', 'me']라는 토큰이 있는 메일이 정상일 확률 : 25.00%


## **3. Naive Bayes Classifier 다중 분류 구현**
**메일 필터링**

### **3-1. 데이터 전처리**

In [40]:
# 메일과 메일의 종류 데이터
mail = ['me free lottery',
        'free get free you',
        'you free scholarship',
        'free to contact me',
        'you won award',
        'you ticket lottery',
        'lottery baseball free']

mail_type = ["spam", "spam", "normal", "normal", "normal", "spam", "sports"]

In [41]:
# 각 문장 토큰화
lines = []
tokens = []

for i in range(len(mail)):
    lines.append(mail[i].split(" "))
    # 단어 뭉치 생성
    for word in lines[i]:
        tokens.append(word)
tokens = list(set(tokens))

# 메일의 종류
types = list(set(mail_type))

lines, tokens, types

([['me', 'free', 'lottery'],
  ['free', 'get', 'free', 'you'],
  ['you', 'free', 'scholarship'],
  ['free', 'to', 'contact', 'me'],
  ['you', 'won', 'award'],
  ['you', 'ticket', 'lottery'],
  ['lottery', 'baseball', 'free']],
 ['you',
  'contact',
  'baseball',
  'me',
  'free',
  'scholarship',
  'ticket',
  'get',
  'award',
  'won',
  'lottery',
  'to'],
 ['spam', 'sports', 'normal'])

In [42]:
# 메일 분류 확인
import pandas as pd

mail_classify = {"메일" : lines, "분류" : mail_type}
df = pd.DataFrame(mail_classify)

df

Unnamed: 0,메일,분류
0,"[me, free, lottery]",spam
1,"[free, get, free, you]",spam
2,"[you, free, scholarship]",normal
3,"[free, to, contact, me]",normal
4,"[you, won, award]",normal
5,"[you, ticket, lottery]",spam
6,"[lottery, baseball, free]",sports


In [44]:
import numpy as np

type_matrix = []

for tokens_idx in range(len(tokens)):
    type_count = [0 for a in range(len(types))]
    for type_idx in range(len(types)):
        for line_idx, label in enumerate(mail_classify["분류"]):
            if label == types[type_idx]:
                if tokens[tokens_idx] in lines[line_idx]:
                    type_count[type_idx] += 1
    type_matrix.append(type_count)

matrix_table = pd.DataFrame(type_matrix, index=tokens, columns=["spam", "sports", "normal"])
matrix_table

Unnamed: 0,spam,sports,normal
you,2,0,2
contact,0,0,1
baseball,0,1,0
me,1,0,1
free,2,1,2
scholarship,0,0,1
ticket,1,0,0
get,1,0,0
award,0,0,1
won,0,0,1


In [47]:
prior_prob = []

for type_idx in range(len(types)):
    
    prior_prob.append(matrix_table[types[i]].sum())

prior_prob

[10, 10, 10]

In [None]:
# Laplace Smoothing
laplace_spam = []
laplace_norm = []

for i in range(len(tokens)):
    laplace_spam.append((spam_prior+spam_word[i])/(2*spam_prior+total_spam)*100)
    laplace_norm.append((norm_prior+norm_word[i])/(2*norm_prior+total_norm)*100)

## **4. sklearn을 활용**

### **4-1. 뉴스 데이터 다운로드**

In [16]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset = 'train', shuffle = True)
print(twenty_train.target_names) # 뉴스 카테고리 출력
print(twenty_train.data[0]) # 뉴스 데이터 출력

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have o

### **4-2. 문서 분류(파이프 라인 이용)**

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clif', MultinomialNB()), ])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [18]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.7738980350504514

### **4-3. Grid Search**

In [22]:
from sklearn.model_selection import GridSearchCV
parameters_clf = {'vect__ngram_range' : [(1, 1), (1, 2)],
                 'tfidf__use_idf' : (True, False),
}
gs_clf = GridSearchCV(text_clf, parameters_clf, n_jobs=-1, verbose=2)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
print("Best Score : {0}".format(gs_clf.best_score_))
print("Best Parameters Set : ")
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(list(best_parameters.keys())):
    print("\t{0} : {1}".format(param_name, best_parameters[param_name]))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.0min finished


Best Score : 0.8518650274101537
Best Parameters Set : 
	clif : MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
	clif__alpha : 1.0
	clif__class_prior : None
	clif__fit_prior : True
	memory : None
	steps : [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clif', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]
	tfidf : TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
	tfidf__norm : l2
	tfidf__smooth_idf : True
	tfidf__sublinear_tf : False
	tfidf__use_idf : True
	vect : Count

### **4-4. Parameter 적용**

In [23]:
import numpy as np
predicted = gs_clf.best_estimator_.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.765400955921402