# 나이브 베이즈 분류기 기본

## 2018년 1월 1일 안상호 

## 1. 모델 
[조대협의 블로그](http://bcho.tistory.com/1010)

`word`가 주어졌을 때, `Category` 예측 이진 분류

$${\displaystyle p(c_1|d)={\frac {p(c_1, d)}{p(d)}}={\frac {p(d|c_1)p(c_1)}{p(d)}}}$$
$${\displaystyle p(c_2|d)={\frac {p(d|c_2)p(c_2)}{p(d)}}}$$

### 필요한 것들

1. $P(Comedy)$, $P(Action)$
2. $P(word|Comedy)$, $P(word|Action)$

### 1.1. $P(Comedy)$, $P(Action)$

In [43]:
import pandas as pd

review = pd.read_csv("data/naive_ex.csv", encoding="utf8")

review.head()

Unnamed: 0,movie,word,Rec
0,1,"fun, couple, love, love",Comedy
1,2,"fast, furious, shoot",Action
2,3,"couple, fly, fast, fun, fun",Comedy
3,4,"furious, shoot, shoot, fun",Action
4,5,"fly, fast, shoot, love",Action


In [44]:
total = review.shape[0]
num_Comedy = sum(review["Rec"] == "Comedy")
num_Action = total - num_Comedy

print(num_Comedy,num_Action)

(2, 3)


In [45]:
from __future__ import division

P_C = num_Comedy/total
P_A = num_Action/total
print(P_C, P_A)

(0.4, 0.6)


### 1.2. $P(word|Comedy)$, $P(word|Action)$

#### $P(word|Comedy)$

$P(word, Comedy)$

- word 전처리

In [104]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords = False):
    # review_text = re.sub("Review ", "", review)
    review_text = BeautifulSoup(review).get_text() # 텍스트 추출
    review_text = re.sub(" [^a-zA-Z]", " ", review_text) # 기호문자 제거
    words = review_text.lower().split() # 소문자 변환 후 분리
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return words

In [106]:
import nltk.data
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# 토크나이즈 함수생성(리뷰->토큰)
def review_to_sentences(review, tokenizer, remove_stopwords = False):
    # 1. 문단을 문장으로 스플릿
    raw_sentences = tokenizer.tokenize(review.strip())
    
    # 2. 문장들을 빈리스트에 하나씩 추가
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0: # 비어있는 문장은 스킵
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences # (each sentence is a list of words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [110]:
sentences = []

print("Parsing sentences from review_set")
for rev in review["rev_text"]:
    sentences += review_to_sentences(rev, tokenizer)

Parsing sentences from review_set


AttributeError: 'float' object has no attribute 'strip'

[u'fun', u'couple', u'love', u'love']

In [86]:

Comedy_df = review[review["Rec"] == "Comedy"].copy()

word_num = 0
word_count_dict = {}
for i in Comedy_df.index:
    a = sentences[i]
        
    for w in set(a):
        try:
            word_count_dict[w] += a.count(w)
        except:
            word_count_dict[w] = a.count(w)
#         print(w,a.count(w))    
    word_num += len(a)

print(word_count_dict, word_num)

({u'fun': 3, u'fly': 1, u'couple': 2, u'love': 2, u'fast': 1}, 9)


In [27]:
set(["couple", "fly", "fast", "fun", "fun"])
set(["fun", "couple", "love", "love"])

{'couple', 'fast', 'fly', 'fun'}

### Count를 했으니 DF로

In [61]:
term_df = pd.DataFrame()
term_df["word"] = word_count_dict.keys()
term_df["count"] = word_count_dict.values()
term_df["P_w_C"] = term_df["count"]/word_num
term_df.head()

Unnamed: 0,word,count,P_w_C
0,fun,3,0.333333
1,fly,1,0.111111
2,couple,2,0.222222
3,love,2,0.222222
4,fast,1,0.111111


---

## 2. 함수화

### 2.1. $P(Comedy)$, $P(Action)$ 이진 분류

In [109]:
import pandas as pd

review = pd.read_csv("data/steam_GTA5_review.csv", encoding="utf8")
review.head()

Unnamed: 0,rev_date,rev_text,rev_helpful,rev_games,rev_recommend
0,24 January,Things I Have Never Done in GTA Online Used Ha...,3 039 of 3 280 people 93 found this re...,133.0,Not Recommended
1,28 February,600 hours Great game Came back on to have some...,1 347 of 1 465 people 92 found this re...,163.0,Not Recommended
2,23 January,Banned in the middle of a game for no reason w...,806 of 889 people 91 found this review...,38.0,Not Recommended
3,30 August 2016,Let me start at the beginning in order to accu...,1 548 of 1 699 people 91 found this re...,87.0,Not Recommended
4,21 January,I used to love this game so much I ve played m...,874 of 965 people 91 found this review...,81.0,Not Recommended


In [73]:
def Prob_Class(data, Class_colName):
    rowNum = data.shape[0] 
    Class_set = set(data[Class_colName])
    Class_total_dict = {}
    
    for class_elem in Class_set:
        Class_total_dict[class_elem] = (sum(data[Class_colName] == class_elem))/rowNum
    return Class_total_dict

Prob_Class(review, "Rec").keys()

[u'Action', u'Comedy']

### 2.2. $P(word|Comedy)$, $P(word|Action)$

In [95]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk.data
import re

nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_wordlist(review, remove_stopwords = False):
    # review_text = re.sub("Review ", "", review)
    review_text = BeautifulSoup(review).get_text() # 텍스트 추출
    review_text = re.sub(" [^a-zA-Z]", " ", review_text) # 기호문자 제거
    review_text = re.sub(',','', review_text)
    words = review_text.lower().split() # 소문자 변환 후 분리
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return words

# 토크나이즈 함수생성(리뷰->토큰)
def review_to_sentences(review, tokenizer, remove_stopwords = False):
    # 1. 문단을 문장으로 스플릿
    raw_sentences = tokenizer.tokenize(review.strip())
    
    # 2. 문장들을 빈리스트에 하나씩 추가
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0: # 비어있는 문장은 스킵
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences # (each sentence is a list of words)

def train_Baysian(data, Text_colName, Class_colName, Class_dict):
    
    ### 텍스트 데이터 전처리 
    sentences = []
    for d in data[Text_colName]:
        sentences += review_to_sentences(d, tokenizer)
        
    ### ㅁㅁ
    
    for class_elem in Class_dict.keys():
        
        Class_df = data[data[Class_colName] == class_elem].copy()
        word_num, word_count_dict = 0, {}
        
        for i in Class_df.index:
            a = sentences[i]

            for w in set(a):
                try:
                    word_count_dict[w] += a.count(w)
                except:
                    word_count_dict[w] = a.count(w)
        #         print(w,a.count(w))    
            word_num += len(a)
        print(word_count_dict, word_num)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [98]:
# train_Baysian(data = review, 
#               Text_colName = "word", 
#               Class_colName = "Rec", 
#               Class_dict = Prob_Class(review, "Rec"))

train_Baysian(data = review, 
              Text_colName = "rev_text", 
              Class_colName = "rev_recommend", 
              Class_dict = Prob_Class(review, "rev_recommend"))


AttributeError: 'float' object has no attribute 'strip'

In [78]:
for class_elem in Prob_Class(review, "Rec").keys():
    print(class_elem)

Action
Comedy
