## ナイーブベイズ法 Naive Bayes


### Basic Libraries

In [14]:
# coding: utf-8
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import MeCab
%matplotlib inline

### Libraries for Machine Learning

In [58]:
from sklearn import naive_bayes
from sklearn import metrics, model_selection
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
mecab = MeCab.Tagger()

text = '解析したいテキストです'
mecab.parse('')
node = mecab.parseToNode(text)
while node:
    word = node.surface
    pos = node.feature.split(',')[1]
    print('{0}, {1}'.format(word, pos))
    node = node.next

, *
解析, サ変接続
し, 自立
たい, *
テキスト, 一般
です, *
, *


### 3. Preprocessing Datasets
分かち書きファイルの生成

In [28]:
import glob

def get_data_from_path(filepath):
    with open(filepath) as f:
        return f.read()

def write_data(filepath, data):
    f = open(filepath, 'w')
    f.write(data)
    f.close()

def get_wakati(text):
    tagger = MeCab.Tagger ("-Ochasen")
    txt = tagger.parse(text)
    lines = txt.split('\n')
    rst = ''
    for line in lines:
        tmp = line.split('\t')
        if len(tmp) <= 4: continue
        h = tmp[3]
        if h.startswith('名詞') or h.startswith('動詞') or h.startswith('形容詞'):
            rst += tmp[2] + ' '
    return rst

categories = glob.glob('./text/*')
categories_wakati_path = './wakati/'
for ctg in categories:
    ctg_name = ctg.split('/')[-1]
    
    texts = glob.glob(ctg + '/*')
    if len(texts) < 100: continue
    
    texts = texts[:100]
    
    ctg_text = ''
    for txt in texts:
        data = get_data_from_path(txt)
        ctg_text += get_wakati(data) + "\n\n"
    write_data(categories_wakati_path + ctg_name + '.txt', ctg_text)

Bag-of-Words ベクトルの作成

In [65]:
corpus = []
y = []

categories = glob.glob('./wakati/*')
for ctg in categories:
    ctg_name = ctg.split('/')[-1][:-4]
    ctg_data = get_data_from_path(ctg)
    articles = ctg_data.split("\n\n")
    corpus.extend(articles)
    y.extend([ctg_name for i in range(len(articles))])

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
X = X.toarray()
print(X.shape)
print(len(y))

(909, 30436)
909


トレーニングデータとテストデータに分割

In [66]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X,y, test_size=0.5, random_state=0)

### 4. Training Model
GaussianNB() => 0.68
MultinomialNB() => 0.82

In [71]:
clf = naive_bayes.MultinomialNB(alpha=0.1, fit_prior='True' )
clf.fit(X_train, Y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior='True')

### 5. Evaluate Model

In [73]:
clf.score(X_test, Y_test)

0.8241758241758241

### sklearn.naive_bayes.MultinomialNB
#### parameters
- alpha: ラプラススムージングの係数
- fit_prior: 以前学習した結果を維持するか
- class_prior: クラスの事前確率

#### attributes
