# 総合添削問題

#### 問題

- ランダムフォレスト、ナイーブベイズ分析を用いてlive door newsコーパスの精度を調べてください。
- また、livedoor newsコーパスの名詞、動詞、形容詞、形容動詞のみ取り出した時の精度を同様に調べてください。

In [54]:
%%time

import glob
import pandas as pd
import numpy as np
from janome.tokenizer import Tokenizer
from gensim.models import word2vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# livedoor newsの読み込みと分類
def load_livedoor_news_corpus(base_dir):
    category = {
#         "dokujo-tsushin": 1,
       "it-life-hack":2,
#         "kaden-channel": 3,
#         "livedoor-homme": 4,
#         "movie-enter": 5,
#         "peachy": 6,
#         "smax": 7,
#         "sports-watch": 8,
        "topic-news":9
    }
    docs  = []
    labels = []

    for c_name, c_id in category.items():
        # ./text/dokujo-tsushin/dokujo-tsushin-{6903790}.txt
        files = glob.glob("./{base_dir}/{c_name}/{c_name}*.txt".format(base_dir=base_dir, c_name=c_name))

        text = ""
        for file in files:
            with open(file, "r", encoding="utf-8") as f:
                lines = f.read().splitlines() 
                # 件名(3行目)、本文(4行目以降)を取得
                subject = lines[2]
                body = "".join(lines[3:])
                text = subject + body

            docs.append(text)
            labels.append(c_id)

    return docs, labels

def tokenize(text, filter_speech = None):
    t = Tokenizer()
    tokens = t.tokenize(",".join(text))
    result = []
    for token in tokens:
        part_of_speech = token.part_of_speech.split(",")[0]

        if filter_speech == None or part_of_speech in filter_speech: 
            result.append(token.surface)
    return result

def validate_score(train_doc_X, test_doc_X, train_y, test_y):
    docs_words = tokenize(train_doc_X)
#    docs_model = word2vec.Word2Vec(docs_words, size=10, min_count=3, window=1)
    vectorizer = TfidfVectorizer()
    train_X = vectorizer.fit_transform(train_doc_X)
    test_X = vectorizer.transform(test_doc_X)
    f1_score_rf = classify_random_forest(train_X, test_X, train_y, test_y)
    f1_score_nb = classify_naive_bayes(train_X, test_X, train_y, test_y)
    print("精度 RandomForest={} Naive Bayes={}".format(f1_score_rf, f1_score_nb))

def classify_random_forest(train_X, test_X, train_y, test_y):
    model = RandomForestClassifier(n_estimators=2)
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)
    return f1_score(test_y, pred_y, average="micro")

def classify_naive_bayes(train_X, test_X, train_y, test_y):
    model = GaussianNB()
    print  (type(train_X.todense()))
    print  (type(train_X))
    model.fit(train_X.todense(), train_y)
    pred_y = model.predict(test_X.todense())
    return f1_score(test_y, pred_y, average="micro")

def validate_score_filter_speech(train_doc_X, test_doc_X, train_y, test_y):
    docs_words = tokenize(train_doc_X, filter_speech = ["名詞", "形容詞", "動詞", "形容動詞"])
    vectorizer = TfidfVectorizer()
    train_X = vectorizer.fit_transform(train_doc_X)
    test_X = vectorizer.transform(test_doc_X)
    
    score_rf = classify_random_forest(train_X, test_X, train_y, test_y)
    score_nb = classify_naive_bayes(train_X, test_X, train_y, test_y)
    print("名詞、動詞、形容詞、形容動詞で絞り込んだ場合の精度 RandomForest={} Naive Bayes={}".format(score_rf, score_nb))

docs, labels = load_livedoor_news_corpus("text")
train_X, test_X, train_y, test_y = train_test_split(docs, labels, test_size=0.3, random_state=int(time.time()))

validate_score(train_X, test_X, train_y, test_y)

# 名詞、動詞、形容詞、形容動詞で絞り込んだ場合
validate_score_filter_speech(train_X, test_X, train_y, test_y)

<class 'numpy.matrixlib.defmatrix.matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
精度 RandomForest=0.8475609756097561 Naive Bayes=0.9512195121951219
<class 'numpy.matrixlib.defmatrix.matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
名詞、動詞、形容詞、形容動詞で絞り込んだ場合の精度 RandomForest=0.8902439024390244 Naive Bayes=0.9512195121951219
CPU times: user 3min 58s, sys: 6.96 s, total: 4min 5s
Wall time: 4min 6s


##  解答例