In [23]:
# モジュール読み込み
import csv
import pickle

import gensim
from gensim import models
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [24]:
dataset_file_name = './dataset/corpus.csv'
model_file_name = 'logistic.pkl'
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/IA/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
def make_dataset(dataset_file_name):
    """
    csvファイルを扱いやすいフォーマットに変形
    """
    with open(dataset_file_name, 'r') as f:
        data = list(csv.reader(f))
    texts, label_ids = [], []
    label2id = {}
    idx_label, idx_sentence = 0, 1
    sum_words = []
    for counter, row in enumerate(data):
        if counter == 0:
            continue
        label = row[idx_label]
        if label not in label2id:
            label2id[label] = len(label2id)
        label_ids.append(label2id[label])
        word_list = nltk.word_tokenize(row[idx_sentence])
        texts.append(word_list)
        for line in word_list:
            for word in line:
                sum_words.append(word)
    print(word_list, word)
    print('sum_words: ', len(sum_words))
    id2label = {v: k for k, v in label2id.items()}
    return texts, label_ids, id2label, label2id

texts, label_ids, id2label, label2id = make_dataset(dataset_file_name)

['実に', '大浦', '武士', '冥々', '裡', '照覧', '給う', '擦ら'] ら
sum_words:  102532


In [26]:
def make_bow_model(texts):
    """
    テキストのベクトル表現を作成
    """
    # 頻出語の辞書を作成
    dictionary = gensim.corpora.Dictionary(texts)
    # 辞書を用いてBoW形式に文章を行列化
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    print(len(corpus))
    
    # BoW形式で作成したcorpusをtfidfを用いて重み付け
    tfidf_model = models.TfidfModel(corpus)
    tfidf_corpus = tfidf_model[corpus]

    num_words = len(dictionary)
    dense_corpus = gensim.matutils.corpus2dense(tfidf_corpus, num_terms=num_words).T
    print('dense_corpus: ', dense_corpus)
    return dense_corpus, dictionary, num_words

dense_corpus, dictionary, num_words = make_bow_model(texts)

8875
dense_corpus:  [[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.50336647  0.34863934  0.35234514 ...,  0.          0.          0.        ]
 [ 0.          0.          0.37041566 ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.51831585  0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.63854194  0.        ]
 [ 0.          0.          0.         ...,  0.          0.38578594
   0.4176282 ]]


In [29]:
def train(dense_corpus, label_ids, id2label):
    """
    クロスバリデーションかけて訓練データとテストデータの作成
    """
    X_train, X_test, y_train, y_test = train_test_split(dense_corpus, label_ids, test_size=0.2, random_state=42)

    clf = LogisticRegression(C=1, penalty='l2')
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    target_names = list(id2label.values())
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred))
    return clf

clf = train(dense_corpus, label_ids, id2label)

             precision    recall  f1-score   support

      dazai       0.69      0.96      0.80       732
       mori       0.95      0.62      0.75       492
  akutagawa       0.87      0.68      0.77       551

avg / total       0.82      0.78      0.78      1775

[[704   7  21]
 [152 307  33]
 [166   9 376]]


In [30]:
# pickleにしてモデルを保存 
with open('logistic.pkl', 'wb') as f:
    pickle.dump((clf, dictionary, num_words, id2label, label2id), f)