In [1]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=300, window=2, min_count=1, workers=4)

In [2]:
from gensim.test.utils import get_tmpfile
fname = "doc2vec_model"
model.save(fname)
model = Doc2Vec.load(fname)

In [3]:
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [31]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()


columns = ('id',
           'title',
           'category',
           'story')

train = pd.read_csv('../../data/NewsAggregatorDataset/train.feature.txt',
                    names=columns, sep='\t')
valid = pd.read_csv('../../data/NewsAggregatorDataset/valid.feature.txt',
                    names=columns, sep='\t')
test = pd.read_csv('../../data/NewsAggregatorDataset/test.feature.txt',
                   names=columns, sep='\t')

  from pandas import Panel


In [66]:
import re
from sklearn.metrics import accuracy_score

def tokenize(doc):
    doc = re.sub(r"[',.]", '', doc)  # 記号を削除
    tokens = doc.split(' ')
    tokens = [token.lower() for token in tokens]  # 小文字に統一
    return tokens

def preprocessor(tokens):
    tokens = [token for token in tokens if token in vocab]
    return tokens
    
def doc2vec(doc):
    vector = model.infer_vector(doc)
    vector = np.multiply(vector, 10000)
    return pd.Series(vector)

def bag_of_words(doc):
    vector = [0]*len(vocab)
    for word in doc:
        if word in vocab:
            vector[vocab.index(word)] += 1
    return pd.Series(vector)

def accuracy(predict, y):
    return (predict == y).mean()

In [32]:
from collections import Counter
train['tokens'] = train.title.apply(tokenize)
vocab = train['tokens'].tolist()
vocab = sum(vocab, [])  # flat list
counter = Counter(vocab)
vocab = [
    token
    for token, freq in counter.most_common()
    if 2 < freq < 300
]

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
train['tokens'] = train.tokens.progress_apply(preprocessor)
X_train = train.tokens.progress_apply(bag_of_words) # 説明変数
Y_train = train['category'].map({'b': 0, 't': 1, 'e': 2, 'm': 3}) # クラスを定義
lr = LogisticRegression(class_weight='balanced') # ロジスティック回帰モデルのインスタンスを作成
lr.fit(X_train, Y_train) # ロジスティック回帰モデルの重みを学習

100%|██████████| 10672/10672 [00:01<00:00, 6636.59it/s]
100%|██████████| 10672/10672 [00:06<00:00, 1593.02it/s]


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
# print("coefficient = ", lr.coef_)
print("intercept = ", lr.intercept_)

intercept =  [ 0.35953993 -0.0887583   0.42194464 -0.69272628]


In [68]:
#  訓練データで予測
Y_pred = lr.predict(X_train)
Y_train = train['category'].map({'b': 0, 't': 1, 'e': 2, 'm': 3})
print(Y_pred[:10])
print(Y_train.head(10).tolist())

[3 0 2 2 0 2 1 1 2 2]
[3, 0, 2, 2, 0, 0, 0, 1, 2, 0]


In [69]:
print('訓練データ', accuracy(Y_pred, Y_train))

訓練データ 0.325712143928036


In [70]:
# 評価データで予測
test['tokens'] = test.title.apply(tokenize)
test['tokens'] = test.tokens.progress_apply(preprocessor)
X_test = test.tokens.progress_apply(doc2vec)  # 入力
Y_pred = lr.predict(X_test)  # 予測
Y_test = test['category'].map({'b': 0, 't': 1, 'e': 2, 'm': 3})  # 正解
print('predict', Y_pred[:10])
print('correct answer', Y_test.head(10).tolist())

100%|██████████| 1334/1334 [00:00<00:00, 2057.06it/s]
100%|██████████| 1334/1334 [00:00<00:00, 1726.15it/s]

predict [1 2 2 2 2 0 1 0 0 0]
correct answer [1, 2, 0, 1, 2, 2, 3, 0, 0, 2]





In [71]:
print('評価データ', accuracy(Y_pred, Y_test))

評価データ 0.24287856071964017
