In [156]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=300, window=2, min_count=1, workers=4)

In [157]:
from gensim.test.utils import get_tmpfile
fname = "doc2vec_model"
model.save(fname)
model = Doc2Vec.load(fname)

In [158]:
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [182]:
import pandas as pd
import re
import numpy as np
columns = ('id',
           'title',
           'category',
           'story')

train = pd.read_csv('../../data/NewsAggregatorDataset/train.feature.txt',
                    names=columns, sep='\t')
valid = pd.read_csv('../../data/NewsAggregatorDataset/valid.feature.txt',
                    names=columns, sep='\t')
test = pd.read_csv('../../data/NewsAggregatorDataset/test.feature.txt',
                   names=columns, sep='\t')

def tokenize(doc):
    doc = re.sub(r"[',.]", '', doc)  # 記号を削除
    tokens = doc.split(' ')
    tokens = [token.lower() for token in tokens]  # 小文字に統一
    return tokens

def preprocessor(tokens):
    tokens = [token for token in tokens if token not in common_words]
    return tokens
    
def get_vector(doc):
    
    vector = model.infer_vector(doc)
    vector = np.multiply(vector, 10000)
    return pd.Series(vector)

In [183]:
from collections import Counter
train['tokens'] = train.title.apply(tokenize)
tokens = train['tokens'].tolist()
tokens = sum(tokens, [])  # flat list
common_words = Counter(tokens).most_common()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
train['tokens'] = train.tokens.apply(preprocessor)
X_train = train.tokens.apply(get_vector) # 説明変数
Y_train = train['category'].map({'b': 0, 't': 1, 'e': 2, 'm': 3}) # クラスを定義
lr = LogisticRegression() # ロジスティック回帰モデルのインスタンスを作成
lr.fit(X_train, Y_train) # ロジスティック回帰モデルの重みを学習

In [177]:
# print("coefficient = ", lr.coef_)
print("intercept = ", lr.intercept_)

intercept =  [ 0.87675372 -0.51600799  0.8103182  -1.17106392]


In [178]:
test['tokens'] = test.title.apply(tokenize)
test['tokens'] = test.tokens.apply(preprocessor)
X_test = train.tokens.apply(get_vector)
Y_pred = lr.predict(X_test)
print(Y_pred[:100])
print(test['category'].head(100).tolist())

[0 0 2 0 2 2 0 0 2 2 0 0 0 2 0 2 2 2 0 2 0 2 0 2 0 2 0 2 2 2 2 2 0 0 2 0 0
 2 2 0 2 0 0 2 0 0 2 0 0 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 0 2 2 2 2 0 2 0 0 2
 0 0 2 2 2 0 0 2 2 2 2 2 0 0 0 2 0 0 0 0 2 0 0 2 0 0]
['t', 'e', 'b', 't', 'e', 'e', 'm', 'b', 'b', 'e', 'b', 'm', 'b', 'e', 't', 'e', 'b', 't', 'e', 'b', 'm', 't', 'e', 't', 'e', 'e', 'b', 'e', 'e', 'e', 't', 't', 'b', 'e', 'b', 'e', 'e', 'b', 'b', 'm', 'e', 'e', 'b', 'b', 'b', 'e', 't', 'b', 'e', 'e', 'b', 'b', 'e', 't', 'e', 'e', 'b', 'b', 'b', 'b', 'b', 'b', 'e', 'b', 'b', 'e', 't', 'b', 't', 'b', 'e', 'b', 'e', 'b', 'e', 'e', 'e', 'e', 'e', 't', 'e', 'e', 'e', 'm', 'm', 'b', 'b', 'b', 'e', 'b', 'b', 'b', 'e', 'b', 'e', 'b', 't', 't', 't', 'e']


In [184]:
tokens

['mother',
 'left',
 'with',
 'gaping',
 'hole',
 'in',
 'cheek',
 'and',
 'no',
 'jawbone',
 'after',
 'kazakh',
 'doctors',
 '',
 '',
 'what',
 'the',
 'fox-time',
 'warner',
 'merger',
 'would',
 'mean',
 'for',
 'superheroes',
 'paul',
 'mccartney',
 'helps',
 'man',
 'propose',
 'to',
 'his',
 'girlfriend',
 'on',
 'stage',
 'during',
 'first',
 '',
 '',
 'girls',
 'gone',
 'wilds',
 'joe',
 'francis',
 'jailed',
 'and',
 'ordered',
 'to',
 'undertake',
 'anger',
 '',
 '',
 'candy',
 'crush',
 'maker',
 'king',
 'serves',
 'up',
 'bittersweet',
 'results',
 'shares',
 'fall',
 'department',
 'of',
 'motor',
 'vehicles',
 'in',
 'california',
 'investigating',
 'possible',
 'credit',
 'card',
 '',
 '',
 'total',
 'stopped',
 'buying',
 'novatek',
 'stock',
 'after',
 'jet',
 'downing',
 'amazon',
 'to',
 'buy',
 'twitch',
 'for',
 'about',
 '$1',
 'billion',
 'oprah',
 'chai:',
 'how',
 'she',
 'created',
 'her',
 'new',
 'tea',
 'with',
 'teavana',
 'and',
 'starbucks',
 '(video)'

In [98]:
train['category'].head(10)

0    m
1    b
2    e
3    e
4    b
5    b
6    b
7    t
8    e
9    b
Name: category, dtype: object