In [156]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=300, window=2, min_count=1, workers=4)

In [157]:
from gensim.test.utils import get_tmpfile
fname = "doc2vec_model"
model.save(fname)
model = Doc2Vec.load(fname)

In [158]:
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [188]:
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
tqdm.pandas()


columns = ('id',
           'title',
           'category',
           'story')

train = pd.read_csv('../../data/NewsAggregatorDataset/train.feature.txt',
                    names=columns, sep='\t')
valid = pd.read_csv('../../data/NewsAggregatorDataset/valid.feature.txt',
                    names=columns, sep='\t')
test = pd.read_csv('../../data/NewsAggregatorDataset/test.feature.txt',
                   names=columns, sep='\t')

def tokenize(doc):
    doc = re.sub(r"[',.]", '', doc)  # 記号を削除
    tokens = doc.split(' ')
    tokens = [token.lower() for token in tokens]  # 小文字に統一
    return tokens

def preprocessor(tokens):
    tokens = [token for token in tokens if token not in common_words]
    return tokens
    
def get_vector(doc):
    
    vector = model.infer_vector(doc)
    vector = np.multiply(vector, 10000)
    return pd.Series(vector)

  from pandas import Panel


In [197]:
from collections import Counter
train['tokens'] = train.title.apply(tokenize)
vocab = train['tokens'].tolist()
vocab = sum(vocab, [])  # flat list
counter = Counter(vocab)
vocab = [
    token
    for token, freq in counter.most_common()
    if 2 < freq < 300
]

In [199]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
train['tokens'] = train.tokens.progress_apply(preprocessor)
X_train = train.tokens.progress_apply(get_vector) # 説明変数
Y_train = train['category'].map({'b': 0, 't': 1, 'e': 2, 'm': 3}) # クラスを定義
lr = LogisticRegression() # ロジスティック回帰モデルのインスタンスを作成
lr.fit(X_train, Y_train) # ロジスティック回帰モデルの重みを学習

100%|██████████| 10672/10672 [00:32<00:00, 324.32it/s]
100%|██████████| 10672/10672 [00:06<00:00, 1707.93it/s]


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [200]:
# print("coefficient = ", lr.coef_)
print("intercept = ", lr.intercept_)

intercept =  [ 0.87050237 -0.53406917  0.801551   -1.1379842 ]


In [201]:
test['tokens'] = test.title.apply(tokenize)
test['tokens'] = test.tokens.progress_apply(preprocessor)
X_test = train.tokens.progress_apply(get_vector)
Y_pred = lr.predict(X_test)
print(Y_pred[:100])
print(test['category'].head(100).tolist())

100%|██████████| 1334/1334 [00:04<00:00, 316.38it/s]
100%|██████████| 10672/10672 [00:06<00:00, 1768.60it/s]

[2 2 0 0 0 2 0 2 0 0 0 0 0 0 0 0 2 0 2 0 0 0 2 0 2 0 0 0 2 0 0 0 2 0 2 0 2
 2 0 2 2 2 0 2 0 0 0 2 0 0 0 0 2 2 2 0 0 0 0 0 2 0 2 2 2 0 0 0 0 2 2 0 2 2
 2 0 2 2 0 0 0 2 0 0 2 0 0 2 2 0 2 2 2 2 0 0 2 0 0 0]
['t', 'e', 'b', 't', 'e', 'e', 'm', 'b', 'b', 'e', 'b', 'm', 'b', 'e', 't', 'e', 'b', 't', 'e', 'b', 'm', 't', 'e', 't', 'e', 'e', 'b', 'e', 'e', 'e', 't', 't', 'b', 'e', 'b', 'e', 'e', 'b', 'b', 'm', 'e', 'e', 'b', 'b', 'b', 'e', 't', 'b', 'e', 'e', 'b', 'b', 'e', 't', 'e', 'e', 'b', 'b', 'b', 'b', 'b', 'b', 'e', 'b', 'b', 'e', 't', 'b', 't', 'b', 'e', 'b', 'e', 'b', 'e', 'e', 'e', 'e', 'e', 't', 'e', 'e', 'e', 'm', 'm', 'b', 'b', 'b', 'e', 'b', 'b', 'b', 'e', 'b', 'e', 'b', 't', 't', 't', 'e']





In [198]:

len(vocab)

5762

In [194]:
train['category'].head(10)

0    m
1    b
2    e
3    e
4    b
5    b
6    b
7    t
8    e
9    b
Name: category, dtype: object