# Part 2.7 Doc2vec

在笔记2.5里，训练doc2vec的时候，我选了100维，最后分类器用了Logistic Regression。不过作者在[sentiment-analysis](https://github.com/pangolulu/sentiment-analysis)中，训练的是200维的向量，分类器用了SVM，及RBF核。

我也尝试使用这样的设置。
......
最后结果是0.868。比LR的效果差了一点。
维度增加了，还用了强分类器，结果还不如100维下的LR好……

我再试一试200维下的LR效果如何。
得分是0.866，没想象得好。看来100维就够了，200维可能过拟合了。

In [2]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument


def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    words = letters_only.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    return(" ".join(meaningful_words))


def tag_reviews(reviews, prefix):
    tagged = []
    for i, review in enumerate(reviews):
        tagged.append(TaggedDocument(words=review.split(), tags=[prefix + '_%s' % i]))
    return tagged

In [5]:
# gensim modules
from gensim.models import Doc2Vec

# numpy
import numpy as np

# classifier
from sklearn.linear_model import LogisticRegression

# random
from random import shuffle

# preprocess packages
import pandas as pd
# import sys
# sys.path.insert(0, '..')
# from utils.TextPreprocess import review_to_words, tag_reviews


'''
Training Data
'''
train = pd.read_csv("../Sentiment/data/labeledTrainData.tsv", header=0, 
                         delimiter='\t', quoting=3, error_bad_lines=False)
num_reviews = train["review"].size

print("Cleaning and parsing the training set movie reviews...")
clean_train_reviews = []
for i in range(0, num_reviews):
    clean_train_reviews.append(review_to_words(train["review"][i]))

'''
Test Data
'''
test = pd.read_csv("../Sentiment/data/testData.tsv", header = 0, delimiter = "\t", quoting = 3)

num_reviews = len(test["review"])
clean_test_reviews = []

print("Cleaning and parsing the test set movie reviews...")
for i in range(0, num_reviews):
    clean_review = review_to_words(test["review"][i])
    clean_test_reviews.append(clean_review)


# Unlabeled Train Data
unlabeled_reviews = pd.read_csv("../Sentiment/data/unlabeledTrainData.tsv", header = 0, delimiter = "\t", quoting = 3)
num_reviews = len(unlabeled_reviews["review"])
clean_unlabeled_reviews = []

print("Cleaning and parsing the test set movie reviews...")
for i in range( 0, num_reviews):
    if( (i+1)%5000 == 0 ):
        print("Review %d of %d\n" % (i+1, num_reviews))
    clean_review = review_to_words(unlabeled_reviews["review"][i])
    clean_unlabeled_reviews.append(clean_review)

Cleaning and parsing the training set movie reviews...
Cleaning and parsing the test set movie reviews...
Cleaning and parsing the test set movie reviews...
Review 5000 of 50000

Review 10000 of 50000

Review 15000 of 50000

Review 20000 of 50000

Review 25000 of 50000

Review 30000 of 50000

Review 35000 of 50000

Review 40000 of 50000

Review 45000 of 50000

Review 50000 of 50000



In [6]:
# tag all reviews
train_tagged = tag_reviews(clean_train_reviews, 'TRAIN')
test_tagged = tag_reviews(clean_test_reviews, 'TEST')
unlabeled_train_tagged = tag_reviews(clean_unlabeled_reviews, 'UNTRAIN')

In [7]:
# model construction
model_dbow = Doc2Vec(min_count=1, window=10, size=200, sample=1e-3, negative=5, dm=0, workers=3)

# build vocabulary
all_tagged = []
tag_objects = [train_tagged, test_tagged, unlabeled_train_tagged]
for tag_object in tag_objects:
    for tag in tag_object:
        all_tagged.append(tag)

model_dbow.build_vocab(all_tagged)

# train two model
train_tagged2 = []
tag_objects = [train_tagged, unlabeled_train_tagged]
for tag_object in tag_objects:
    for tag in tag_object:
        train_tagged2.append(tag)

for i in range(10):
    shuffle(train_tagged2)
    model_dbow.train(train_tagged2, total_examples=len(train_tagged2), epochs=1, start_alpha=0.025, end_alpha=0.025)


train_array_dbow = []
for i in range(len(train_tagged)):
    tag = train_tagged[i].tags[0]
    train_array_dbow.append(model_dbow.docvecs[tag])

train_target = train['sentiment'].values

test_array_dbow = []
for i in range(len(test_tagged)):
    test_array_dbow.append(model_dbow.infer_vector(test_tagged[i].words))


In [8]:
from sklearn.svm import SVC

# classification model
clf = SVC(C=1.0, kernel='rbf')

# train
clf.fit(train_array_dbow, train_target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
# predict
result = clf.predict(test_array_dbow)

# output
print("output...")
output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
output.to_csv('doc2vec_svm.csv', index=False, quoting=3)

output...


In [10]:
from sklearn.linear_model import LogisticRegression

lr_dbow = LogisticRegression()
lr_dbow.fit(train_array_dbow, train_target)
result_dbow = lr_dbow.predict(test_array_dbow)

output_dbow = pd.DataFrame(data={'id': test['id'], 'sentiment': result_dbow})
output_dbow.to_csv('doc2vec_dbow200.csv', index=False, quoting=3)