In [62]:
import gensim
import nltk
from nltk.corpus import stopwords
import re

In [63]:
def review_to_wordlist(review, remove_stopwords=False):
    # review = BeautifulSoup(review, "html.parser").get_text()
    review_text = re.sub("[^a-zA-Z]"," ", review)

    words = review_text.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    # print(words)
    return(words)

In [64]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    '''
    1. 将评论文章，按照句子段落来切分(所以会比文章的数量多很多)
    2. 返回句子列表，每个句子由一堆词组成
    '''
    review = BeautifulSoup(review, "html.parser").get_text()
    # raw_sentences 句子段落集合
    raw_sentences = tokenizer.tokenize(review)
    # print(raw_sentences)
    
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            # 获取句子中的词列表
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

In [65]:
import pandas as pd
root_dir = "C:/Users/andea/Desktop/STARed-Kernel/github上找的kaggle实例"
train = pd.read_csv('%s/%s' % (root_dir, 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
test = pd.read_csv('%s/%s' % (root_dir, 'testData.tsv'), header=0, delimiter="\t", quoting=3)

In [66]:
import nltk.data
import nltk
from nltk.corpus import stopwords
tokenizer = nltk.data.load(r'tokenizers\\punkt\\english.pickle',encoding='utf-8')


In [67]:
from bs4 import BeautifulSoup
sentences = []
for i, review in enumerate(train["review"]):
    # print(i, review)
    sentences += review_to_sentences(review, tokenizer, True)

In [68]:
import numpy as np

In [69]:
print(np.shape(train["review"]))
print(np.shape(sentences))

(25000,)
(267188,)


In [70]:
import time
from gensim.models import Word2Vec
# 模型参数
num_features = 300    # 词向量维度                    
min_word_count = 40   # Minimum word count                        
num_workers = 4       # 并行线程数
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

In [71]:
%%time
# 训练模型
print("训练模型中...")
model = Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count=min_word_count, \
            window=context, sample=downsampling)
print("训练完成")

训练模型中...
训练完成
Wall time: 24.9 s


In [72]:
print('保存模型...')
model.init_sims(replace=True)
model_name = "%s/%s" % (root_dir, "300features_40minwords_10context")
model.save(model_name)
print('保存结束')

保存模型...
保存结束


In [73]:
from sklearn.model_selection import cross_val_score

In [74]:
def makeFeatureVec(words, model, num_features):
    '''
    对段落中的所有词向量进行取平均操作
    '''
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.

    # Index2word包含了词表中的所有词，为了检索速度，保存到set中
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])

    # 取平均
    featureVec = np.divide(featureVec, nwords)
    return featureVec


In [75]:
#给定一个文本列表，每个文本由一个词列表组成，返回每个文本的词向量平均值
def getAvgFeatureVecs(reviews, model, num_features):
    '''
    给定一个文本列表，每个文本由一个词列表组成，返回每个文本的词向量平均值
    '''
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")

    for review in reviews:
        if counter % 5000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))

        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1

    return reviewFeatureVecs

In [76]:
# 预处理数据
label = train['sentiment']
train_data = []
for i in range(len(train['review'])):
    train_data.append(' '.join(review_to_wordlist(train['review'][i])))
test_data = []
for i in range(len(test['review'])):
    test_data.append(' '.join(review_to_wordlist(test['review'][i])))

# 预览数据
print(train_data[0], '\n')
print(test_data[0])

with all this stuff going down at the moment with mj i ve started listening to his music watching the odd documentary here and there watched the wiz and watched moonwalker again maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent moonwalker is part biography part feature film which i remember going to see at the cinema when it was originally released some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay br br visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him br br the actual feature film bit when it finally starts is only on for m

In [77]:
%time 
trainDataVecs = getAvgFeatureVecs(train_data, model, num_features)
print(np.shape(trainDataVecs))

Wall time: 0 ns
Review 0 of 25000


  featureVec = np.add(featureVec, model[word])


Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
(25000, 300)


In [78]:
from sklearn.linear_model import LogisticRegression

In [79]:
model_LR = LogisticRegression(C=1.0) # (alpha=1.0, class_prior=None, fit_prior=True)
# 为了在预测的时候使用

In [80]:
model_LR.fit(trainDataVecs,label)

LogisticRegression()

In [None]:
%time testDataVecs = getAvgFeatureVecs(test_data, model, num_features)
print(np.shape(testDataVecs))

Review 0 of 25000


  featureVec = np.add(featureVec, model[word])


In [None]:
result = model_LR.predict( testDataVecs )

In [None]:
print(result)