In [62]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer#词干提取
from nltk.stem import WordNetLemmatizer#词形还原
from nltk import sent_tokenize#英语句子切分

In [2]:
# Load the previously trained Word2Vec model from disk.
from gensim.models import Word2Vec
model = Word2Vec.load("model/300features_40minwords_10context")
# Shape of the embedding matrix: one row per vocabulary word, one column per
# vector dimension (the recorded run shows 10090 words x 300 dimensions).
# `wv.vectors` is used instead of the deprecated `wv.syn0` alias, which emits a
# DeprecationWarning in gensim 3.x and is removed in gensim 4.0.
model.wv.vectors.shape

  after removing the cwd from sys.path.


(10090, 300)

In [None]:
# Look up the vector of a single word and show its first 10 components.
# Indexing goes through `model.wv` because `Word2Vec.__getitem__` is deprecated
# (it only forwards to `model.wv` and is removed in gensim 4.0).
model.wv['queen'][:10]

In [None]:
# Sanity check of the embeddings: ask the model which of the four words
# does not belong with the others.
model.wv.doesnt_match('chair desk bed apple'.split())

In [3]:
# Load the data: labelled training reviews and the test reviews.
# Both files are tab-separated with the column names in the first row;
# quoting=3 is csv.QUOTE_NONE, so quote characters inside reviews are kept as-is.
train_data = pd.read_csv(
    'data/labeledTrainData.tsv',
    header=0,
    sep='\t',
    quoting=3,
)
test_data = pd.read_csv(
    'data/testData.tsv',
    header=0,
    sep='\t',
    quoting=3,
)

## 1. 获得每个review的词向量+随机森林
一个文本的词向量=其中每个词的词向量取平均

In [4]:
def _english_stopword_set():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once.

    The set is memoised on the function object so that cleaning thousands of
    reviews does not re-read and re-build the stop-word list for every review.
    """
    cached = getattr(_english_stopword_set, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopword_set._cache = cached
    return cached


def txt2wordlist(raw_txt, remove_stop):
    """Pre-process one raw text and return it as a list of normalised words.

    Steps, in order: strip HTML markup, lower-case, replace every character
    that is not an ASCII letter with a space, tokenize, optionally drop
    English stop words, then Lancaster-stem and WordNet-lemmatize each word.

    Args:
        raw_txt: Raw text of one review; may contain HTML markup.
        remove_stop: If truthy, English stop words are removed before stemming.

    Returns:
        list of str: the processed words, in their original order.
    """
    # LancasterStemmer is used on purpose: according to the original author's
    # note, PorterStemmer raised errors on some out-of-vocabulary words
    # such as "OED".
    l_stem = LancasterStemmer()
    w_lem = WordNetLemmatizer()

    txt = BeautifulSoup(raw_txt, 'lxml').get_text().lower()
    txt = re.sub('[^a-zA-Z]', ' ', txt)
    words = word_tokenize(txt)

    if remove_stop:
        # Stop words are matched against the un-stemmed, lower-cased tokens.
        stops = _english_stopword_set()
        words = [w for w in words if w not in stops]

    # Stem first, then lemmatize the stem (same order as before, one pass).
    return [w_lem.lemmatize(l_stem.stem(w)) for w in words]

In [5]:
import numpy as np
def Review_Vec(review_wordlist, model, num_feature):
    """Return the average word vector of one tokenised review.

    Only words present in the model's vocabulary contribute; all other words
    are ignored.

    Args:
        review_wordlist: Iterable of word tokens for a single review.
        model: Word2Vec model; `model.wv.index2word` supplies the vocabulary
            and `model.wv[word]` the vector of a word.
        num_feature: Length of the returned vector (the word-vector dimension).

    Returns:
        numpy.ndarray: 1-D array of length `num_feature` holding the mean of
        the vectors of the in-vocabulary words. If the review contains no
        in-vocabulary word (including an empty review) an all-zero vector is
        returned instead of dividing by zero, which would produce NaNs that
        the downstream classifier cannot handle.
    """
    AFeatureVec = np.zeros((num_feature), dtype='float32')
    index2word_set = set(model.wv.index2word)  # set membership is O(1) per word
    nword = 0
    for w in review_wordlist:
        if w in index2word_set:
            nword += 1
            # `model.wv[w]` replaces the deprecated `model[w]` shortcut.
            AFeatureVec = np.add(AFeatureVec, model.wv[w])
    if nword == 0:
        # Nothing to average: keep the zero vector rather than emitting NaNs.
        return AFeatureVec
    return np.divide(AFeatureVec, nword)
def Avg_Review_Vecs(reviews_wordlists,model, num_feature):
    """Build the matrix of averaged word vectors, one row per review.

    Args:
        reviews_wordlists: Indexable collection of tokenised reviews.
        model: Word2Vec model, passed through to `Review_Vec`.
        num_feature: Word-vector dimension, i.e. the number of columns.

    Returns:
        numpy.ndarray: float32 array of shape (len(reviews_wordlists), num_feature).
        A progress line is printed every 1000 reviews.
    """
    total = len(reviews_wordlists)
    matrix = np.zeros((total, num_feature), dtype='float32')
    for row in range(total):
        if row % 1000 == 0:
            print("%d review of %d"%(row, total))
        matrix[row] = Review_Vec(reviews_wordlists[row], model, num_feature)
    return matrix

In [6]:
# Word2Vec parameters (of these, only num_feature is referenced by the cells shown in this notebook)
num_worker=4# number of parallel worker threads
num_feature=300# dimensionality of the word vectors
num_min_count=40# minimum word frequency; words occurring less often are left out of the analysis
num_window=10# context window size
num_downsample=1e-3# threshold for random down-sampling of high-frequency words; default is 1e-3, range is (0,1e-5)

In [18]:
# Turn every training review into a list of normalised words, with stop words removed.
clean_train_reviews = [
    txt2wordlist(raw_review, remove_stop=True)
    for raw_review in train_data['review']
]

In [None]:
# One averaged word vector per training review; rows follow the order of clean_train_reviews.
train_vecs = Avg_Review_Vecs(
    clean_train_reviews,
    model=model,
    num_feature=num_feature,
)
# Preview the first five rows.
train_vecs[:5]

In [19]:
# Turn every test review into a list of normalised words, with stop words removed.
clean_test_reviews = [
    txt2wordlist(raw_review, remove_stop=True)
    for raw_review in test_data['review']
]

In [None]:
# One averaged word vector per test review; rows follow the order of clean_test_reviews.
test_vecs = Avg_Review_Vecs(
    clean_test_reviews,
    model=model,
    num_feature=num_feature,
)
# Preview the first five rows.
test_vecs[:5]

### 使用随机森林(、极端随机森林、梯度提升)预测

In [None]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
# Train a 100-tree random forest on the averaged review vectors.
# random_state is fixed so that re-running the notebook reproduces the same
# forest and therefore the same submission file.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest = forest.fit(train_vecs, train_data['sentiment'])
# Predicted sentiment label for every test review.
result = forest.predict(test_vecs)

### 创建提交文件

In [None]:
# Pair each test review id with its predicted sentiment and write the table to CSV.
# index=False leaves out the DataFrame index; quoting=3 is csv.QUOTE_NONE.
output = pd.DataFrame(
    {
        "id": test_data["id"],
        "sentiment": result,
    }
)
output.to_csv(
    "data/result/Word2Vec_model.csv",
    index=False,
    quoting=3,
)

## 2. 对review聚类
使用kmeans对word2vec词汇表中的所有词进行聚类，使每一个词都有一个对应的类；然后统计每条review中各个类的词出现的次数，用这个计数向量来表示该review。

In [9]:
# Embedding matrix, one row per vocabulary word.
# `wv.vectors` replaces the deprecated `wv.syn0` alias (removed in gensim 4.0).
word_vectors = model.wv.vectors
# Number of k-means centres: vocabulary size // 10, i.e. about 10 words per
# cluster on average (1009 clusters for the 10090-word vocabulary).
num_clusters = word_vectors.shape[0]//10
num_clusters

  """Entry point for launching an IPython kernel.


1009

In [10]:
import time
from sklearn.cluster import KMeans
start=time.time()  # wall-clock start, used to report the run time
# n_jobs=-2 uses every CPU except one.
# random_state pins the centroid initialisation so the cluster ids, and all
# features built from them, are repeatable between runs.
# NOTE(review): `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0;
# drop the argument when upgrading scikit-learn.
kmeans=KMeans(n_clusters=num_clusters,n_jobs=-2,random_state=0)
# Cluster index assigned to each vocabulary word (same order as word_vectors).
idx=kmeans.fit_predict(word_vectors)
end=time.time()
run_time=end-start
print("run time: %d"%(run_time))

run time: 90


In [11]:
# One cluster label per row of word_vectors, i.e. per vocabulary word.
idx.shape

(10090,)

In [None]:
# Cluster labels of the first five vocabulary words.
idx[:5]

#### 将word2vec中每个词和上述生成的每个词的类别做成字典形式，从而可以直观的看到哪些词被分到了哪一类里

In [13]:
# Map every vocabulary word to the id of the cluster it was assigned to.
word_idx = {
    word: cluster
    for word, cluster in zip(model.wv.index2word, idx)
}

In [14]:
# Largest cluster id that occurs in the word -> cluster mapping.
max(word_idx.values())

1008

In [None]:
# Print the member words of the first ten clusters to eyeball their quality.
for cluster_id in range(10):
    print("Cluster %d:"%(cluster_id))
    members = []
    for word in model.wv.index2word:
        if word_idx[word] == cluster_id:
            members.append(word)
    print(members)

#### 将每个评论转换成这1009个类别的词向量，从而得到一个25000X1009大小的矩阵

In [56]:
def Review_Class_Vec(review, model, word_idx, n_clusters=None):
    """Count, for one tokenised review, how many of its words fall in each cluster.

    Args:
        review: Iterable of word tokens for a single review.
        model: Unused; kept so existing calls keep working. Word membership
            is decided by `word_idx` alone, which avoids rebuilding a set of
            the whole vocabulary on every call.
        word_idx: Mapping from word to cluster index. Words that are not keys
            of this mapping are ignored.
        n_clusters: Length of the returned vector. Defaults to the
            notebook-level `num_clusters` when omitted.

    Returns:
        numpy.ndarray: float32 vector of length `n_clusters`; entry k is the
        number of words in `review` that are assigned to cluster k.
    """
    if n_clusters is None:
        n_clusters = num_clusters  # notebook-level default
    review_vec = np.zeros((n_clusters), dtype='float32')
    for w in review:
        cluster = word_idx.get(w)
        # Compare against None explicitly: cluster 0 is a valid index.
        if cluster is not None:
            review_vec[cluster] += 1
    return review_vec

In [57]:
def Review_Class_Vecs(reviews,model, word_idx):
    """Stack the per-review cluster-count vectors into one matrix.

    Args:
        reviews: Indexable collection of tokenised reviews.
        model: Passed through to `Review_Class_Vec`.
        word_idx: Mapping from word to cluster index, passed through as well.

    Returns:
        numpy.ndarray: float32 array of shape (len(reviews), num_clusters),
        where `num_clusters` is the notebook-level cluster count.
    """
    n_reviews = len(reviews)
    counts = np.zeros((n_reviews, num_clusters), dtype='float32')
    for row in range(n_reviews):
        counts[row] = Review_Class_Vec(reviews[row], model, word_idx)
    return counts

In [58]:
# Bag-of-clusters features: one row per review, one column per word cluster.
train_class_vecs = Review_Class_Vecs(
    clean_train_reviews,
    model=model,
    word_idx=word_idx,
)
test_class_vecs = Review_Class_Vecs(
    clean_test_reviews,
    model=model,
    word_idx=word_idx,
)

### 使用随机森林(、极端随机森林、梯度提升)预测

In [63]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
# Alternative classifier that was tried on the same features:
#forest = RandomForestClassifier(n_estimators = 100)
# Gradient boosting with 100 stages on the bag-of-clusters features. The
# variable keeps the name `forest` although it holds a boosting model.
# random_state is fixed so that re-running the notebook reproduces the same
# model and therefore the same submission file.
forest = GradientBoostingClassifier(n_estimators=100, random_state=0)
forest = forest.fit(train_class_vecs, train_data['sentiment'])
# Predicted sentiment label for every test review.
result = forest.predict(test_class_vecs)

### 创建提交文件

In [64]:
# Pair each test review id with its predicted sentiment and write the table to CSV.
# index=False leaves out the DataFrame index; quoting=3 is csv.QUOTE_NONE.
output = pd.DataFrame(
    {
        "id": test_data["id"],
        "sentiment": result,
    }
)
# File name used for the random-forest variant of this model:
#output.to_csv( "data/result/Word2Vec_Kmeans_model.csv", index=False, quoting=3 )
output.to_csv(
    "data/result/Word2Vec_Kmeans_GBC_model.csv",
    index=False,
    quoting=3,
)