In [1]:
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
import jieba
from gensim.models import word2vec
import logging



In [2]:
# Load the review data
def get_data(path='评论数据.xls'):
    """Read the review workbook and keep only the columns used downstream.

    Args:
        path: Location of the Excel workbook. Defaults to '评论数据.xls'
            (resolved against the current working directory), which is the
            file name this function previously hard-coded.

    Returns:
        A pandas DataFrame holding all rows of the 'hotelid' and 'content'
        columns, in that order.
    """
    comment_data = pd.read_excel(path)
    return comment_data.loc[:, ['hotelid', 'content']]

In [3]:
# Build the Chinese stop-word list (returned as a list)
def get_stop_words(path="chinese_stop_words.txt", encoding=None):
    """Load the stop words, one per line, from a text file.

    Args:
        path: Stop-word file to read. Defaults to "chinese_stop_words.txt",
            the file name this function previously hard-coded.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what the original call relied on;
            pass e.g. 'utf-8' to make the read independent of the machine.

    Returns:
        A list with one entry per line of the file, in file order, with the
        line terminator removed. Blank lines yield empty strings.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r', encoding=encoding) as stop_file:
        # Strip only the newline: slicing off the last character would
        # truncate a final line that has no trailing newline.
        return [line.rstrip('\n') for line in stop_file]

In [4]:
# Tokenise every comment; the result is returned as a list of token lists
def segment_word(comment_data, chinese_stop_words):
    """Split each comment into tokens with jieba and drop unwanted tokens.

    Args:
        comment_data: Iterable of comments. Each item is passed through
            ``str()`` before segmentation.
            NOTE(review): non-string items (for example missing values) are
            therefore stringified rather than skipped; confirm this is intended.
        chinese_stop_words: Iterable of tokens to discard.

    Returns:
        A list with one entry per comment, in input order. Each entry is the
        list of tokens produced by ``jieba.cut`` that are neither stop words
        nor a tab character.
    """
    # A set gives constant-time membership tests; the original scanned the
    # stop-word list once per token. The tab token is dropped too, so it is
    # folded into the same set.
    excluded = set(chinese_stop_words)
    excluded.add('\t')
    return [
        [token for token in jieba.cut(str(comment)) if token not in excluded]
        for comment in comment_data
    ]

In [5]:
# Build the list of words related to one word (positive and negative queries)
def related_list(model, word, topn=20):
    """Tabulate the nearest neighbours of ``word`` for both query directions.

    Args:
        model: A trained gensim Word2Vec model, or any object exposing
            ``most_similar`` directly (such as a KeyedVectors instance).
        word: The query word; it is used once as the only positive example
            and once as the only negative example.
        topn: Number of results requested from each query. Defaults to 20,
            the value previously hard-coded.

    Returns:
        A pandas DataFrame with columns '原词' (the query word, repeated),
        '词' and '相似度'. The positive-query rows come first, followed by the
        negative-query rows; each part keeps its own 0-based index.
    """
    # Calling most_similar on the model itself is deprecated in gensim 3.x
    # and removed in gensim 4; the method lives on the `wv` attribute.
    # Fall back to the object itself when it has no `wv`.
    vectors = getattr(model, 'wv', model)
    neighbours = (
        vectors.most_similar(positive=[word], topn=topn),
        vectors.most_similar(negative=[word], topn=topn),
    )
    df = pd.concat(
        [pd.DataFrame(items, columns=['词', '相似度']) for items in neighbours]
    )
    # Prepend the query word as a constant first column.
    df.insert(0, '原词', word)
    return df

In [6]:
# Build the related-word table for the hotel review keywords
def related_list_all(model, key_words=None):
    """Concatenate the ``related_list`` tables of several keywords.

    Args:
        model: Passed through unchanged to ``related_list``.
        key_words: Iterable of query words. ``None`` selects the eight
            built-in hotel review keywords this function previously
            hard-coded.

    Returns:
        A pandas DataFrame with columns '原词', '词' and '相似度' holding the
        per-keyword tables stacked in keyword order. It is empty, with the
        same columns, when ``key_words`` is empty.
    """
    if key_words is None:
        key_words = ['服务','房间','环境','位置','交通','性价比','早餐','价格']
    # Collect the pieces and concatenate once. Growing a frame inside the
    # loop re-copies all earlier rows on every step, and seeding it with an
    # empty object-dtype frame can degrade the dtype of the similarity column.
    frames = [related_list(model, word) for word in key_words]
    if not frames:
        # pd.concat rejects an empty list, so build the empty table directly.
        return pd.DataFrame(columns=['原词','词', '相似度'])
    return pd.concat(frames)

In [7]:
# Main program: load the reviews, tokenise them and train the Word2Vec model.
# Send log records of level INFO and above to the output, prefixed with a
# timestamp and the level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
comment_data=get_data()
chinese_stop_words=get_stop_words()
# Only the 'content' column of the loaded frame is tokenised.
sentences=segment_word(comment_data.content,chinese_stop_words)
# NOTE(review): `size` is the keyword used by gensim releases before 4.0; newer
# releases are understood to call it `vector_size` — confirm against the
# installed version before re-running.
model = word2vec.Word2Vec(sentences, size=2048, min_count=10) # window is left at its default of 5

Building prefix dict from the default dictionary ...
2018-11-16 11:06:51,406 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
2018-11-16 11:06:51,409 : DEBUG : Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.723 seconds.
2018-11-16 11:06:52,131 : DEBUG : Loading model cost 0.723 seconds.
Prefix dict has been built succesfully.
2018-11-16 11:06:52,133 : DEBUG : Prefix dict has been built succesfully.
2018-11-16 11:07:03,898 : INFO : collecting all words and their counts
2018-11-16 11:07:03,899 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-16 11:07:03,920 : INFO : PROGRESS: at sentence #10000, processed 105225 words, keeping 10092 word types
2018-11-16 11:07:03,944 : INFO : collected 16423 word types from a corpus of 213741 raw words and 17909 sentences
2018-11-16 11:07:03,945 : INFO : Loading a fresh vocabulary
2018-1

In [9]:
# Build the related-word table for all keywords and export it as CSV.
df=related_list_all(model)
# The 'utf_8_sig' codec prefixes the file with a UTF-8 byte-order mark;
# presumably so spreadsheet software detects the encoding — confirm.
df.to_csv('所有酒店相似度分析.csv',encoding="utf_8_sig")

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [8]:
# Find the word that does not fit with the others.
# `doesnt_match` is provided by the model's word vectors (`model.wv`); calling
# it on the model object itself is deprecated in gensim 3.x and removed in
# gensim 4.
y2 = model.wv.doesnt_match(u"酒店 服务 很".split())
y2

  
2018-11-16 11:07:06,851 : INFO : precomputing L2-norms of word weight vectors


'服务'

In [10]:
# 保存模型，以便重用
#model.save(u"酒店评论.model")
# 对应的加载方式
#model = word2vec.Word2Vec.load("酒店评论.model")