In [3]:
import jieba
from jieba import analyse
import re

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
from wordcloud import WordCloud

plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False
warnings.filterwarnings("ignore")


data = pd.read_csv("ChnSentiCorp_htl_all.csv")
data["review"] = data["review"].astype('str')


strinfo1 = re.compile('[0-9]|酒店|携程|年月日|北京|上海|重庆|广州|杭州|南京|成都|苏州|西安|东莞|长沙|济南|深圳|西路|东路')
data["review"] = data["review"].apply(lambda x: strinfo1.sub('', x))
# 由于有的时候jupyternotebook会出bug,去除一次并不能去除掉年月日，所以保险起见，再去除一次
strinfo2 = re.compile('[0-9]|酒店|携程|年月日')
data["review"] = data["review"].apply(lambda x: strinfo2.sub('', x))
# 第一步 将空字符的行替换为nan，方便进行删除
data.replace(to_replace=r'^\s*$', value=np.nan, regex=True, inplace=True)
data.replace(to_replace=r'[a-zA-Z]', value=np.nan, regex=True, inplace=True)
# 第二步 删除所有值为nan的行
data.dropna(axis=0, how='any', inplace=True)

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF  # 原始文本转化为tf-idf的特征矩阵
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
 
# 将有标签的数据集划分成训练集和测试集
train_X,valid_X,train_y,valid_y = train_test_split(data['review'],data['label'],test_size=0.3,random_state=42)
 
train_X.shape,train_y.shape,valid_X.shape,valid_y.shape


((4403,), (4403,), (1887,), (1887,))

((4403,), (4403,), (1887,), (1887,))

In [16]:
# 模型构建
model_tfidf = TFIDF(min_df=5, max_features=5000, ngram_range=(1,3), use_idf=1, smooth_idf=1)
# 学习idf vector
model_tfidf.fit(train_X)
# 把文档转换成 X矩阵（该文档中该特征词出现的频次），行是文档个数，列是特征词的个数
train_vec = model_tfidf.transform(train_X)
 
# 模型训练
model_SVC = LinearSVC()
clf = CalibratedClassifierCV(model_SVC)
clf.fit(train_vec,train_y)

CalibratedClassifierCV(base_estimator=LinearSVC())

CalibratedClassifierCV(base_estimator=LinearSVC())

In [17]:
# 把文档转换成矩阵
valid_vec = model_tfidf.transform(valid_X)
# 验证
pre_valid = clf.predict_proba(valid_vec)
 
pre_valid = clf.predict(valid_vec)
print('正例:',sum(pre_valid == 1))
print('负例:',sum(pre_valid == 0))

正例: 1686
负例: 201


正例: 1686
负例: 201

In [18]:
from sklearn.metrics import accuracy_score
 
score = accuracy_score(pre_valid,valid_y)
print("准确率:",score)

准确率: 0.7270800211976682


准确率: 0.7270800211976682

In [19]:
def get_balanced_words(size,
                       positive_comment=data[data['label'] == 1],
                       negtive_comment=data[data['label'] == 0]):
    word_size = size // 2
    #获取正负评论数
    num_pos = positive_comment.shape[0]
    num_neg = negtive_comment.shape[0]
    #     当 正(负)品论数中<采样数量/2 时，进行上采样，否则都是下采样；
    #     其中pandas的sample方法里的repalce参数代表是否进行上采样，默认不进行
    balanced_words = pd.concat([
        positive_comment.sample(word_size,
                                replace=num_pos < word_size,
                                random_state=0),
        negtive_comment.sample(word_size,
                               replace=num_neg < word_size,
                               random_state=0)
    ])
    #     打印样本个数
    print('样本总数：', balanced_words.shape[0])
    print('正样本数：', balanced_words[data['label'] == 1].shape[0])
    print('负样本数：', balanced_words[data['label'] == 0].shape[0])
    print('')
    return balanced_words
 
data_4888 = get_balanced_words(4888)

样本总数： 4888
正样本数： 2444
负样本数： 2444



样本总数： 4888
正样本数： 2444
负样本数： 2444

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF  # 原始文本转化为tf-idf的特征矩阵
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
 
# 将有标签的数据集划分成训练集和测试集
train_X,valid_X,train_y,valid_y = train_test_split(data_4888['review'],data_4888['label'],test_size=0.3,random_state=23)
 
train_X.shape,train_y.shape,valid_X.shape,valid_y.shape

((3421,), (3421,), (1467,), (1467,))

In [21]:
# 模型构建
model_tfidf = TFIDF(min_df=2, max_features=5000, ngram_range=(1,3), use_idf=1, smooth_idf=1)
# 学习idf vector
model_tfidf.fit(train_X)
# 把文档转换成 X矩阵（该文档中该特征词出现的频次），行是文档个数，列是特征词的个数
train_vec = model_tfidf.transform(train_X)
 
# 模型训练
model_SVC = LinearSVC()
clf = CalibratedClassifierCV(model_SVC)
clf.fit(train_vec,train_y)

CalibratedClassifierCV(base_estimator=LinearSVC())

In [22]:
# 把文档转换成矩阵
valid_vec = model_tfidf.transform(valid_X)
# 验证
pre_valid = clf.predict_proba(valid_vec)
 
pre_valid = clf.predict(valid_vec)
print('正例:',sum(pre_valid == 1))
print('负例:',sum(pre_valid == 0))

正例: 442
负例: 1025


In [23]:
from sklearn.metrics import accuracy_score
 
score = accuracy_score(pre_valid,valid_y)
print("准确率:",score)

准确率: 0.7034764826175869


In [24]:
posdata=pd.DataFrame(posdata).dropna(axis=0)
posdata.columns=['comment']
 
negdata=pd.DataFrame(negdata).dropna(axis=0)
negdata.columns=['comment']

In [25]:
#利用jieba中文分词 
import jieba
import jieba.posseg as psg
 
#格式转换 否则会报错  'float' object has no attribute 'decode'
df1 = pd.DataFrame(posdata.astype(str))
 
def chinese_word_cut(mytext):
    return ' '.join(jieba.cut(mytext))
 
#增加一列数据
df1['content_cutted'] = df1['comment'].apply(chinese_word_cut)
 
 
 
#格对负面评论进行操作
df2 = pd.DataFrame(negdata.astype(str))
 
def chinese_word_cut(mytext):
    return ' '.join(jieba.cut(mytext))
 
#增加一列数据
df2['content_cutted'] = df2['comment'].apply(chinese_word_cut)

In [30]:
path = '停用词汇总.txt'
f = open(path,"r",encoding='utf-8').read()
stopwords=list(f)
 
#计算TF-IDF值
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 
#设置特征数
n_features = 2000
 
 
tf_vectorizer = TfidfVectorizer(strip_accents = 'unicode',
                                max_features=n_features,
                                stop_words=stopwords,
                                max_df = 0.99,
                                min_df = 0.002) #去除文档内出现几率过大或过小的词汇
tf = tf_vectorizer.fit_transform(df1.content_cutted)
 
print(tf.shape)
print(tf)

(4363, 1733)
  (0, 856)	0.09703003014893513
  (0, 1543)	0.3985651756062984
  (0, 365)	0.32450684748810255
  (0, 764)	0.25345356658225754
  (0, 1727)	0.3427323888129733
  (0, 1679)	0.1652275888721542
  (0, 1260)	0.24891737206353892
  (0, 649)	0.21193595767880338
  (0, 307)	0.3642886289248343
  (0, 247)	0.18253393657321682
  (0, 1557)	0.4029461420428512
  (0, 1538)	0.28662654844416097
  (1, 1048)	0.35319880925162583
  (1, 1654)	0.22706245851629792
  (1, 1578)	0.19891947797046905
  (1, 0)	0.23684689431833822
  (1, 1643)	0.3737215387598423
  (1, 759)	0.2191817410544371
  (1, 1702)	0.37737126750520106
  (1, 1627)	0.34207461237256465
  (1, 585)	0.3400901572538183
  (1, 958)	0.3702931455921666
  (1, 967)	0.13964728701852588
  (1, 856)	0.09282025314218026
  (2, 1661)	0.12784085889200988
  :	:
  (4360, 346)	0.5824062189353332
  (4360, 117)	0.48164002191698785
  (4360, 1029)	0.19562288769755032
  (4360, 942)	0.24226650261756333
  (4360, 43)	0.3107913486305313
  (4360, 1109)	0.2295397978502093
  

In [41]:
#LDA分析

 
#设置主题数
n_topics = 3
 
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=100,
                                learning_method='online',
                                learning_offset=50,
                                random_state=0)
lda.fit(tf)
 
#显示主题数 model.topic_word_
# print(lda.components_)
#几个主题就是几行 多少个关键词就是几列 
print(lda.components_.shape)                         
def print_top_words(model, tf_feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):  # lda.component相当于model.topic_word_
        print('Topic #%d:' % topic_idx)
        print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print("")
# 定义好函数之后 暂定每个主题输出前20个关键词
n_top_words = 20
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)   
 


(3, 1733)
Topic #0:
我们 入住 客人 宾馆 希望 反馈 服务 他们 服务员 再次 前台 感谢您 时候 预定 问题 光临 没有 工作 一下 人员

Topic #1:
不错 房间 比较 服务 可以 方便 早餐 就是 感觉 设施 还是 环境 价格 非常 干净 没有 入住 有点 不过 一般

Topic #2:
床单 门前 花园 面对 为什么 良好 高速 西餐 一半 接送 出口 地理 超好 同等 绝佳 花园式 稍远 中心 冬天 人民



In [2]:
import pyLDAvis.gensim_models
import importlib
import pyLDAvis
importlib.reload(pyLDAvis)
import pyLDAvis.gensim_models

red_vis_data = pyLDAvis.gensim_model.prepare(lda,tf,tf_vectorizer)
pyLDAvis.display(red_vis_data)

AttributeError: module 'pyLDAvis' has no attribute 'gensim_model'