In [1]:
import os
import re

import jieba
import numpy as np
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Line, Timeline, Bar, Scatter, WordCloud
from pyecharts.commons.utils import JsCode
from pyecharts.globals import SymbolType

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Directory holding the input files. The default is the original author's
# location; set the TAP_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get(
    'TAP_DATA_DIR',
    r"/Users/yangchaoran/Desktop/Now learnin'/2024-2025 Sp/文本挖掘/Final/源代码及数据/data",
)
# Cleaned review table (CSV).
cleaned_data_path = os.path.join(DATA_DIR, 'tap_reviews cleaned.csv')
# Stop-word list, one word per line.
stop_word_path = os.path.join(DATA_DIR, 'stop_words.txt')

In [3]:
# Load the cleaned review table; the 'utf_8_sig' codec also accepts a file that starts with a BOM.
data = pd.read_csv(filepath_or_buffer=cleaned_data_path, encoding='utf_8_sig')

In [4]:
# Trend of player ratings: mean score and number of ratings per `updated_time` value.
table = pd.pivot_table(data, index=['updated_time'], values=['score'], aggfunc=['mean', 'count'])
# NOTE: the rename below relies on the pivot returning its columns in the
# order (mean, count), i.e. the order of the aggfunc list.
table.columns = ['score_mean', 'score_count']
table['score_mean'] = np.round(table['score_mean'], 2)

# Parse the existing labels into timestamps instead of overwriting the index
# with a generated daily range: a generated range only fits when every single
# day between the first and last label is present (otherwise the assignment
# fails with a length mismatch) and when the labels already sort
# chronologically as strings.
table.index = pd.to_datetime(table.index.astype(str))
table = table.sort_index()

In [5]:
# Rating trend line charts: one timeline page per metric in `data_type`.
data_type = {'Average score': 'score_mean', 'Scoring frequency': 'score_count'}
# JavaScript snippet evaluated by ECharts: vertical gradient used to fill the area under the line.
area_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#FFA07A'}, {offset: 1, color: '#8a2be2'}], false)"
)

# NOTE(review): width='400' carries no CSS unit - confirm whether '400px' was intended.
time_line = Timeline(init_opts=opts.InitOpts(theme='light', width='400'))
time_line.add_schema(is_auto_play=True, play_interval=10000)

# pyecharts serialises the chart options to JSON and expects native Python
# containers/scalars, so pandas objects are converted with .tolist() before
# being handed over (numpy integers in particular are not JSON types).
x_values = table.index.tolist()

for key_, value_ in data_type.items():
    line = (
        Line(init_opts=opts.InitOpts(theme='light'))
        .add_xaxis(x_values)
        .add_yaxis(
            'Player rating', table[value_].tolist(), is_smooth=True,
            areastyle_opts=opts.AreaStyleOpts(color=JsCode(area_color_js), opacity=1),
            markline_opts=opts.MarkLineOpts(
                data=[opts.MarkLineItem(type_='average')],
                linestyle_opts=opts.LineStyleOpts(color='white', type_='dashed')))
        .set_series_opts(
            label_opts=opts.LabelOpts(is_show=False),
            markpoint_opts=opts.MarkPointOpts(data=[
                opts.MarkPointItem(type_='max', name='The highest score'),
                opts.MarkPointItem(type_='min', name='The lowest score')]))
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title='taptap scoring trend', subtitle='2021-09-15:00 to 2023-03-18:00',
                title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=16)),
            xaxis_opts=opts.AxisOpts(type_='time'),
            # NOTE(review): the y axis starts at 8 on both pages, including the
            # 'Scoring frequency' page - confirm that this is intended for counts.
            yaxis_opts=opts.AxisOpts(type_='value', min_=8),
            tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'))
    )
    time_line.add(line, key_)

time_line.render_notebook()

In [6]:
# Top 15 devices by number of reviews - the devices used by the most active
# players, which deserve the most attention.

# A review with score < 8 is flagged as negative (1), everything else as 0.
data['negative'] = (data['score'] < 8).astype(int)

# Aggregations are given by name: passing np.sum makes recent pandas versions
# emit a FutureWarning and the string form computes the same sum.
table = pd.pivot_table(data, index=['device'], values=['score', 'negative', 'spent'], aggfunc={
    'score': 'count', 'negative': 'sum', 'spent': 'mean'}, fill_value=0)
# NOTE: the rename relies on the pivot returning the value columns in
# alphabetical order: negative, score, spent.
table.columns = ['negative_count', 'device_count', 'spent_mean']
# .copy() so that the column assignments below act on an independent frame
# rather than on a slice of `table`.
sub_table1 = table.sort_values('device_count', ascending=False).head(15).copy()
sub_table1['positive_count'] = sub_table1['device_count'] - sub_table1['negative_count']
# presumably `spent` is recorded in minutes and this converts it to hours - TODO confirm
sub_table1['spent_mean'] /= 60

  table = pd.pivot_table(data, index=['device'], values=['score', 'negative', 'spent'], aggfunc={


In [15]:
# Stacked bar chart of review counts per device, overlaid with a line of the mean play time.

# Extra y axis on the right: review counts, used by both bar series (yaxis_index=1).
review_count_axis = opts.AxisOpts(
    name='Number of reviews', type_='value', min_=0, max_=250, position='right',
    axislabel_opts=opts.LabelOpts(formatter='{value} 次'))
# Extra y axis on the left: play time, used by the line series (yaxis_index=2).
play_time_axis = opts.AxisOpts(
    type_='value', position='left',
    axislabel_opts=opts.LabelOpts(formatter='{value} 小时'),
    splitline_opts=opts.SplitLineOpts(is_show=True, linestyle_opts=opts.LineStyleOpts(opacity=1)))

bar = Bar(init_opts=opts.InitOpts(theme='light'))
bar.add_xaxis(sub_table1.index.tolist())
bar.add_yaxis(
    'Positive review', sub_table1['positive_count'].tolist(),
    stack='stack1', category_gap='40%', yaxis_index=1, itemstyle_opts={'color': '#6A5ACD'})
bar.add_yaxis(
    'Negative review', sub_table1['negative_count'].tolist(),
    stack='stack1', category_gap='40%', yaxis_index=1, itemstyle_opts={'color': '#FF69B4'})
bar.extend_axis(yaxis=review_count_axis)
bar.extend_axis(yaxis=play_time_axis)
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='TOP15 ranking devices', subtitle='2021-09-15:00 to 2023-03-18:00'),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=25, font_size=11)),
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'))

line2 = Line(init_opts=opts.InitOpts(theme='light'))
line2.add_xaxis(sub_table1.index.tolist())
line2.add_yaxis(
    'Single play duration', np.round(sub_table1['spent_mean'].tolist(), 2),
    yaxis_index=2, linestyle_opts=opts.LineStyleOpts(color='#FF1493'))

bar.overlap(line2).render_notebook()

In [17]:
# Per-device rating statistics, then the ten lowest-rated devices among those
# with at least 20 reviews, finally ordered by review count.
table = pd.pivot_table(data, index=['device'], values=['score'], aggfunc={'score': ['mean', 'count']})
# NOTE: the rename assumes the pivot returns the columns as (count, mean).
table.columns = ['device_count', 'score_mean']
sub_table2 = (
    table
    .sort_values('score_mean')
    .loc[lambda frame: frame['device_count'] >= 20]
    .head(10)
    .sort_values('device_count')
)

In [18]:
# Scatter plot of the low-score devices selected in `sub_table2`.

# pyecharts serialises the chart options to JSON and expects native Python
# values, so every pandas/numpy value is converted with .tolist() first.
review_counts = sub_table2['device_count'].tolist()
point_values = [
    [score, device, count]
    for score, device, count in zip(
        sub_table2['score_mean'].round(2).tolist(),
        sub_table2.index.tolist(),
        review_counts)
]

scatter = (
    Scatter(init_opts=opts.InitOpts(theme='light'))
    .add_xaxis(sub_table2.index.tolist())
    .add_yaxis('Player rating',
               point_values,
               # presumably pyecharts prepends the x value to each point, making
               # value[1] the mean score and value[3] the review count - verify
               label_opts=opts.LabelOpts(formatter=JsCode("function(params){return params.value[3]+'人：'+params.value[1];}")),
               symbol_size=20)
    .set_global_opts(
        # The subtitle previously contained the HTML entity '&gt;', which would be
        # drawn literally in the chart; it now reads '>='.
        title_opts=opts.TitleOpts(
            title='TOP10 devices with low scores',
            subtitle='2021-09-15:00 to 2023-03-18:00, comments only >=20 devices'),
        yaxis_opts=opts.AxisOpts(type_='value', min_=6.5),
        # Colour scale spans the observed review counts; the defaults 100/0 are
        # pyecharts' own VisualMapOpts defaults, used only when no device qualifies.
        visualmap_opts=opts.VisualMapOpts(
            type_='color',
            max_=max(review_counts, default=100),
            min_=min(review_counts, default=0)),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20, font_size=11)),
        datazoom_opts=opts.DataZoomOpts())
)
scatter.render_notebook()

In [10]:
"""
词云图（TOP500支持度和热度达到0.5的评论）——玩家最关注点和对游戏的直观印象

需要做以下几步（含一些文本挖掘的预处理程序）：
1.将文本合并成一个字符串 2.去除非中文字符 3.进行中文分词  4.去除中文停用词  5.统计词频  6.用pyechart绘图
"""
# Remove non-Chinese characters
def drop_non_chinese(text):
    """
    Delete every character that lies outside the range U+4E00..U+9FA5.

    :param text: str, text that may contain non-Chinese characters
    :return: str, the characters of ``text`` inside that range, in their original order
    """
    # The leading ^ negates the character class, so the pattern matches
    # exactly the characters that have to be removed.
    non_chinese = re.compile(r'[^\u4e00-\u9fa5]')
    return non_chinese.sub('', text)

# Chinese word segmentation
def split_text(chinese_text, cut_all=False):
    """
    Segment ``chinese_text`` with jieba and join the tokens with single spaces.

    :param chinese_text: str, Chinese text
    :param cut_all: forwarded to ``jieba.cut``; whether to use full-mode segmentation
    :return: str, the tokens separated by spaces
    """
    return ' '.join(jieba.cut(chinese_text, cut_all=cut_all))

# Remove Chinese stop words
def drop_stop_words(st_path, chinese_text, encoding='gbk'):
    """
    Split ``chinese_text`` on whitespace and drop every token that is a stop word.

    :param st_path: path of the stop-word file, one stop word per line
    :param chinese_text: str, whitespace-separated tokens
    :param encoding: encoding of the stop-word file (default ``'gbk'``)
    :return: list of str, the remaining tokens in their original order
    """
    # Load the stop words into a set so that each token needs one lookup,
    # instead of rebuilding the token list once per stop word.
    with open(st_path, 'r', encoding=encoding) as f:
        # Tokens produced by split() never contain whitespace, so stray
        # whitespace around an entry would keep it from ever matching.
        stop_words = {line.strip() for line in f.read().splitlines()}
    stop_words.discard('')

    return [word for word in chinese_text.split() if word not in stop_words]

# Count word frequencies
def caculate_words(word_list, threshold=50):
    """
    Count how often each word occurs and keep the frequent ones.

    :param word_list: iterable of str, the tokens to count
    :param threshold: minimum number of occurrences for a word to be kept
    :return: list of (word, count) tuples, ordered by first occurrence of the word
    """
    frequencies = {}
    for token in word_list:
        frequencies[token] = frequencies.get(token, 0) + 1

    return [(token, hits) for token, hits in frequencies.items() if hits >= threshold]

In [11]:
# Select the reviews to analyse and merge them into one string.
data = data.sort_values('net_support', ascending=False)
if 'index' in data.columns:
    # The cell has already run once: the previous row labels are stored in the
    # 'index' column, and inserting them again would raise a ValueError.
    data = data.reset_index(drop=True)
else:
    # First run: keep the previous row labels as an 'index' column.
    data = data.reset_index()
# Keep the first 500 rows of the net_support ranking plus every review with heat >= 0.5.
sub_contents = data['contents'][(data.index<500)|(data['heat']>=0.5)]
text = ''.join(sub_contents)
print('文本长度:', len(text))

文本长度: 133202


In [12]:
# Text pipeline: strip non-Chinese characters, then segment the result into space-separated words
text = split_text(drop_non_chinese(text), cut_all=False)
# Remove the stop words listed in the stop-word file
word_list = drop_stop_words(stop_word_path, text)
# Keep the words that occur at least 50 times, as (word, count) pairs
keyword_list = caculate_words(word_list, threshold=50)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/9p/6kysksz51c5dp1tvm7_wk8xm0000gn/T/jieba.cache
Loading model cost 0.646 seconds.
Prefix dict has been built successfully.


In [13]:
# Word cloud of the frequent keywords collected in `keyword_list`.
cloud = WordCloud()
cloud.add('', keyword_list, word_size_range=[20, 100])
cloud.render_notebook()