In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the comment export with every field read as text.
# dtype=str keeps the raw field text (no int/float round-trip), and
# keep_default_na=False stops pandas from converting empty fields to NaN,
# which a later astype(str) would render as the literal string 'nan'.
column_names = ['Votes', 'Useful', 'User', 'Watched', 'Score', 'Date', 'Comment']
data = pd.read_csv(
    './data/comments_clean.csv',
    header=None,
    names=column_names,
    skipinitialspace=True,
    quotechar='`',
    dtype=str,
    keep_default_na=False,
).fillna('')  # guard: guarantees string values even if a field still comes back missing

In [None]:
# Keep only rows whose Score is one of the five rating labels used later in
# this notebook. A length test is not portable here: len('力荐') is 6 only for
# UTF-8 byte strings, whereas it is 2 for Python 3 text, so `len == 6` would
# discard every row. Membership also excludes the non-rating value '看过'.
valid_scores = ['力荐', '推荐', '还行', '较差', '很差']
data = data[data['Score'].isin(valid_scores)]
# Dates are expected to be exactly 19 characters wide
# (presumably 'YYYY-MM-DD HH:MM:SS' -- TODO confirm against the export).
data = data[data['Date'].map(len) == 19]
print('rows:', data.shape[0], ', columns: ', data.shape[1])  # size of the cleaned dataset

In [None]:
# Preview the first rows of the filtered frame.
data.head()

In [None]:
# Number of comments per Score label, most frequent first.
data['Score'].value_counts()

In [None]:
# Number of comments per distinct value of the Watched column.
data['Watched'].value_counts()

In [None]:
# Number of comments per distinct value of the Useful column.
data['Useful'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Bar chart of how many comments fall under each rating.
# Counts are looked up by rating label, not by position: value_counts() sorts
# by frequency, so positional indices depend on the data, and integer lookups
# on a label-indexed Series are deprecated in pandas.
rating_labels = ['力荐', '推荐', '还行', '较差', '很差']  # ordered 5-star ... 1-star
index = np.arange(len(rating_labels))
score_counts = data['Score'].value_counts()
# fill_value=0 keeps the chart working when a rating has no comments at all.
values = tuple(score_counts.reindex(rating_labels, fill_value=0))
bar_width = 0.35
plt.figure(figsize=(20, 10))
# An explicit list is required: current Matplotlib rejects a multi-letter
# string such as 'rgbym' as a sequence of colours.
plt.bar(index, values, bar_width, alpha=0.6, color=['r', 'g', 'b', 'y', 'm'])
plt.xlabel('Score', fontsize=16)
plt.ylabel('Counts', fontsize=16)
plt.title('Comments Level', fontsize=18)
plt.xticks(index, ('5-star', '4-star', '3-star', '2-star', '1-star'), fontsize=14, rotation=20)
plt.ylim(0, 90000)  # fixed axis ceiling; bars above this value would be clipped
plt.grid()
# Write the exact count just above each bar.
for idx, value in zip(index, values):
    plt.text(idx, value + 0.1, '%d' % value, ha='center', va='bottom', fontsize=14, color='black')
plt.show()

In [None]:
import re
import jieba
def segment_words(stars):
    """Segment the comments of one rating level into a space-separated string.

    Reads from the module-level ``data`` frame.

    Args:
        stars: ``'all'`` to use every comment, otherwise a ``Score`` label
            (for example ``'力荐'``) selecting only comments with that rating.

    Returns:
        str: the tokens yielded by ``jieba.cut(text, cut_all=True)`` joined
        with single spaces.
    """
    if stars == 'all':
        comments = data['Comment']
    else:
        comments = data.loc[data['Score'] == stars, 'Comment']

    # Strips '1f...' codes and the characters < > / = (presumably residue of
    # HTML emoji markup). Compiled once, and written as a raw string so that
    # '\d' is a regex class rather than an invalid string escape.
    markup_residue = re.compile(r'1f\d+\w*|[<>/=]')

    cleaned_comments = []
    for comment in comments:
        comment = str(comment).strip().replace('span', '').replace('class', '').replace('emoji', '')
        comment = markup_residue.sub('', comment)
        if comment:
            cleaned_comments.append(comment)

    # Join with a space so the last characters of one comment cannot fuse with
    # the first characters of the next and be segmented as a single word.
    text = ' '.join(cleaned_comments)

    # NOTE: stop-word filtering is not applied to the segmented tokens.
    return ' '.join(jieba.cut(text, cut_all=True))

In [None]:
from wordcloud import WordCloud, ImageColorGenerator
import PIL.Image as Image
def plot_word_cloud(words, mask_path='./data/chinese.jpg',
                    font_path='./data/DroidSansFallbackFull.ttf'):
    """Render a word cloud shaped and coloured by a mask image.

    Args:
        words: whitespace-separated text handed to ``WordCloud.generate``.
        mask_path: image file used both as the cloud mask and as the colour
            source. Defaults to the path the notebook originally hard-coded.
        font_path: font file passed to ``WordCloud``. Defaults to the path the
            notebook originally hard-coded.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Context manager closes the image file once the pixels have been copied.
    with Image.open(mask_path) as mask_image:
        coloring = np.array(mask_image)

    wc = WordCloud(
        background_color='white',
        max_words=2000,
        mask=coloring,
        max_font_size=60,
        random_state=42,  # fixed seed so the layout is reproducible
        font_path=font_path,
        scale=2,
    ).generate(words)

    image_color = ImageColorGenerator(coloring)
    plt.figure(figsize=(32, 16))
    # The recoloured cloud is the final image, so it is drawn exactly once.
    plt.imshow(wc.recolor(color_func=image_color))
    plt.axis('off')
    plt.show()

In [None]:
# Word cloud built from every comment, regardless of rating.
all_words = segment_words('all')
plot_word_cloud(all_words)

In [None]:
# Word cloud for comments whose Score is '力荐' (five-star, going by the variable name).
five_start_words = segment_words('力荐')
plot_word_cloud(five_start_words)

In [None]:
# Word cloud for comments whose Score is '推荐' (four-star, going by the variable name).
four_start_words = segment_words('推荐')
plot_word_cloud(four_start_words)

In [None]:
# Word cloud for comments whose Score is '还行' (three-star, going by the variable name).
three_start_words = segment_words('还行')
plot_word_cloud(three_start_words)

In [None]:
# Word cloud for comments whose Score is '较差' (two-star, going by the variable name).
two_start_words = segment_words('较差')
plot_word_cloud(two_start_words)

In [None]:
# Word cloud for comments whose Score is '很差' (one-star, going by the variable name).
one_start_words = segment_words('很差')
plot_word_cloud(one_start_words)