#### 生成虚假评论

In [5]:
import random


WINDOW_SIZE = 10
PUNCTUATION_MARK = ['x']  # 标点
PUNCTUATION = ['。', '！', '？', '，', '～']
NOUN_MARK = ['n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz']  # 名词
VERB_MARK = ['v', 'vd', 'vg', 'vi', 'vn', 'vq']  # 动词
ADJECTIVE_MARK = ['a', 'ad', 'an', 'ag']  # 形容词
ADVERB_MARK = ['d', 'df', 'dg']  # 副词
ENG_MARK = ['eng']

EMOJI = ['😀', '😁', '😂', '😃', '😄', '😆','😉', '😊',
         '😋', '😎', '😍', '😘', '😗', '😙', '😚', '😇',
         '😏', '😝']

YANWENZI = ['ヽ(✿ﾟ▽ﾟ)ノ', 'φ(≧ω≦*)♪', '╰(*°▽°*)╯', 'o(￣▽￣)ｄ', 'o( =•ω•= )m']

ILLEGAL_WORD = ['考拉', '网易']  # '不过', '因为', '而且', '但是', '但', '所以', '因此', '如果',


RESERVED_MARK = NOUN_MARK + VERB_MARK + ADJECTIVE_MARK + ADVERB_MARK + ENG_MARK # 用于发现新词
ASPECT_MARK = NOUN_MARK + VERB_MARK


def text2seg_pos(seg_pos_text, pattern='[。！？]'):
    """
    经过分词的文档，原始一条用户评论通过指定的标点符号分成多个句子
    """
    seg_list = []  # 保存全部按标点切分的seg
    pos_list = []  # 保存全部按标点切分的pos
    seg_review_list = []  # 用户完整的一条评论
    for line in seg_pos_text:
        line = line.strip()
        line = line.split(' ')
        seg_sub_list = []
        pos_sub_list = []
        cur_review = []
        for term in line:
            word, flag = term.split('/')
            cur_review.append(word)
            if word in pattern:
                seg_sub_list.append(word)
                pos_sub_list.append(flag)
                seg_list.append(list(seg_sub_list))
                pos_list.append(list(pos_sub_list))
                seg_sub_list = []
                pos_sub_list = []
            else:
                seg_sub_list.append(word)
                pos_sub_list.append(flag)
        seg_review_list.append(list(cur_review))
    return seg_list, pos_list, seg_review_list


def get_candidate_aspect(seg_list, pos_list, adj_word, stop_word, word_idf):
    """
    输入的数据为用逗号隔开的短句，
    利用开窗口的方式，根据情感词典抽名词得到候选的aspect
    """
    print("利用情感词典抽取候选aspect...")
    aspect_dict = {}
    print(adj_word[:100])
    for i, sentence in enumerate(seg_list):
        for j, word in enumerate(sentence):
            if word in adj_word:
                #print(1)
                if pos_list[i][j] in ADJECTIVE_MARK:  # 当前的词属于情感词且词性为形容词
                    #print(2)
                    startpoint = j - WINDOW_SIZE
                    startpoint = startpoint if startpoint >= 0 else 0
                    for k in range(startpoint, j):
                        if pos_list[i][k] in ASPECT_MARK:
                            #print(3)
                            if seg_list[i][k] in aspect_dict:
                                aspect_dict[seg_list[i][k]] += 1
                            else:
                                aspect_dict[seg_list[i][k]] = 1

    temp = aspect_dict.items()
    #print(temp)
    temp = list(filter(lambda x: len(x[0]) > 1, temp))  # 经过词组发现之后，删去一个字的词
    temp = [item[0] for item in temp if item[0] not in stop_word]  # 删去停用词
    #print(temp)
    temp = [item for item in temp if word_idf[item] != 0]  # 删去IDF值为0的词
    #print(2,temp)
    aspect_list = temp
    print("---aspect抽取完成，共抽取到%s个候选aspect---" % (len(aspect_list)))
    return aspect_list


class NSDict:
    """
    用来构建候选集（aspect，opinion，pattern）
    """
    def __init__(self, seg_list, pos_list, raw_aspect_list):
        self.seg_list = seg_list
        self.pos_list = pos_list
        self.raw_aspect_list = raw_aspect_list
        self.ns_dict = {}
        self.aspect_do_not_use = []
        self.opinion_do_not_use = ["最", "不", "很"]
        self.pattern_do_not_use = ["的-", "和-", "和+", "而+", "而-", "又+", "又-", "而且+", "而且-"]

    def _seg2nsd(self, aspect_for_filter):
        for x, clue in enumerate(self.seg_list):
            N_list = []
            S_list = []
            word_list = clue
            for y, word in enumerate(clue):
                if word in aspect_for_filter:
                    N_list.append(y)
                elif self.pos_list[x][y] in ADJECTIVE_MARK:
                    S_list.append(y)
            if N_list and S_list:
                self._make_nsdict(word_list, N_list, S_list)

    def _make_nsdict(self, word_list, N_list, S_list):
        for n in N_list:
            for s in S_list:
                if (1 < n - s < WINDOW_SIZE + 1) or (1 < s - n < WINDOW_SIZE + 1):  # 窗口大小是5
                    if word_list[n] not in self.ns_dict:
                        self.ns_dict[word_list[n]] = {}
                    if word_list[s] not in self.ns_dict[word_list[n]]:
                        self.ns_dict[word_list[n]][word_list[s]] = {}
                    if n > s:
                        patt = ' '.join(word_list[s + 1: n]) + '+'
                    else:
                        patt = ' '.join(word_list[n + 1: s]) + '-'
                    if patt not in self.ns_dict[word_list[n]][word_list[s]]:
                        self.ns_dict[word_list[n]][word_list[s]][patt] = 0.
                    self.ns_dict[word_list[n]][word_list[s]][patt] += 1.

    def _noise_del(self):
        for aspect in self.aspect_do_not_use:
            self._noise(aspect, self.ns_dict)
        for n in self.ns_dict:
            for opinion in self.opinion_do_not_use:
                self._noise(opinion, self.ns_dict[n])
            for s in self.ns_dict[n]:
                for pattern in self.pattern_do_not_use:
                    self._noise(pattern,self.ns_dict[n][s])

    def _noise(self, str, dict):
        if str in dict:
            del dict[str]

    def build_nsdict(self):
        print("stage 1：抽取pair和pattern...")
        self._seg2nsd(self.raw_aspect_list)
        self._noise_del()
        print("\tDone")
        return self.ns_dict


class PairPattSort:
    '''
    Pair-Patt-Count structure
    '''
    def __init__(self, ns_dict):
        self._get_map(ns_dict)

    def _get_map(self, ns_dict):
        '''
        get map: [pair-patt], [patt-pair], [pair](score), [patt](score)

        :param ns_dict: Entity.str { Emotion.str { Pattern.str { Count.int (It's a three-level hash structure)
        :return:
        '''
        pair_list = []
        patt_dict = {}
        patt_pair_map = {}
        pair_patt_map = {}

        aspects = list(ns_dict.keys())
        aspects.sort()

        for n in aspects:
            for s in ns_dict[n]:
                n_s = "{}\t{}".format(n, s)   #这里存的pair是字符串，中间用\t隔开
                pair_list.append(n_s)
                pair_patt_map[n_s] = {}
                for patt in ns_dict[n][s]:
                    if patt not in patt_dict:
                        patt_dict[patt] = 1.0
                    pair_patt_map[n_s][patt] = ns_dict[n][s][patt]
                    if patt in patt_pair_map:
                        patt_pair_map[patt][n_s] = ns_dict[n][s][patt]
                    else:
                        patt_pair_map[patt] = {}
                        patt_pair_map[patt][n_s] = ns_dict[n][s][patt]
        self.patt_pair_map = patt_pair_map
        self.pair_patt_map = pair_patt_map
        self.pair_len = len(pair_list)
        self.patt_len = len(patt_dict)
        self.pair_score = dict([(word, 1.) for i, word in enumerate(pair_list)])
        self.patt_score = patt_dict

    """"正则化，和为score_len"""
    def _norm(self, score_dict, score_len):
        sum_score = 0.
        for s in score_dict:
            sum_score += score_dict[s]
        for s in score_dict:
            score_dict[s] = score_dict[s] / sum_score * score_len
        return score_dict

    def _patt_pair(self):
        for pair in self.pair_patt_map:  # <- 循环遍历每个pair
            value = 0.
            for patt in self.pair_patt_map[pair]:  # <- 每个pair中的pattern出现的个数 * 这个pattern的score，然后求和得到这个pair的分数
                value += self.pair_patt_map[pair][patt] * self.patt_score[patt]
            self.pair_score[pair] = value

    def _pair_patt(self):
        for patt in self.patt_pair_map:  # <- 遍历每个pattern
            value = 0.
            for pair in self.patt_pair_map[patt]:  # <- 每个被pattern修饰的pair出现的个数 * 这个pair的score，然后求和得到这个pattern1的
                value += self.patt_pair_map[patt][pair] * self.pair_score[pair]
            self.patt_score[patt] = value
    def _patt_correct(self):
        self.patt_score['的-'] = 0.0

    def _iterative(self):
        '''
        A complete iteration
        [pair] = [patt-pair] * [patt]
        [patt] = [pair-patt] * [pair]
        :return:
        '''
        self._patt_pair()
        self.pair_score = self._norm(self.pair_score, self.pair_len)
        self._pair_patt()
        self.patt_score = self._norm(self.patt_score, self.patt_len)

    def sort_pair(self):
        print("stage 2：组合排序...")
        for i in range(100):
            self._iterative()
        pair_score = sorted(self.pair_score.items(), key=lambda d: d[1], reverse=True)
        print('\tDone')
        print("---pair抽取完成---")
        return pair_score


def get_aspect_express(seg_review_list, pair_useful):
    """
    抽取原始评论中的aspect作为输入，完整的评论作为输出
    """

    def check_sentence(sentence):
        """
        判断短句是否合法
        """
        _s = ''.join(sentence)
        legal = True
        if len(_s) > 30:
            legal = False
        return legal

    raw_aspect_express = {k: [] for k in pair_useful}  # 用户关于某个观点的一段原始表达
    raw_aspect_express_count = {k: 0 for k in pair_useful}  # 记录某个观点表达出现的次数
    for review in seg_review_list:  # 每个sentence就是一句完整的review

        source = []  # 训练的src
        if review[-1] not in PUNCTUATION:
            review.append('。')
        target = review  # 训练的tgt

        # 对于单个review进行切分
        cur_review = []
        pre_end = 0
        for i, _ in enumerate(review):
            if review[i] in ['。', '！', '？', '，', '～']:
                cur_review.append(review[pre_end:i + 1])
                pre_end = i + 1
            elif i == len(review) - 1:
                cur_review.append(review[pre_end:])

        for sentence in cur_review:  # sentence 是两个标点之间的短句
            if sentence[-1] not in PUNCTUATION:
                sentence.append('。')
            find_opinion_flag = False
            for idx, word in enumerate(sentence):
                if find_opinion_flag:  # 如果在当前的短句中已经找到了一组观点表达就结束对这个短句的搜索
                    break
                if word in pair_useful:  # 当前的word属于aspect
                    # 向前开窗口
                    startpoint = idx - WINDOW_SIZE if idx - WINDOW_SIZE > 0 else 0
                    for i in range(startpoint, idx):  # 寻找opinion word
                        cur_word = sentence[i]
                        if cur_word in pair_useful[word] and sentence[i + 1] == "的":  # eg. 超赞的一款面膜
                            if check_sentence(sentence):
                                raw_aspect_express[word].append(sentence)
                                raw_aspect_express_count[word] += 1
                                find_opinion_flag = True  # 只要找到一个opinion word就算命中一个短句了

                    # 向后开窗口
                    endpoint = idx + WINDOW_SIZE if idx + WINDOW_SIZE < len(sentence) else len(sentence)
                    for i in range(idx + 1, endpoint):
                        cur_word = sentence[i]
                        if cur_word in pair_useful[word]:
                            if check_sentence(sentence):
                                raw_aspect_express[word].append(sentence)
                                raw_aspect_express_count[word] += 1
                                find_opinion_flag = True  # 只要找到一个opinion word就算命中一个短句了

    # 筛选得到保留的aspect
    aspect_express = {}
    for aspect in raw_aspect_express:
        if raw_aspect_express_count[aspect] < 5:
            continue
        aspect_express[aspect] = raw_aspect_express[aspect]

    return aspect_express


def merge_aspect_express(aspect_express, pair_useful):
    """
    对相似的观点表达进行合并, 同时输出最终的aspect_opinion_pair
    """
    aspects = list(aspect_express.keys())
    length = len(aspects)
    aspects.sort()  # 排成字典序
    merged_aspects = [[aspects[0]]]
    merged_express = {}
    opinion_set = []

    def check_is_same(word1, word2):
        """
        判断两个词当中是否存在相同的字
        """
        for i in word1:
            if i in word2:
                return True
        return False

    for i in range(1, length):
        if check_is_same(merged_aspects[-1][-1], aspects[i]):
            merged_aspects[-1].append(aspects[i])
        else:
            merged_aspects.append([aspects[i]])

    for a_list in merged_aspects:

        # 收集全部的形容词
        for i in a_list:
            opinion_set += pair_useful[i]

        _l = ','.join(a_list)
        merged_express[_l] = []
        for i in a_list:
            merged_express[_l] += aspect_express[i]

    opinion_set = set(opinion_set)

    return merged_express, opinion_set


def build_dataset_express(seg_review_list, pair_useful):
    """
    抽取原始评论中的aspect作为输入，完整的评论作为输出
    """
    train_data = []  # 记录训练数据
    for review in seg_review_list:  # 每个sentence就是一句完整的review

        source = []  # 训练的src
        if review[-1] not in PUNCTUATION:
            review.append('。')
        target = review  # 训练的tgt

        # 对于单个review进行切分
        cur_review = []
        pre_end = 0
        for i, _ in enumerate(review):
            if review[i] in ['。', '！', '？', '，', '～']:
                cur_review.append(review[pre_end:i + 1])
                pre_end = i + 1
            elif i == len(review) - 1:
                cur_review.append(review[pre_end:])

        for sentence in cur_review:  # sentence 是两个标点之间的短
            if sentence[-1] not in PUNCTUATION:
                sentence.append('。')
            find_opinion_flag = False
            for idx, word in enumerate(sentence):
                if find_opinion_flag:  # 如果在当前的短句中已经找到了一组观点表达就结束对这个短句的搜索
                    break
                if word in pair_useful:  # 当前的word属于aspect
                    source.append(word)
                    find_opinion_flag = True  # 只要找到一个opinion word就算命中一个短句了

        train_data.append((list(source), target))

    max_source_length = 0
    # 筛选训练数据
    def check_review(item):
        """
        判断当前review是否合法
        """
        source = item[0]
        tgt = item[1]
        legal = True
        _s = ''.join(tgt)
        if len(source) == 0 or len(source) > 5:  # 不含有观点表达或者观点词太多
            legal = False
        unique_source = set(source)
        if len(unique_source) != len(source):
            legal = False
        if len(_s) > 60:
            legal = False
        return legal

    legal_train_data= []
    for item in train_data:
        if check_review(item):
            max_source_length = max(max_source_length, len(item[0]))
            legal_train_data.append(item)

    print('max source length:%s' % max_source_length)
    return legal_train_data


def generate_reviews(aspect_express, num=1000):
    """
    根据候选集合生成假评论
    """
    all_aspect = list(aspect_express.keys())
    print('Aspect:{}'.format(all_aspect))
    print()

    # 根据不同aspect出现的概率分配不同权重
    aspect_length_dict = {}
    for a in aspect_express:
        aspect_length_dict[a] = len(aspect_express[a])
    weight_aspect_list = []
    for aspect in aspect_length_dict:
        weight_aspect_list += [aspect] * aspect_length_dict[aspect]

    res = []
    for _ in range(num):
        num_aspect = random.choice([1, 2, 3, 4, 5, 6])
        review = []
        used_aspect = []
        for _ in range(num_aspect):
            a = random.choice(weight_aspect_list)
            while a in used_aspect:
                a = random.choice(weight_aspect_list)
            used_aspect.append(a)
            a_s = random.choice(aspect_express[a])
            a_s = a_s[:-1] + ['#']  # 丢掉标点，换位#作为切分点
            review += a_s
        res.append(review)

    return res


def fake_review_filter(reviews, opinion_set):
    """
    筛去评论中不像人写的句子：如果同一个形容词重复出现两次就判定为假评论，同时筛去长度超过60的评论
    """
    results = []
    for review in reviews:
        opinion_used = {k: 0 for k in opinion_set}
        flag = True
        for word in review:
            if word in ILLEGAL_WORD:
                flag = False
            if word in opinion_used:
                opinion_used[word] += 1
                if opinion_used[word] >= 2:
                    flag = False
                    # print('Fake:{}'.format(''.join(review)))
                    break
        if flag:
            _s = ''.join(review)
            _s = _s.split('#')  # 最后一个是空字符
            review = ''
            pu = ['，'] * 100 + ['～'] * 20 + ['！'] * 20 + EMOJI + YANWENZI
            random.shuffle(pu)
            for a_s in _s:
                if a_s:
                    review += a_s + random.choice(pu)
            if not review:
                print('error:')
                print(review)
            review = review[:-1] + '。'
            results.append(review)
            print('\t' + review)

    return results


In [6]:
# 利用抽取的方法从原始评论中抽取含有观点表达的短句，并随机组合一些含有观点表达的短句组成一句仿真评论。
with open('../data(temp)/review_cleaned.txt', 'r',encoding='utf-8') as f:
    str_text = f.read()
with open('../data(temp)/review_seg_post.txt', 'r',encoding='utf-8') as f:
    seg_pos_text = f.readlines()

# 加载IDF表
word_idf = {}
with open('../data(temp)/review_idf.txt', 'r',encoding='utf-8') as f:
    for line in f.readlines():
        word, idf = line.strip().split(' ')
        word_idf[word] = float(idf)

# 加载停用词
stop_word = []
with open('../data/stopword.txt', 'r',encoding='utf-8') as f:
    for line in f.readlines():
        stop_word.append(line.strip())

# 加载正向情感词典
pos_adj_word = []
with open('../data/HowNetPOSWord.txt', 'r',encoding='utf-8') as f:
    for line in f.readlines():
        pos_adj_word.append(line.strip())

seg_list, pos_list, seg_review_list = text2seg_pos(seg_pos_text, pattern='[。！？，～]')
raw_aspect_list = get_candidate_aspect(seg_list, pos_list, pos_adj_word, stop_word, word_idf)
# 构建候选集合
N = NSDict(seg_list, pos_list, raw_aspect_list)
ns_dict = N.build_nsdict()
# 候选集合排序
P = PairPattSort(ns_dict)
pair_score = P.sort_pair()
# 得到正确的观点表达候选
pair_useful = {}
baseline = 0.1 * len(pair_score)
for i, item in enumerate(pair_score):
    if i <= baseline:
        aspect, opinion = item[0].split('\t')
        if aspect in pair_useful:
            pair_useful[aspect].append(opinion)
        else:
            pair_useful[aspect] = [opinion]

# 从原始评论中抽取观点表达
aspect_express = get_aspect_express(seg_review_list, pair_useful)

# 字符匹配合并aspect
merged_aspect_express, opinion_set = merge_aspect_express(aspect_express, pair_useful)

# 生成假评论
generated_raw_reviews = generate_reviews(merged_aspect_express)

results = fake_review_filter(generated_raw_reviews, opinion_set)

with open('../results/generated_reviews.txt', 'w',encoding='utf-8') as f:
    for c in results:
        f.write(c + '\n')

利用情感词典抽取候选aspect...
['侻', '俶傥', '僄', '劼', '嫕', '惇', '惓惓', '會', '牣', '皞', '挨边', '哀婉', '哀艳', '蔼', '蔼蔼', '蔼然', '蔼然可亲', '蔼如', '艾', '碍事', '安', '安安静静', '安安稳稳', '安定', '安顿', '安分', '安好', '安静', '安康', '安澜', '安乐', '安宁', '安全', '安然', '安然无事', '安然无恙', '安如磐石', '安如泰山', '安生', '安适', '安泰', '安恬', '安稳', '安详', '安逸', '安谧', '按期', '按时', '昂昂', '昂藏', '昂然', '傲然', '奥', '奥博', '奥妙', '巴结', '拔尖', '拔尖儿', '拔俗', '把牢', '把稳', '把细', '白', '白纸黑字', '百不一失', '百读不厌', '百废俱兴', '百里挑一', '百炼成钢', '百听不厌', '百无一失', '百依百顺', '百折不回', '百折不挠', '百折不挠地', '板板正正', '板实', '板正', '半公开', '半透明', '膀大腰圆', '膀阔腰圆', '棒', '磅礴', '苞', '睆', '保存良好', '保皇', '保险', '保准', '饱学', '宝', '宝贵', '悲壮', '卑', '倍儿棒', '备细', '奔放', '本质', '笨鸟先飞']
---aspect抽取完成，共抽取到393个候选aspect---
stage 1：抽取pair和pattern...
	Done
stage 2：组合排序...
	Done
---pair抽取完成---
Aspect:['东西', '产品,产品质量', '价格', '体验,体验感', '力度', '包装', '发现,发质', '味道', '商品', '头发,头皮', '好用', '宝贝', '客服', '心情', '性价比', '感觉', '成分', '护发素', '控油', '效果', '时间', '服务态度', '气味', '油头', '泡沫', '洗出来,洗发水,洗完,洗感,洗起来', '清洁力', '留香', '能力', '脱发', '质量', '追评', '颜值', 

pair_useful保存为csv(便于展示)

In [7]:
print(pair_useful)
import csv  

# 将值列表转换为字符串  
pair_useful_str = {key: ' '.join(values) for key, values in pair_useful.items()}  

# 写入 CSV 文件  
with open('../data(temp)/pair_useful.csv', mode='w', newline='', encoding='utf-8') as csvfile: 
    writer = csv.writer(csvfile)  
    writer.writerow(['Key', 'Value'])  # 写入列名  

    # 遍历字典并写入每个键及其对应的值字符串  
    for key, value_str in pair_useful_str.items():  
        writer.writerow([key, value_str])  # 写入键和合并后的字符串值  

{'效果': ['好', '不错', '明显', '差'], '头发': ['蓬松', '柔顺', '清爽', '香', '容易', '舒服', '柔软', '干净', '滑顺', '干爽', '好', '枯燥', '轻'], '味道': ['香', '好', '清新', '不错', '清淡', '甜腻'], '气味': ['香', '好', '不错', '清新', '舒服', '清爽', '细腻', '清淡', '浓厚'], '性价比': ['高'], '包装': ['好', '仔细', '严实', '精致', '完整', '完好', '精美', '严密', '一般', '干净'], '质量': ['好'], '洗完': ['蓬松', '清爽', '柔顺', '舒服', '香', '干痒'], '洗发水': ['好', '香', '不错', '温和', '细腻', '清爽'], '头皮': ['清爽', '舒服', '干净', '舒适', '舒爽'], '产品': ['好', '不错'], '感觉': ['不错', '清爽', '舒服', '好', '一般', '舒适', '蓬松', '明显', '香'], '泡沫': ['丰富', '细腻'], '服务态度': ['好'], '东西': ['好', '不错'], '能力': ['强', '不错', '好'], '商品': ['好', '不错'], '清洁力': ['强'], '控油': ['好', '持久', '清爽', '彻底'], '香味': ['持久', '清新', '好'], '力度': ['强'], '颜值': ['高'], '态度': ['好', '积极'], '宝贝': ['好', '不错'], '产品质量': ['好'], '质地': ['好'], '上头': ['温和'], '追评': ['不错', '好'], '油头': ['友好', '好'], '体验感': ['好', '差'], '发质': ['好', '差'], '客服': ['好'], '成分': ['温和', '安全', '好'], '好闻': ['香'], '产品包装': ['好'], '变得': ['蓬松', '柔顺'], '吸收': ['好'], '商品质量': ['好'], '好用': ['不错', '香', '柔顺'], 

ns_dict保存为csv(便于展示，可以不运行)

In [8]:
import csv  
print(ns_dict)
# 定义要保存的 CSV 文件名  
csv_file_name = '../data(temp)/ns_dict.csv'  

# 打开 CSV 文件进行写入  
with open(csv_file_name, mode='w', newline='', encoding='utf-8') as csv_file:  
    writer = csv.writer(csv_file)  
    
    # 写入标题行  
    writer.writerow(['对象', 'pattern', '观点词', '个数'])  
    
    # 遍历每个对象、观点词和对应的 pattern 和个数  
    for obj, opinions in ns_dict.items():  
        for opinion, patterns in opinions.items():  
            for pattern, count in patterns.items():  
                # 写入四列数据  
                writer.writerow([obj, pattern, opinion, count])  
data = []  

# 从CSV中读取数据  
with open(csv_file_name, mode='r', encoding='utf-8') as csv_file:  
    reader = csv.reader(csv_file)  
    
    # 读取标题行  
    header = next(reader)  
    
    # 读取其他行  
    for row in reader:  
        data.append(row)  

# 随机打乱数据行 
random.seed(10) 
random.shuffle(data)  

# 将打乱后的数据写入新的CSV文件  
with open(csv_file_name, mode='w', newline='', encoding='utf-8') as csv_file:  
    writer = csv.writer(csv_file)  

    # 写入标题行  
    writer.writerow(header)  
    
    # 写入打乱后的数据  
    writer.writerows(data)  


print(f"数据已保存到 {csv_file_name}")

{'头发': {'蓬松': {'立马-': 1.0, '又 巨显 发量 的+': 1.0, '能-': 1.0, '变得-': 1.0, '的+': 3.0, '很-': 61.0, '真的 很 柔顺 很-': 1.0, '都 很-': 5.0, '比较-': 2.0, '感觉 很-': 1.0, '也 很-': 7.0, '真的 挺-': 1.0, '比 以前-': 1.0, '蓬蓬 的+': 1.0, '依旧-': 1.0, '更-': 1.0, '轻盈 又-': 2.0, '贼 清爽-': 1.0, '柔顺-': 3.0, '特别 的-': 2.0, '都-': 2.0, '洗完 真的 很-': 1.0, '真的 很-': 6.0, '之后 很 立体-': 1.0, '越来越-': 3.0, '还-': 1.0, '也 超级-': 1.0, '还是-': 1.0, '清爽-': 3.0, '超级-': 2.0, '瞬间 清爽-': 1.0, '干净 又-': 1.0, '就 会-': 1.0, '轻盈-': 1.0, '巨-': 2.0, '好 几天 都 是-': 1.0, '还是 很 清爽-': 1.0, '洗完 第一天 一点 也 不-': 1.0, '既-': 1.0, '很 顺滑-': 2.0, '很 清爽-': 1.0, '也 炒 鸡 的-': 1.0, '非常 的-': 4.0, '有 多 去 油腻 、 有 多-': 1.0, '就 头发 都 巨-': 1.0, '都 巨-': 1.0, '特别-': 3.0, '也-': 2.0, '就 很-': 1.0, '变得 很-': 1.0, '顺滑-': 1.0, '非常-': 4.0, '都 能-': 1.0, '一 开始 是 有 一点点-': 1.0, '使用 起来 这个-': 1.0, '的 洗完+': 1.0, '特别 顺滑-': 1.0, '后 很-': 1.0, '确实 很-': 2.0, '一 整个-': 1.0, '就 没有 支棱 起来 过来 现在 头发 都 是-': 1.0, '都 是-': 1.0, '后 挺-': 1.0, '之后 非常-': 1.0, '自然-': 1.0, '感觉 很 清爽 也 很-': 1.0, '感觉 至少-': 1.0, '洗 后 很-': 1.0, '也 

pair_score保存为csv(便于展示)

In [9]:
print('pair_score',pair_score[:20])
with open('../data(temp)/pair_score.csv', mode='w', newline='', encoding='utf-8') as csvfile:  
    writer = csv.writer(csvfile)  
    writer.writerow(['Pair', 'Score'])  # 写入列名  
    writer.writerows(pair_score)  # 写入数据  

pair_score [('效果\t好', 190.32950411147868), ('头发\t蓬松', 126.3945117883121), ('味道\t香', 70.7282132914151), ('气味\t香', 62.416323882483624), ('头发\t柔顺', 60.14794005867908), ('性价比\t高', 57.41753087280957), ('包装\t好', 56.420237481209476), ('质量\t好', 42.71421457597844), ('洗完\t蓬松', 35.73685531973345), ('洗发水\t好', 32.111506470088514), ('头发\t清爽', 30.865528643735292), ('洗发水\t香', 29.868033807404387), ('味道\t好', 25.85738378159592), ('洗发水\t不错', 25.655257906629966), ('效果\t不错', 25.132216990883926), ('头皮\t清爽', 25.09434917103873), ('产品\t好', 23.072515202170752), ('洗完\t清爽', 22.99379760032052), ('感觉\t不错', 22.246410141486567), ('洗完\t柔顺', 21.280419614157775)]
