### dependence ###

In [1]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import hierarchical
from sklearn.cluster import estimate_bandwidth
from sklearn.metrics import calinski_harabaz_score
from sklearn.metrics import silhouette_score
from jieba.analyse import *
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# from Bio.Cluster import kcluster
# import synonyms
import pandas as pd
import jieba
jieba.add_word("报警人", freq=56, tag='nr')
jieba.add_word("系", freq=56, tag='r')
from math import isnan
import math
import numpy as np
import re
import csv
import jieba.posseg as pseg
import gensim

Building prefix dict from the default dictionary ...
Loading model from cache E:\local_temp\jieba.cache
Loading model cost 0.908 seconds.
Prefix dict has been built succesfully.


### main function ###

In [2]:
n_clusters = 24

def deal_sklearn_model(model,
                       weight,
                       data,
                       df,
                       X,
                       print_details=0,
                       distance='euclidean'):
    kind = df[1].tolist()
    if print_details:
        print(model.labels_)
        print(len(model.labels_))
        details = {}
        for i in range(len(data)):
            if model.labels_[i] not in details:
                details[model.labels_[i]] = [[data[i]], [kind[i]]]
            else:
                details[model.labels_[i]][0].append(data[i])
                details[model.labels_[i]][1].append(kind[i])
        number = 0
        for key, list in details.items():
            this_kind = 0
            print("第%d类" % key)
            for i in range(len(list[0])):
                this_kind = this_kind + 1
                print("属于", list[1][i], ' ', list[0][i])
            print(this_kind)
            print()
            print()
            print()
            number += this_kind
        print("all_number == ", number)
    check_result(weight, model, distance)


    # K-Means 算法
def K_Means(weight, data, df, X, print_details=0):
    print("K_Means算法")
    K_number = 16
    iter_number = 700
    kmodel = KMeans(n_clusters=n_clusters,
                    n_init=10,
                    max_iter=iter_number,
                    init='k-means++',
                   n_jobs=-1).fit(weight)
    deal_sklearn_model(kmodel, weight, data, df, X, print_details)
    return kmodel.labels_


# DBSCAN 算法
def AAedion_DBSCAN(weight, data, df, X, print_details=0, distance='euclidean'):
    print("DBSCAN算法")
    DBmodel = DBSCAN(eps=0.5, min_samples=10, metric=distance,n_jobs=-1).fit(weight)
    deal_sklearn_model(DBmodel, weight, data, df, X, print_details, distance)
    print("DBSCAN算法完成")
    return DBmodel.labels_


def AAedion_Birch(weight, data, df, X, print_details=0, distance='euclidean'):
    print('Birch算法')
    Birch_model = Birch(n_clusters=n_clusters).fit(weight)
    deal_sklearn_model(
        Birch_model,
        weight,
        data,
        df,
        X,
        print_details,
    )
    return Birch_model.labels_


def AAedion_SpectralClustering(weight,
                               data,
                               df,
                               X,
                               print_details=0,
                               distance='euclidean',n_jobs=-1):
    print('SpectralClustering算法')
    Spe_model = SpectralClustering(n_clusters=n_clusters).fit(weight)
    deal_sklearn_model(
        Spe_model,
        weight,
        data,
        df,
        X,
        print_details,
    )
    return Spe_model.labels_


def AAedion_AffinityPropagation(weight,
                                data,
                                df,
                                X,
                                print_details=0,
                                distance='euclidean'):
    print('AffinityPropagatio算法')
    Aff_model = AffinityPropagation(damping=0.5,
                                    max_iter=200,
                                    convergence_iter=15,
                                    copy=True,
                                    preference=None,
                                    affinity='euclidean',
                                    verbose=False).fit(weight)
    deal_sklearn_model(
        Aff_model,
        weight,
        data,
        df,
        X,
        print_details,
    )
    return Aff_model.labels_


def AAedion_Meanshift(weight,
                      data,
                      df,
                      X,
                      print_details=0,
                      distance='euclidean'):
    print('MeanShift算法')
#     bandwidth = estimate_bandwidth(weight, quantile=0.3,n_jobs=-1)
#    print(bandwidth)
    MeanShift_model = MeanShift(bandwidth=1,
                                seeds=None,
                                bin_seeding=True,
                                min_bin_freq=1,
                                cluster_all=False,
                                n_jobs=-1).fit(weight)
    print('Meanshfit finish')
    deal_sklearn_model(
        MeanShift_model,
        weight,
        data,
        df,
        X,
        print_details,
    )
    return MeanShift_model.labels_


def check_result(weight, model, distance):
    #print("共%d类" %model.labels_)
    print("result of harabaz_score: ",
          calinski_harabaz_score(weight, model.labels_))
    print("result of silhouette_score: ",
          silhouette_score(weight, model.labels_, metric='euclidean'))


def print_TF_IDF_values():
    for i in range(len(weight)):  # 打印每类文本的tf-idf词语权重
        print("-------这里输出第", i, u"类文本的词语tf-idf权重------")
        # for j in range(len(word)):
        print(list(zip(all_word, weight[i])))

### stop word ###

In [3]:
    # 人工停用词表
    special_word = ['称', '，', '。', '在', '年', '月', '(', '（','民警','报警','观音桥','称有','称一','称车','称其','接警','人称']

    # 加载停用词表
    Stopword_FilePath = "../stopWord.txt"
    stopWordList = pd.read_csv(Stopword_FilePath, sep="\r")
    for word in special_word:
        stopWordList.loc[stopWordList.size] = word
    print("停用词表的大小是", stopWordList.size)

    # 加载数据
    FilePath = "../data.csv"
    df = pd.read_csv(FilePath,header=None)

    # 停用词过滤
    data = df[5]
    kind = df[2]
    X = []
    delete = []
    record = {}
    person_number = 0
    place_number = 0
    pronoun_number = 0
    beside_number = 0
    all_number = 0
    flag_record = {}
    flag_form = {}
    flag_result = {}
    data_keywords = []
#     data_new = []
#     kind_new = []
#     k = 0
#     for i in range(len(data)):
#         try:
#             if isnan(float(data[i])):
#                 k += 1
#         except ValueError:    
#              data_new.append(data[i])
#              kind_new.append(kind[i])
#     data = data_new
#     kind = kind_new
    for i in range(len(data)):
        cut_result = []
        person_name = []
        place_name = []
        pronoun = []
        beside = []
        # 按词性过滤
        #print(data[i])
        k = pseg.cut(data[i])
        for word, flag in k:
            flag_form[word] = flag
            if flag not in flag_record:
                flag_record[flag] = 1
            else:
                flag_record[flag] += 1
                
            if flag == 'nr' or flag == 'nr1' or flag == 'nr2':
                person_name.append(word)
                person_number += 1
                continue
            if flag == 'ns' or flag == 'nz':
                place_name.append(word)
                place_number += 1
                continue
            if flag == 'r' or flag == 'rr' or flag == 'rz':
                pronoun.append(word)
                pronoun_number += 1
                continue
            if flag == 'w' or flag == 'h' or flag == 'k' or flag == 'xx' \
                    or flag == 'o' or flag == 'u' or flag == 'm' or flag == 'd' \
                    or flag == 'f':
                beside.append(word)
                beside_number += 1
                continue
        # 提取关键词列表
        for keyword, weight in extract_tags(data[i], withWeight=True):
            if keyword in person_name or keyword in place_name or keyword in pronoun or keyword in beside:
                all_number += 1
                delete.append(keyword)
                continue
            # if flag_form[keyword] not in flag_result:
            #     flag_result[flag_form[keyword]] = [keyword]
            # else:
            #     flag_result[flag_form[keyword]].append(keyword)
            cut_result.append(keyword)
        # print(data[i],": 关键词",cut_result)

        # 用停用词库筛选关键词列表
        temp = ""
        for j in cut_result:
            if j in stopWordList.values or re.search("[0-9]+", j) is not None \
                    or re.search("[a-z]+", j) is not None or re.search("[A-Z]+", j) is not None:
                if j not in delete:
                    delete.append(j)
            else:
                if temp == '':
                    temp = j
                else:
                    temp = temp + " " + j
                if j in record:
                    record[j] = record[j] + 1
                else:
                    record[j] = 1
        X.append(temp)
        data_keywords.append(temp)
    flag_record = sorted(flag_record.items(), key=lambda item: item[1], reverse=True)
    result_X = []
    for string in X:
        temp = ''
        for j in string.split(' '):
            if j == '':
                continue
            if record[j] < 5:
                continue
            temp = temp + " " + j
            # print(j)
        result_X.append(temp)
    X_str = np.array(result_X)
    record = sorted(record.items(), key=lambda item: item[1], reverse=True)
    print('finish')
#     print("被清除的词有", delete)
#     print("最终词典", record)
#     print("过滤结果", X_str)
#     print('person_number: ', person_number)
#     print('place_number: ', place_number)
#     print('pronoun_number: ', pronoun_number)
#     print('beside_number: ', beside_number)
#     print('all_number: ', all_number)
#     print('flag_record: ', flag_record)

    # for w, f in flag_result.items():
    #     print('%s类' % w)
    #     for str in f:
    #         print(str)
    #     print()
    #     print()
    #     print()

停用词表的大小是 2085
finish


### construction TF-IDF ###

#### synonym word replace ####

In [4]:
wx_from_text = gensim.models.KeyedVectors.load_word2vec_format('E:/chrome_download/2000000tencent.txt')
model = wx_from_text.wv

  


In [5]:
# preprocessing
data = result_X
synonym_map = {}
no_synonym_words_count = 0
for i in range(len(data)):
    for string in data[i].split(' '):
        if string not in synonym_map:
            try:
                synonym_list = model.most_similar(string)
                for item in synonym_list:
                    synonym_word = item[0]
                    if synonym_word not in synonym_map:
                        synonym_map[synonym_word] = string
#                         print('add ',synonym_word,'to ',string)
            except Exception as e:
                    no_synonym_words_count += 1
#                 print('no word ',string)
# replace
result = []
for i in range(len(data)):
    temp = ""
    for string in data[i].split(' '):
        if string in  synonym_map:
            replace_string = synonym_map[string]
            print(string," change to ",replace_string)
        else:
            replace_string = string
        if temp != "":
            temp = temp + " " + replace_string
        else:
            temp = replace_string
    result.append(temp)
X_str = np.array(result)

被困  change to  困在
水里  change to  水中
被困  change to  困在
水里  change to  水中
被困  change to  困在
水里  change to  水中
被困  change to  困在
水里  change to  水中
水里  change to  水中
被困  change to  困在
被困  change to  困在
导致  change to  引发
导致  change to  引发
导致  change to  引发
导致  change to  引发
被困  change to  困在
被困  change to  困在
电话  change to  回拨
黑色  change to  红色
盗走  change to  被盗
电话  change to  回拨
偷走  change to  丢失
偷走  change to  丢失
偷走  change to  丢失
警察  change to  警察局
派出所  change to  报案
派出所  change to  报案
警察  change to  警察局
派出所  change to  报案
派出所  change to  报案
带回  change to  带走
警察  change to  警察局
派出所  change to  报案
派出所  change to  报案
警察  change to  警察局
派出所  change to  报案
派出所  change to  报案
警察  change to  警察局
警方  change to  报案
派出所  change to  报案
宾馆  change to  酒店
派出所  change to  报案
派出所  change to  报案
派出所  change to  报案
派出所  change to  报案
派出所  change to  报案
派出所  change to  报案
派出所  change to  报案
出租车  change to  公交车
车辆  change to  车未
车辆  change to  车未
纠纷  change to  起纠纷
车主  change to  挪车
纠纷  change to  起纠纷
车主 

电话  change to  回拨
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
服务员  change to  前台
纠纷  change to  起纠纷
砸伤  change to  砸坏
砸伤  change to  砸坏
伤者  change to  家属
送往  change to  带回
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
摔伤  change to  摔倒
摔伤  change to  摔倒
摔伤  change to  摔倒
纠纷  change to  起纠纷
昨天下午  change to  昨晚
纠纷  change to  起纠纷
摔伤  change to  摔倒
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
收到  change to  接到
报案  change to  报过警
警方  change to  报案
地址  change to  具体地址
查询  change to  查看
警方  change to  报案
收到  change to  接到
女子  change to  男子
殴打  change to  辱骂
砸伤  change to  砸坏
十多人  change to  十几人
送往  change to  带回
带回  change to  带走
伤者  change to  家属
持刀  change to  持械
伤者  change to  家属
一人  change to  无人
带到  change to  带回
情况不明  change to  不明
持刀  change to  持械
上午  change to  下午
情况不明  change to  不明
持刀  change to  持械
伤者  change to  家属
带到  change to  带回
送医  change to  生命危险
持刀  change to  持

纠纷  change to  起纠纷
乘客  change to  旅客
纠纷  change to  起纠纷
出租车  change to  公交车
车上  change to  下车
司机  change to  驾驶员
纠纷  change to  起纠纷
上车  change to  开走
打人  change to  打架
乘客  change to  旅客
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
乘客  change to  旅客
出租车  change to  公交车
司机  change to  驾驶员
纠纷  change to  起纠纷
乘客  change to  旅客
出租车  change to  公交车
司机  change to  驾驶员
纠纷  change to  起纠纷
乘客  change to  旅客
出租车  change to  公交车
司机  change to  驾驶员
乘客  change to  旅客
出租车  change to  公交车
警察  change to  警察局
司机  change to  驾驶员
乘客  change to  旅客
出租车  change to  公交车
司机  change to  驾驶员
喝醉酒  change to  醉酒
乘客  change to  旅客
出租车  change to  公交车
司机  change to  驾驶员
出租车  change to  公交车
乘客  change to  旅客
司机  change to  驾驶员
纠纷  change to  起纠纷
出租车  change to  公交车
纠纷  change to  起纠纷
纠纷  change to  起纠纷
乘客  change to  旅客
出租车  change to  公交车
司机  change to  驾驶员
派出所  change to  报案
出租车  change to  公交车
车上  change to  下车
司机  change to  驾驶员
付钱  change to  付款
车主  change to  挪车
纠纷  change to  起纠

争吵  change to  口角
担心  change to  怀疑
治安案件  change to  刑事案件
挂断  change to  电话
情况  change to  具体情况
纠纷  change to  起纠纷
喝了酒  change to  醉酒
打人  change to  打架
带回  change to  带走
女子  change to  男子
女子  change to  男子
不清  change to  不明
一人  change to  无人
担心  change to  怀疑
纠纷  change to  起纠纷
一人  change to  无人
司机  change to  驾驶员
不清  change to  不明
楼下  change to  门口
女子  change to  男子
前男友  change to  男友
挂断  change to  电话
地址  change to  具体地址
一人  change to  无人
殴打  change to  辱骂
两人  change to  三人
挂断  change to  电话
电话  change to  回拨
不清  change to  不明
纠纷  change to  起纠纷
情况  change to  具体情况
无需  change to  不需
店内  change to  店里
砸坏  change to  打坏
堵住  change to  堵塞
不清  change to  不明
带回  change to  带走
纠纷  change to  起纠纷
持刀  change to  持械
喝醉酒  change to  醉酒
纠纷  change to  起纠纷
电话  change to  回拨
打人  change to  打架
争吵  change to  口角
殴打  change to  辱骂
纠纷  change to  起纠纷
酒瓶  change to  啤酒瓶
女子  change to  男子
担心  change to  怀疑
争吵  change to  口角
不清  change to  不明
已告  change to  即告
儿子  change to  女儿
不清  change to  不明
电话  cha

打麻将  change to  打牌
腊肉  change to  香肠
妻子  change to  老婆
屋内  change to  房间内
纠纷  change to  起纠纷
警察  change to  警察局
黑色  change to  红色
一人  change to  无人
司机  change to  驾驶员
走失  change to  走散
车牌  change to  牌照
昨天  change to  昨晚
电话  change to  回拨
车主  change to  挪车
骗子  change to  行骗
拿走  change to  带走
物管  change to  业主
车主  change to  挪车
白色  change to  红色
遗失  change to  丢失
身份证  change to  证件
车位  change to  车库
纠纷  change to  起纠纷
被困  change to  困在
人员  change to  工作人员
被困  change to  困在
将车  change to  车停
纠纷  change to  起纠纷
一人  change to  无人
纠纷  change to  起纠纷
行骗  change to  冒充
纠纷  change to  起纠纷
一人  change to  无人
纠纷  change to  起纠纷
门外  change to  门口
诈骗  change to  行骗
儿子  change to  女儿
电话  change to  回拨
人员  change to  工作人员
纠纷  change to  起纠纷
纠纷  change to  起纠纷
骗钱  change to  行骗
挂断  change to  电话
要钱  change to  借钱
昨天  change to  昨晚
宾馆  change to  酒店
女子  change to  男子
担心  change to  怀疑
交通事故  change to  事故
纠纷  change to  起纠纷
交通事故  change to  事故
纠纷  change to  起纠纷
纠纷  change to  起纠纷
车牌  change to  牌照
一人  

打人  change to  打架
现该  change to  现需
乘客  change to  旅客
喝醉酒  change to  醉酒
车上  change to  下车
一人  change to  无人
乘客  change to  旅客
出租车  change to  公交车
司机  change to  驾驶员
一人  change to  无人
车上  change to  下车
司机  change to  驾驶员
乘客  change to  旅客
送往  change to  带回
出租车  change to  公交车
司机  change to  驾驶员
一人  change to  无人
转警  change to  报称
女子  change to  男子
情况  change to  具体情况
纠纷  change to  起纠纷
一人  change to  无人
人行道  change to  马路
一人  change to  无人
情况  change to  具体情况
人行道  change to  马路
一人  change to  无人
情况  change to  具体情况
人行道  change to  马路
一人  change to  无人
不清  change to  不明
路边  change to  路过
男性  change to  女性
已告  change to  即告
一人  change to  无人
情况  change to  具体情况
已告  change to  即告
酒醉  change to  醉酒
医生  change to  医院
遗失  change to  丢失
身份证  change to  证件
外地  change to  外地人
女子  change to  男子
女子  change to  男子
女子  change to  男子
称系  change to  称因
乘客  change to  旅客
出租车  change to  公交车
司机  change to  驾驶员
不清  change to  不明
乘客  change to  旅客
不清  change to  不明
女生  change to  女孩
车主  change to  挪车
喝醉酒

详细  change to  具体情况
不清  change to  不明
地址  change to  具体地址
详细  change to  具体情况
称系  change to  称因
不清  change to  不明
派出所  change to  报案
地址  change to  具体地址
详细  change to  具体情况
电话  change to  回拨
声音  change to  听见
卖淫  change to  卖淫嫖娼
挂断  change to  电话
纠纷  change to  起纠纷
地址  change to  具体地址
挂断  change to  电话
接听  change to  回拨
不清  change to  不明
电话  change to  回拨
挂断  change to  电话
电话  change to  回拨
挂断  change to  电话
电话  change to  回拨
挂断  change to  电话
电话  change to  回拨
女子  change to  男子
遗失  change to  丢失
出租车  change to  公交车
出租车  change to  公交车
出租车  change to  公交车
敲诈  change to  绑架
一人  change to  无人
出租车  change to  公交车
车上  change to  下车
车主  change to  挪车
车上  change to  下车
车上  change to  下车
出租车  change to  公交车
车牌  change to  牌照
不清  change to  不明
出租车  change to  公交车
遗失  change to  丢失
出租车  change to  公交车
昨天  change to  昨晚
遗失  change to  丢失
电话  change to  回拨
出租车  change to  公交车
身份证  change to  证件
身份证  change to  证件
出租车  change to  公交车
拿走  change to  带走
出租车  change to  公交车
出租车  change to  公交车
出租车  c

出租车  change to  公交车
遗失  change to  丢失
车牌  change to  牌照
不清  change to  不明
出租车  change to  公交车
昨天  change to  昨晚
出租车  change to  公交车
出租车  change to  公交车
遗失  change to  丢失
报案  change to  报过警
车牌  change to  牌照
不清  change to  不明
出租车  change to  公交车
身份证  change to  证件
下车时  change to  下车
出租车  change to  公交车
遗失  change to  丢失
拿走  change to  带走
人行道  change to  马路
纠纷  change to  起纠纷
纠纷  change to  起纠纷
担心  change to  怀疑
医生  change to  医院
人员  change to  工作人员
医生  change to  医院
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
医生  change to  医院
医生  change to  医院
纠纷  change to  起纠纷
喝醉酒  change to  醉酒
打人  change to  打架
医生  change to  医院
纠纷  change to  起纠纷
出警  change to  警情
纠纷  change to  起纠纷
女朋友  change to  男友
纠纷  change to  起纠纷
纠纷  change to  起纠纷
纠纷  change to  起纠纷
情况  change to  具体情况
早上  change to  下午
遗失  change to  丢失
女子  change to  男子
住户  change to  居民
女性  change to  女性朋友
纠纷  change to  起纠纷
高音喇叭  change to  喇叭
纠纷  change to  起纠纷
导致  change to  引发
麻将  change to  打牌
麻将  change to  打牌
货车  cha

派出所  change to  报案
女友  change to  男友
女朋友  change to  男友
纠纷  change to  起纠纷
担心  change to  怀疑
宾馆  change to  酒店
挂断  change to  电话
小女孩  change to  小孩
接听  change to  回拨
地址  change to  具体地址
橙色  change to  红色
姐姐  change to  弟弟
担心  change to  怀疑
裤子  change to  衣服
黑色  change to  红色
回家  change to  回来
电话  change to  回拨
担心  change to  怀疑
反锁  change to  门锁
酒醉  change to  醉酒
房门  change to  敲门
开门  change to  关门
担心  change to  怀疑
男性  change to  女性
担心  change to  怀疑
女儿  change to  母亲
接电话  change to  来电
地址  change to  具体地址
担心  change to  怀疑
妹妹  change to  女儿
外出  change to  出门
地址  change to  具体地址
出租车  change to  公交车
担心  change to  怀疑
寻找  change to  找到
蓝色  change to  红色
走失  change to  走散
裤子  change to  衣服
黑色  change to  红色
儿子  change to  女儿
出警  change to  警情
电话  change to  回拨
地址  change to  具体地址
男性  change to  女性
黑色  change to  红色
前女友  change to  男友
男孩  change to  女孩
下车时  change to  下车
儿子  change to  女儿
带回  change to  带走
走失  change to  走散
前男友  change to  男友
女朋友  change to  男友
电话  change to  回拨
担心  chang

#### construction TF-IDF based on synonym words ####

In [6]:
vectorizer = CountVectorizer()  # 该类会将文本中的词语转换为词频矩阵，矩阵元素a[i][j] 表示j词在i类文本下的词频
transformer = TfidfTransformer()  # 该类会统计每个词语的tf-idf权值
X = vectorizer.fit_transform(X_str)  # 将文本转为词频矩阵
tfidf = transformer.fit_transform(X)  # 计算tf-idf，
all_word = vectorizer.get_feature_names()  # 获取词袋模型中的所有词语
weight = tfidf.toarray()  # 将tf-idf矩阵抽取出来，元素a[i][j]表示j词在i类文本中的tf-idf权重

# print_TF_IDF_values()
print("weight shape:", weight.shape)
print("word shape:", len(all_word))

weight shape: (6481, 591)
word shape: 591


### Cluster ###

#### cluster ####

In [7]:
def clustering(print_details,weight,DB):
    KMeans_labels = K_Means(weight, data, df, X, print_details)
#     if DB == 1:
#         DS_labels = AAedion_DBSCAN(weight, data, df, X, print_details)
#     else:
#         DS_labels = AAedion_SpectralClustering(
#         weight, data, df, X, print_details)
    Birch_labels = AAedion_Birch(weight, data, df, X, print_details)
    all_labels = [
        KMeans_labels, #DS_labels, 
        Birch_labels
    ]
    #print(all_labels[0][1])
    return all_labels


# KMeans_labels = K_Means(weight, data, df, X, print_details)
# DBSCAN_labels = AAedion_DBSCAN(weight, data, df, X, print_details)
# Brich_labels = AAedion_Birch(weight, data, df, X, print_details)
# SpectralClustering_labels = AAedion_SpectralClustering(weight, data, df, X, print_details)

### can't use 

# AAedion_AffinityPropagation(weight, data, df, X, 1)
# AAedion_Meanshift(weight, data, df, X, 1)

#### change weight ####

In [8]:
def deal(all_labels):
    cluster_result = []
    for i in range(0, len(all_labels[0])):
        cluster_result.append('')
    for labels in all_labels:
        for i in range(0, len(labels)):
            cluster_result[i] = cluster_result[i] + " " + str(labels[i])
    return cluster_result


def Change(all_labels, weight):
    cluster_result = deal(all_labels)
    cluster_map = {}
    for i in range(len(cluster_result)):
        string = cluster_result[i]
        if string in cluster_map:
            cluster_map[string][0].append(data[i])
            cluster_map[string][1].append(i)
            cluster_map[string][2] += 1
        else:
            cluster_map[string] = [[data[i]], [i], 1]
    
    cluster_map = sorted(cluster_map.items(),
                         key=lambda item: item[1][2],
                         reverse=True)
    print("聚类总情况数: ", len(cluster_map))
    # for item in cluster_map:
    #     print("所属:  ", item[0], "数量： ", item[1][2])
    goal_words = {}
    for i in range(0, len(cluster_map) // 2):
        #         if cluster_map[i][1][2] < len(data)/40:
        #             continue
        print(i, cluster_map[i][1][2])
        wordcount_record = {}
        for item in cluster_map[i][1][1]:
            #     print(data_keywords[item])
            for word in data_keywords[item].split(" "):
                if word == ' ' or word == '':
                    continue
                if word in wordcount_record:
                    wordcount_record[word] += 1
                else:
                    wordcount_record[word] = 1
        wordcount_record = sorted(wordcount_record.items(),
                                  key=lambda item: item[1],
                                  reverse=True)
        # increase the first and second word
        alpha = 1.5
        for item in wordcount_record[0:2]:
            increase_rate = 1 + alpha * (item[1] / cluster_map[i][1][2])
            print(item[0], " ", item[1]," increase rate: ",increase_rate)
            if item[0] not in goal_words:
                weight = changeWeight(weight, all_word, item[0],
                                      1 + alpha * (item[1] / cluster_map[i][1][2]),
                                      0)
                goal_words[item[0]] = 1
    return weight

def Change_Chi(all_labels, weight):
    types = deal(all_labels) #类list
    corpus = data #数据类
    words = all_word #特征词
    
    cluster_result = types
    cluster_map = {}
    for i in range(len(cluster_result)):
        string = cluster_result[i]
        if string in cluster_map:
            cluster_map[string][0].append(data[i])
            cluster_map[string][1].append(i)
            cluster_map[string][2] += 1
        else:
            cluster_map[string] = [[data[i]], [i], 1]
    
    cluster_map = sorted(cluster_map.items(),
                         key=lambda item: item[1][2],
                         reverse=True)
    print("聚类总情况数: ", len(cluster_map))
    print("最大类的样本数为：", cluster_map[0][1][2])
    
    types_words_ratio = {}
    types_words_N = len(corpus)
    types_words_A = {} #属于某类别ci也含有特征词的文本数目
    types_words_B = {} #不属于某类别ci也含有特征词的文本数目
    types_words_C = {} #属于某类别ci但不含有特征词的文本数目
    types_words_D = {} #不属于某类别ci也不含有特征词的文本数目
    
    for i in range(len(cluster_map)):
        t = cluster_map[i][0]
        for w in words:
            pair = t + w;
            for i in range(len(corpus)):
                if types[i] == t and w in corpus[i]:
                    types_words_A[pair] = types_words_A.get(pair, 0) + 1
                    continue
                if types[i] != t and w in corpus[i]:
                    types_words_B[pair] = types_words_B.get(pair, 0) + 1
                    continue
                if types[i] == t and w not in corpus[i]:
                    types_words_C[pair] = types_words_C.get(pair, 0) + 1
                    continue
                if types[i] != t and w not in corpus[i]:
                    types_words_D[pair] = types_words_D.get(pair, 0) + 1
            
    words_suit_types = {} #CHI值所对应的类
    words_suit_marks = {} #CHI值
    words_types_marks = {} #不同类的CHI排序
    for i in range(len(cluster_map)):
        words_types_marks[cluster_map[i][0]] = {}
        
    for word in words:
        for i in range(len(cluster_map)):
            t = cluster_map[i][0]
            pair = t + word
            CHI = types_words_N * math.pow((types_words_A.get(pair, 0) * types_words_D.get(pair, 0) - types_words_B.get(pair, 0) * types_words_C.get(pair, 0)), 2) 
            CHI = CHI / ((types_words_A.get(pair, 0) + types_words_C.get(pair, 0)) * (types_words_B.get(pair, 0) + types_words_D.get(pair, 0)))
            CHI = CHI / ((types_words_A.get(pair, 0) + types_words_B.get(pair, 0)) * (types_words_C.get(pair, 0) + types_words_D.get(pair, 0)))
            if CHI > words_suit_marks.get(word, 0):
                words_suit_marks[word] = CHI
                words_suit_types[word] = t
        types = words_suit_types[word]
        marks = words_suit_marks[word]
        words_types_marks[types][word] = marks
            
    for key in words_types_marks.keys():
        temp = words_types_marks[key]
        if len(temp) == 0:
            continue
        wordcount_record = sorted(temp.items(),
                                  key=lambda item: item[1],
                                  reverse=True)
        # increase the first and second word
        alpha = 0.005
        if len(wordcount_record) >= 2:
            for item in wordcount_record[0:2]:
                increase_rate = 1 + alpha * item[1] / cluster_map[i][1][2]
                print(item[0], " ", item[1]," increase rate: ",increase_rate)
                weight = changeWeight(weight, all_word, item[0], 1 + alpha * (item[1] / cluster_map[i][1][2]),0)
        else:
            item = wordcount_record[0]
            increase_rate = 1 + alpha * item[1] / cluster_map[i][1][2]
            print(item[0], " ", item[1]," increase rate: ",increase_rate)
            weight = changeWeight(weight, all_word, item[0], 1 + alpha * (item[1] / cluster_map[i][1][2]),0)
            
    return weight
      
    

def changeWeight(weight, all_word, word, change_rate, print_details=0):
    change_array = np.eye(weight.shape[1])
    for i in range(len(all_word)):
        if all_word[i] == word:
            break
    change_array[i][i] = change_rate
    weight = np.dot(weight, change_array)
    return weight

#### training ####

In [9]:
print_details = 0
DB = 0
for i in range(0, 7):
    print('第%d次' % (i + 1))
    all_labels = clustering(print_details, weight, DB)
    weight = Change_Chi(all_labels, weight)

第1次
K_Means算法
result of harabaz_score:  126.80520653585495
result of silhouette_score:  0.19671557069525877
Birch算法


  node1_dist, node2_dist = dist[[farthest_idx]]


result of harabaz_score:  35.67003600994674
result of silhouette_score:  0.034283663657529335
聚类总情况数:  188
最大类的样本数为： 1612
消防   170.11897600877742  increase rate:  1.8505948800438872
家庭   152.57918548131093  increase rate:  1.7628959274065545
挡道   6126.361880824741  increase rate:  31.631809404123707
车未   209.4062462777621  increase rate:  2.047031231388811
被盗   4211.929285029677  increase rate:  22.059646425148387
家中   268.58771794132696  increase rate:  2.342938589706635
噪音   4266.800685292567  increase rate:  22.334003426462836
扰民   3057.787015275644  increase rate:  16.288935076378223
医院   456.0697019685182  increase rate:  3.280348509842591
受伤   425.4208129517759  increase rate:  3.1271040647588793
感情   810.7561540192025  increase rate:  5.0537807700960125
小孩   278.7940787058713  increase rate:  2.3939703935293566
丢失   1293.7197584157736  increase rate:  7.468598792078868
手机   838.7512479192085  increase rate:  5.193756239596042
挪车   599.3420309383215  increase rate:  3.99671015469

首饰   214.4017064738788  increase rate:  2.072008532369394
传销   214.40170647387882  increase rate:  2.072008532369394
卖淫嫖娼   79.2815841897885  increase rate:  1.3964079209489424
打人者   257.64123723800253  increase rate:  2.2882061861900125
逃逸   54.610049338449834  increase rate:  1.2730502466922493
人持   257.64123723800253  increase rate:  2.2882061861900125
途中   214.40170647387882  increase rate:  2.072008532369394
称因   261.13875142763175  increase rate:  2.305693757138159
方向   322.5007810229917  increase rate:  2.6125039051149583
损坏   71.94979760020013  increase rate:  1.3597489880010007
房间内   229.85858464473412  increase rate:  2.149292923223671
强行   122.96503027591743  increase rate:  1.6148251513795873
归还   922.4264540461854  increase rate:  5.612132270230927
不予   200.9080289480089  increase rate:  2.0045401447400444
电脑   214.46800256325244  increase rate:  2.0723400128162623
绑定   430.600350696801  increase rate:  3.153001753484005
回来   358.5560969901229  increase rate:  2.7927804849

自称   299.0407503117662  increase rate:  2.4952037515588312
验证码   224.28565600891065  increase rate:  2.121428280044553
配合   570.8803451998174  increase rate:  3.854401725999087
另一人   97.98972165808838  increase rate:  1.489948608290442
失主   6481.0  increase rate:  33.405
暂无   65.72791159573977  increase rate:  1.328639557978699
老板   299.6712999065777  increase rate:  2.4983564995328886
讨薪   153.7568369433218  increase rate:  1.7687841847166088
赔偿   1816.3035045194683  increase rate:  10.081517522597341
事故   127.569434127327  increase rate:  1.6378471706366349
价值   35.012945187586865  increase rate:  1.1750647259379343
食客   614.4765171821971  increase rate:  4.072382585910986
办公室   152.62199052961532  increase rate:  1.7631099526480765
小家   861.5997133015836  increase rate:  5.307998566507918
联行   717.4444467342136  increase rate:  4.587222233671068
透露   214.4017064738788  increase rate:  2.072008532369394
事由   178.3634312911919  increase rate:  1.8918171564559594
火灾   2246.594056772029

地为   358.66693344856935  increase rate:  2.7933346672428465
女性朋友   358.66693344856935  increase rate:  2.7933346672428465
协调   1079.3332047325102  increase rate:  6.396666023662552
称店   1079.3332047325102  increase rate:  6.396666023662552
掉落在   719.222085048011  increase rate:  4.596110425240055
掉落   461.99985670194  increase rate:  3.3099992835097
大门   924.9998677248677  increase rate:  5.624999338624339
吃饭   404.1248553240741  increase rate:  3.0206242766203704
音乐声   1079.3332047325102  increase rate:  6.396666023662552
音乐   497.61524216524214  increase rate:  3.4880762108262107
搬走   809.2498649691358  increase rate:  5.04624932484568
称该   497.61524216524214  increase rate:  3.4880762108262107
第4次
K_Means算法
result of harabaz_score:  13538.999399823024
result of silhouette_score:  0.8015588105337075
Birch算法
result of harabaz_score:  11535.013361384059
result of silhouette_score:  0.812389661988578
聚类总情况数:  42
最大类的样本数为： 3612
朋友   37.91839053905842  increase rate:  1.189591952695292
家庭

楼上   2205.9284832895028  increase rate:  12.029642416447514
称一人   147.07851956686528  increase rate:  1.7353925978343265
砸坏   38.19012264751922  increase rate:  1.1909506132375962
强奸   3824.3529473772983  increase rate:  20.12176473688649
受害者   1135.0307061840772  increase rate:  6.675153530920386
失主   6481.0  increase rate:  33.405
暂无   65.72791159573977  increase rate:  1.328639557978699
舆情   118.30049997419279  increase rate:  1.591502499870964
依规   46.19917067882814  increase rate:  1.2309958533941407
小家   861.5997133015836  increase rate:  5.307998566507918
联行   717.4444467342136  increase rate:  4.587222233671068
色情   495.8465022941798  increase rate:  3.479232511470899
发短信   229.85858464473412  increase rate:  2.149292923223671
同学   460.2147756270322  increase rate:  3.301073878135161
称今   322.5007810229917  increase rate:  2.6125039051149583
音乐声   268.45944174406657  increase rate:  2.342297208720333
食客   268.45944174406657  increase rate:  2.342297208720333
刑警队   1726.39939301

AttributeError: '_CFSubcluster' object has no attribute 'centroid_'

#### print result ####

In [10]:
min_value = 0 
for i in range(30,40):
    kmodel = KMeans(n_clusters=i,
                    n_init=10,
                    max_iter=700,
                    init='k-means++',
                   n_jobs=-1).fit(weight)
    harabaz_value = calinski_harabaz_score(weight,kmodel.labels_)
    silhouette_value = silhouette_score(weight,kmodel.labels_)
    print('final value: ',harabaz_value," n:",i)
    if harabaz_value > min_value:
        min_value = harabaz_value
        labels_ = kmodel.labels_
    
details = {}
kind = df[2].tolist()
for i in range(len(data)):
    if labels_[i] not in details:
        details[labels_[i]] = [[data[i]], [kind[i]],[i]]
    else:
        details[labels_[i]][0].append(data[i])
        details[labels_[i]][1].append(kind[i])
        details[labels_[i]][2].append(i)

        
with open("输出.csv","w",encoding="utf-8",newline="") as datacsv:
#dialect为打开csv文件的方式，默认是excel，delimiter="\t"参数指写入的时候的分隔符
    csvwriter = csv.writer(datacsv,dialect = ("excel"))
    #csv文件插入一行数据，把下面列表中的每一项放入一个单元格（可以用循环插入多行）
    for key, list in details.items():
        for i in range(len(list[0])):
            csvwriter.writerow([key, list[1][i], list[0][i]])
        

final value:  29206.380118191595  n: 30
final value:  29562.68967825002  n: 31
final value:  29769.73511323783  n: 32
final value:  30988.66218110418  n: 33
final value:  31961.804136242394  n: 34
final value:  30797.247071759906  n: 35
final value:  31305.91581852407  n: 36
final value:  33899.58693729119  n: 37
final value:  34651.69605395048  n: 38
final value:  34415.56175589724  n: 39


20

18

15

14

17

21

17

18

14

20

20

14

11

14

11

14

11

8

11

14

11

11

11

11

11

13

13

22

16

28

13

27

13

13

16

16

16

44

45

42

58

43

36

46

46

42

55

46

39

55

36

41

39

47

46

52

58

55

46

55

55

12

12

13

12

12

12

19

9

9

12

12

9

12

12

12

15

12

12

9

12

9

9

9

12

9

9

19

19

12

15

15

9

12

9

12

12

12

15

9

12

9

15

9

9

12

9

9

9

12

9

12

12

12

9

12

13

12

15

15

9

15

9

12

9

12

16

15

20

15

12

9

9

16

9

19

12

16

12

12

15

18

12

13

12

12

9

16

25

16

12

15

12

12

15

9

15

15

27

12

12

15

9

15

12

9

12

18

9

12

9

7

12

21

12

14

12

12

14

15

15

16

14

21

14

12

12

15

13

24

14

14

12

15

21

32

15

12

19

27

21

18

14

21

9

14

12

30

14

23

14

12

14

12

18

12

14

14

21

15

12

24

40

12

15

13

15

12

36

25

12

15

9

12

13

24

16

16

28

12

18

18

16

18

25

21

21

12

12

12

25

18

15

20

15

21

12

18

12

15

15

15

12

12

24

18

18

15

17

15

12

16

12

15

15

18

16

15

12

31

30

27

21

9

16

18

15

15

27

24

27

21

18

18

18

42

27

12

40

16

12

12

29

12

31

33

27

15

21

24

26

10

38

15

22

23

23

12

15

27

9

21

32

34

9

15

9

12

18

12

18

15

12

12

12

12

12

12

15

15

15

18

18

22

27

9

30

27

27

21

31

24

21

38

39

16

10

13

13

33

26

19

10

13

31

19

16

22

13

16

13

16

28

19

13

10

13

19

10

10

13

40

28

18

31

15

24

30

15

27

36

30

27

24

15

24

20

18

24

15

21

18

18

21

20

38

21

28

38

18

27

21

18

21

18

18

24

25

25

24

42

38

32

27

15

15

27

24

27

18

15

26

21

22

21

31

24

30

24

20

25

28

25

17

29

29

25

13

13

22

22

22

22

35

29

22

20

23

23

26

20

20

23

26

30

15

22

12

23

15

18

37

12

17

18

16

19

15

28

22

12

16

22

19

16

31

10

14

12

13

10

9

46

20

19

31

30

15

38

22

25

35

32

21

19

16

16

13

16

31

25

29

36

23

29

22

25

31

36

29

23

13

24

15

21

12

12

12

23

12

21

12

12

12

12

12

12

31

12

12

12

15

15

21

12

26

15

12

12

12

15

9

30

12

12

12

15

12

19

12

18

32

15

19

15

25

15

12

12

12

24

31

26

12

12

35

15

12

12

19

12

9

15

15

21

13

15

19

15

16

12

18

12

24

12

12

15

15

19

15

15

15

15

30

15

21

12

19

16

19

24

24

19

24

27

21

26

21

21

15

15

15

16

28

28

46

41

44

35

41

50

45

48

27

12

18

12

9

18

31

21

18

25

12

15

15

15

33

24

40

35

37

34

37

35

40

46

52

46

26

23

20

14

14

23

39

34

36

43

29

23

34

44

36

36

40

27

26

29

30

27

32

40

39

37

40

36

33

39

33

29

29

35

27

27

33

20

27

33

33

23

31

36

40

31

32

26

29

32

24

32

20

30

29

46

31

33

30

36

35

40

36

30

12

12

14

12

12

15

12

15

12

23

12

12

12

12

18

15

14

17

22

9

27

14

19

13

12

15

12

14

15

18

14

33

12

19

44

14

14

13

21

29

16

31

19

15

22

15

12

24

30

12

15

21

42

12

31

12

35

15

32

32

16

12

18

30

18

28

18

19

24

19

12

12

45

15

20

18

18

15

30

24

26

31

21

31

37

25

16

16

18

22

21

21

12

34

15

27

12

27

15

27

19

12

19

25

28

19

12

16

23

20

18

18

29

35

18

27

24

33

30

20

24

33

9

15

15

15

9

21

36

12

27

12

30

12

15

18

12

14

14

12

19

12

15

12

15

15

21

15

12

15

24

15

15

12

15

15

12

15

15

15

21

15

15

30

15

15

21

14

12

15

24

12

18

15

15

18

12

18

15

25

12

15

15

15

15

18

15

12

12

12

18

15

30

21

12

24

27

21

17

24

15

15

12

12

15

15

12

21

15

12

15

21

15

12

15

12

21

21

21

39

15

15

12

15

15

15

21

12

18

21

15

21

15

15

14

15

15

15

15

12

12

15

15

18

12

18

18

30

18

23

12

15

9

9

12

9

16

30

24

30

30

21

30

34

16

24

31

23

16

16

15

22

12

12

16

12

12

22

27

12

28

12

12

12

12

12

25

24

12

12

37

22

18

12

15

12

16

13

20

12

22

19

12

30

12

12

12

12

28

12

28

12

34

37

25

32

28

12

13

12

12

25

32

28

16

25

18

12

16

30

12

28

12

12

12

15

12

19

24

12

18

12

14

12

12

28

12

21

12

46

21

25

12

12

28

12

15

25

29

19

18

12

12

37

10

12

17

12

12

12

12

28

16

23

12

36

19

22

32

19

12

16

12

21

25

16

27

12

25

16

32

25

27

12

27

22

18

27

21

18

18

15

18

24

24

21

16

12

38

34

21

15

18

20

21

12

15

18

15

21

15

21

18

32

9

12

21

12

15

15

15

18

15

15

16

21

25

21

27

24

16

18

24

20

18

15

22

21

18

25

22

18

16

13

16

20

13

22

16

20

16

19

15

15

12

21

15

27

15

18

12

18

18

24

17

20

21

17

27

24

20

18

18

9

15

20

15

15

17

15

15

15

12

15

17

12

12

20

15

15

9

17

23

17

12

21

21

24

12

9

26

12

18

12

25

22

21

18

12

15

25

21

18

21

12

18

15

17

14

16

12

12

18

17

23

10

27

21

25

18

18

20

18

18

17

15

19

15

12

17

33

25

21

18

15

18

24

15

18

15

18

20

21

33

18

21

12

15

21

12

21

21

18

21

18

18

27

18

38

18

15

21

15

21

12

42

18

15

15

33

18

38

21

9

12

12

28

20

40

15

27

15

12

9

12

12

9

15

19

22

9

12

9

12

31

12

27

12

15

13

28

9

9

9

13

9

12

15

21

15

25

15

30

18

24

15

9

19

15

9

9

16

9

9

19

9

29

20

9

9

24

24

15

21

24

12

15

24

27

12

12

18

18

12

26

9

9

19

18

12

15

12

21

12

15

12

24

22

12

20

15

28

16

46

41

12

15

12

15

9

15

9

9

21

15

9

9

18

9

9

22

12

15

21

18

18

9

9

15

26

12

9

12

28

9

12

9

9

9

15

21

15

9

35

9

21

20

12

9

9

18

9

21

9

12

12

15

40

16

12

28

28

18

9

12

9

21

21

33

21

21

18

38

22

19

18

21

21

25

22

24

12

10

13

19

27

18

34

9

16

18

13

15

37

24

12

20

9

12

12

18

19

32

9

15

10

19

12

18

19

24

12

10

10

13

19

15

12

10

10

10

12

13

24

10

15

17

12

12

12

12

15

21

27

15

28

45

33

15

15

24

18

35

33

19

21

34

15

15

21

24

20

18

27

18

44

36

41

18

9

18

15

12

25

21

24

24

17

15

18

19

18

18

22

18

12

21

24

15

16

21

24

27

16

21

12

27

15

24

12

10

27

35

16

15

18

21

12

18

20

32

17

27

27

27

27

25

14

12

14

14

14

15

20

23

18

52

52

37

33

30

43

52

37

39

42

45

43

32

33

29

15

33

19

21

22

21

40

34

38

40

25

24

19

22

17

27

15

17

28

22

31

33

21

15

24

23

18

14

30

18

20

29

15

19

20

17

17

13

13

26

17

21

24

16

40

32

33

28

21

42

9

15

16

16

12

12

30

33

19

35

12

30

30

24

27

40

24

22

40

49

18

33

15

28

30

30

15

29

15

22

30

12

15

21

48

16

30

38

26

34

40

24

31

42

46

42

13

20

30

35

22

30

16

13

15

24

27

42

21

22

17

43

28

17

20

20

17

45

26

15

39

30

30

24

36

37

45

16

30

15

15

12

15

12

15

14

12

36

21

22

12

18

12

18

16

18

12

12

12

24

15

20

15

15

24

15

18

12

15

15

12

25

20

24

12

21

15

12

39

15

12

27

43

12

14

30

15

27

12

12

12

18

16

15

25

15

12

15

15

18

19

32

15

12

18

24

9

24

9

15

18

15

12

12

11

12

23

15

18

12

15

18

12

9

30

9

21

15

18

18

22

27

15

12

18

12

46

19

13

25

21

13

19

13

19

16

19

13

22

36

16

19

19

16

13

19

16

16

19

13

21

12

14

9

9

12

9

15

12

21

12

17

21

18

18

18

33

35

12

15

15

15

15

12

14

26

23

12

22

15

15

20

9

12

12

15

21

21

12

26

21

15

12

21

27

18

9

12

18

17

15

12

24

12

20

12

18

18

24

12

9

32

15

28

27

12

27

30

21

13

15

21

12

15

18

15

24

12

12

19

12

18

12

12

15

23

18

12

39

24

18

28

12

17

17

18

12

18

12

15

21

21

18

18

24

36

26

18

27

16

15

29

25

12

15

12

27

18

12

21

9

21

12

16

15

15

12

12

15

21

16

18

18

15

33

12

12

28

15

21

30

12

27

9

12

39

12

12

15

16

21

21

18

9

15

24

15

12

15

12

9

15

12

12

47

37

29

15

12

18

12

12

22

15

20

15

15

22

18

12

24

15

21

21

21

13

12

12

18

12

12

15

18

18

12

18

12

18

9

31

27

42

21

21

15

15

18

18

12

42

42

21

27

12

12

33

15

15

12

35

45

18

18

28

22

18

12

12

12

18

12

9

12

27

15

28

18

20

12

20

24

12

31

12

12

12

18

12

18

17

15

17

30

27

24

9

31

12

12

21

15

9

18

24

15

9

12

27

24

27

29

15

18

21

9

21

12

15

18

21

15

12

12

12

12

13

33

27

12

27

15

12

35

10

24

15

10

25

27

26

10

12

34

32

19

16

20

15

13

9

10

15

10

12

19

7

15

16

15

15

19

17

33

21

21

19

24

23

10

27

15

16

19

10

21

18

18

10

28

15

14

19

18

23

20

17

9

13

16

16

54

17

10

12

10

15

30

21

15

10

13

23

48

13

16

19

16

13

16

16

13

16

10

13

19

16

13

24

16

16

27

13

13

13

13

10

13

13

13

13

10

17

17

14

17

24

24

26

17

21

24

36

18

18

15

18

15

18

15

18

15

18

27

17

14

17

14

14

26

24

21

27

18

41

15

30

15

27

28

24

19

30

22

39

39

22

33

25

19

18

12

12

12

12

12

12

12

9

12

12

12

9

18

12

22

18

18

19

18

35

13

13

9

16

15

22

13

12

13

12

22

21

9

16

16

9

24

16

12

15

13

13

16

16

15

9

13

9

24

12

7

16

15

9

16

17

12

27

21

18

15

30

30

26

20

15

18

16

16

13

16

19

15

18

15

26

17

9

9

21

16

25

9

15

15

21

12

15

15

18

15

32

9

21

15

24

9

9

12

12

15

12

24

12

19

12

9

15

21

26

12

15

12

9

21

24

12

15

15

18

12

15

19

15

15

18

15

15

12

15

9

18

24

21

21

12

12

12

15

15

18

9

25

12

12

15

15

15

12

15

18

15

12

13

21

12

15

15

12

12

26

12

15

18

15

21

9

15

13

9

10

12

17

15

12

15

21

13

15

21

15

12

18

18

16

15

15

12

15

15

21

12

15

15

15

15

18

16

12

24

27

24

12

16

16

12

15

16

16

12

19

18

31

15

27

19

27

25

26

12

21

18

21

18

28

24

19

18

18

24

18

31

21

21

18

12

15

46

18

37

21

18

15

24

24

18

21

15

15

16

21

21

19

18

21

18

27

19

29

21

13

22

25

18

24

18

27

12

21

24

33

15

24

18

28

31

23

28

18

42

16

27

27

12

12

18

9

18

19

9

22

19

23

18

15

21

18

30

31

15

18

18

24

28

18

21

28

20

15

15

19

21

18

18

19

24

18

24

21

18

18

23

15

18

15

20

21

21

22

22

18

23

35

15

18

27

24

18

24

15

27

18

15

18

24

23

24

15

21

19

19

21

32

24

18

19

21

25

19

15

21

19

30

25

18

26

26

26

19

18

25

47

31

19

21

18

28

21

21

21

32

19

35

25

30

18

18

31

28

16

24

20

21

32

15

21

15

33

18

12

18

25

18

18

22

27

19

18

25

21

19

18

25

25

16

31

29

31

27

36

19

26

21

16

23

37

19

31

13

22

29

29

23

12

12

12

18

30

12

12

18

12

21

27

21

39

18

15

24

15

18

15

9

12

12

18

12

12

24

24

31

27

37

12

15

12

31

12

17

19

12

29

14

15

12

16

12

12

9

12

12

15

12

22

12

12

12

12

21

12

12

12

14

21

15

12

18

12

12

12

12

12

18

21

23

15

12

12

15

12

15

24

18

12

18

27

12

12

15

12

15

15

12

18

12

12

15

9

12

12

12

12

12

21

30

15

12

27

12

35

15

12

19

18

15

15

18

12

15

15

10

24

12

27

9

12

12

15

15

21

15

12

28

14

12

23

18

36

22

24

21

32

24

27

21

20

30

27

27

39

36

21

25

33

25

21

27

15

24

27

47

25

21

21

15

25

21

27

15

21

18

44

27

30

18

17

21

32

18

27

21

15

21

18

26

26

26

19

21

25

18

21

24

18

15

36

30

30

30

31

35

25

21

26

24

31

24

31

21

24

24

17

18

21

21

25

22

19

27

16

23

13

21

29

20

24

24

16

21

26

25

15

13

22

41

22

32

19

21

16

19

34

21

21

29

15

15

20

15

24

20

15

22

21

23

22

18

21

15

15

19

20

9

19

18

18

18

24

23

18

21

35

15

24

23

29

31

20

20

20

20

21

18

12

12

30

15

15

19

18

16

33

22

23

19

31

26

15

23

21

29

15

15

20

31

27

26

12

22

21

26

12

12

20

29

33

12

22

18

23

13

16

24

23

15

18

19

16

12

18

15

20

18

28

22

21

22

32

23

27

15

16

18

26

21

25

24

19

18

15

20

16

18

30

21

12

23

16

17

19

18

24

20

12

25

15

24

21

16

21

22

12

19

21

16

25

25

41

44

50

18

12

16

12

21

22

23

16

15

25

22

15

15

19

20

29

19

9

18

22

45

15

19

15

21

12

33

19

12

18

22

15

27

23

18

20

19

18

22

22

15

28

18

16

15

26

19

18

9

20

27

20

15

22

15

18

18

18

16

18

21

15

15

15

19

26

19

15

18

23

18

24

23

24

21

21

24

29

25

18

16

35

18

47

18

23

19

28

16

22

36

16

18

21

19

19

19

22

21

16

19

12

13

29

20

28

19

28

22

15

15

23

23

18

15

17

26

39

34

12

20

19

18

16

19

19

22

19

31

19

19

21

18

34

15

16

24

25

24

18

24

20

19

22

18

18

22

12

13

29

18

37

17

24

18

12

23

15

16

24

18

20

18

30

18

21

12

20

18

20

16

18

18

30

25

16

19

23

21

15

18

27

24

22

13

18

22

20

15

12

15

28

9

18

22

21

22

18

26

21

18

20

16

16

27

20

21

22

28

24

12

16

24

21

23

13

22

19

51

12

21

19

23

15

27

16

23

21

15

9

19

19

15

17

19

25

15

22

26

23

33

15

21

15

18

13

17

20

30

12

15

15

12

12

18

12

12

9

9

15

21

12

15

33

22

12

19

10

13

16

20

15

14

15

21

20

15

18

17

21

17

15

21

18

15

15

18

23

23

21

17

17

17

17

18

23

20

24

15

15

14

17

17

14

18

15

15

15

15

19

14

20

20

17

17

18

17

21

19

17

23

14

17

17

14

17

17

17

24

23

26

14

17

18

17

20

17

17

23

20

14

16

19

17

19

17

27

19

20

14

26

46

17

17

14

17

17

13

16

14

19

13

13

25

24

12

24

12

25

39

9

9

21

28

12

18

33

35

49

9

26

21

25

28

7

7

38

39

36

27

12

27

21

14

21

12

14

34

12

25

15

9

12

12

15

27

12

15

22

15

18

12

12

25

12

12

24

30

21

36

12

18

17

21

21

14

17

29

29

14

18

21

24

20

17

12

14

12

17

18

21

20

14

23

12

18

21

20

21

14

27

14

21

31

18

26

25

9

35

27

18

21

18

12

21

24

9

21

12

12

12

12

15

18

18

15

18

15

15

18

15

22

29

43

37

28

21

29

25

18

38

21

27

28

24

39

27

33

32

32

30

23

38

24

27

18

21

37

21

26

27

30

27

36

32

21

41

29

20

24

33

18

24

26

24

24

32

27

35

21

31

15

18

24

21

24

33

32

20

23

24

24

32

41

29

15

27

24

24

36

24

24

27

28

24

25

15

22

28

25

39

28

37

21

51

21

14

36

24

24

36

15

19

18

35

35

20

20

30

36

18

41

20

20

39

28

29

38

30

26

14

24

30

47

29

23

19

21

30

21

35

35

27

18

21

15

35

31

27

32

35

33

44

29

38

17

28

27

38

49

36

45

31

32

20

20

20

34

36

35

23

36

37

36

34

20

42

44

18

33

31

26

19

27

17

21

37

17

43

29

12

24

18

18

12

15

21

12

12

9

21

18

9

12

12

24

12

12

12

12

21

15

15

27

12

18

32

18

12

21

19

15

15

27

25

12

36

26

21

24

23

20

20

20

16

16

16

17

27

16

17

19

19

16

28

24

28

19

19

28

22

31

19

28

28

28

28

28

17

25

17

17

17

17

20

17

17

34

31

18

17

20

20

20

23

26

17

20

23

31

26

24

21

20

20

17

20

20

20

21

20

20

17

20

32

20

36

35

34

25

23

25

17

23

19

22

19

25

28

22

19

20

20

17

26

20

26

23

20

29

20

20

23

23

17

36

17

20

16

16

19

16

19

28

22

19

16

22

19

16

25

19

22

22

16

25

22

19

16

30

16

22

16

17

22

16

19

19

19

16

17

23

20

19

40

19

19

16

16

22

16

16

17

18

17

23

26

17

28

17

20

20

32

19

47

29

33

25

22

16

17

17

23

17

17

17

17

17

17

20

20

17

17

17

20

13

16

13

13

13

13

13

16

28

25

16

28

25

31

25

14

14

17

14

17

14

14

14

17

16

14

16

14

13

14

17

17

17

17

20

17

14

17

17

20

17

17

17

17

17

17

14

17

17

20

17

16

13

13

16

16

17

17

17

17

17

17

17

17

17

17

17

17

17

17

17

17

17

17

17

17

17

17

17

14

16

16

16

16

13

16

19

16

16

16

16

19

16

13

16

16

16

16

19

13

19

16

16

13

13

16

13

14

16

16

16

19

13

16

19

17

16

11

17

14

17

17

17

27

29

23

24

17

21

23

16

26

23

25

13

16

13

13

16

23

20

13

22

13

13

16

17

20

13

21

16

22

34

22

17

28

16

26

27

22

13

16

20

33

23

27

21

16

20

19

19

19

30

27

28

16

20

20

16

25

17

16

13

16

16

24

24

24

19

19

16

23

19

16

25

19

22

17

22

26

23

16

47

25

17

20

19

19

19

16

16

16

19

19

19

19

16

17

25

19

20

28

25

20

20

25

25

22

16

19

16

17

25

20

22

23

17

22

20

25

15

20

19

19

22

28

19

28

20

20

22

39

28

17

32

23

17

19

25

22

20

28

28

19

23

22

19

32

26

22

19

19

16

19

20

31

22

16

19

19

16

19

22

22

19

16

19

23

25

16

22

25

19

19

22

34

25

16

22

19

17

16

16

17

20

19

16

16

26

27

34

28

22

24

30

25

19

39

22

34

24

30

24

33

17

16

16

13

16

13

16

16

16

13

13

13

16

16

13

13

16

16

13

16

16

16

16

16

13

13

13

16

16

19

25

14

14

14

14

14

14

17

14

17

14

14

13

17

16

16

19

13

13

16

16

16

13

16

19

16

13

19

16

17

14

13

12

18

18

19

18

15

15

15

15

13

16

27

19

21

18

18

22

20

22

18

21

19

21

19

25

19

18

19

25

22

23

25

22

23

21

21

18

24

22

15

21

19

18

21

21

21

19

24

21

18

18

25

18

27

21

34

24

27

18

22

24

21

18

30

24

27

18

24

24

35

33

31

26

25

25

25

31

25

35

31

16

19

37

22

23

27

21

30

24

40

31

29

28

21

27

27

24

27

21

27

30

38

36

24

40

23

27

27

24

24

27

22

24

39

46

24

48

24

24

27

24

24

24

24

27

29

21

37

22

26

26

33

21

21

27

18

21

22

24

21

37

21

18

25

25

21

35

33

21

43

31

26

21

27

26

21

24

27

42

37

16

16

16

16

16

16

16

16

19

16

16

16

16

16

16

16

16

16

16

16

16

16

16

14

14

16

14

14

14

14

16

14

16

25

23

22

25

22

19

34

29

25

40

28

34

34

20

44

28

33

37

39

33

25

36

25

22

22

35

24

30

22

23

26

22

30

24

43

34

37

25

22

38

23

34

36

28

22

28

29

29

47

28

27

36

27

50

36

26

36

35

41

22

33

22

19

27

23

27

25

40

27

32

25

32

28

29

26

26

24

30

27

28

22

25

30

42

36

22

22

29

22

23

29

25

28

22

28

22

22

30

28

27

22

27

22

26

28

20

31

28

33

27

30

22

41

42

32

22

33

23

23

26

27

29

29

22

31

46

29

36

34

30

30

20

37

28

19

31

32

23

25

22

25

26

22

22

30

25

22

34

47

19

22

25

23

28

30

30

29

19

31

22

20

31

28

25

22

19

23

23

20

22

15

26

22

27

35

22

20

23

20

19

22

35

35

21

45

22

28

31

19

22

29

34

21

37

19

45

29

22

22

22

25

24

29

21

37

19

34

19

25

22

19

19

22

19

34

34

28

19

22

32

28

16

20

19

22

16

19

16

19

16

22

19

19

22

16

19

19

22

19

19

34

19

19

16

22

19

19

17

16

16

19

19

19

16

19

17

22

19

14

19

25

38

22

42

22

37

37

36

31

25

41

33

34

19

30

37

25

22

30

28

19

28

25

48

22

31

56

25

30

20

20

38

53

50

43

22

34

22

28

32

25

34

25

42

25

25

31

30

32

19

42

25

29

39

25

35

28

48

25

26

26

28

37

25

31

28

28

28

25

34

32

22

28

19

31

28

31

25

28

41

25

32

28

19

38

25

19

25

34

19

20

37

23

44

19

36

30

25

18

18

15

15

15

18

15

18

15

18

18

15

15

15

18

15

18

21

15

15

15

18

18

18

18

18

18

18

15

26

29

29

32

28

25

22

22

25

22

20

32

46

28

13

13

13

13

18

18

21

18

21

18

21

18

18

18

21

18

18

18

18

21

18

21

21

21

18

18

18

21

21

21

36

24

37

24

30

24

33

27

24

36

30

21

21

21

38

21

21

27

21

26

27

27

30

24

32

36

36

36

36

21

19

17

17

17

31

43

28

27

26

16

18

19

19

25

29

16

17

39

23

24

20

20

20

16

23

34

37

42

22

22

29

20

16

34

16

34

22

19

19

40

25

27

31

27

28

19

28

27

19

19

23

36

21

21

24

24

23

19

25

25

21

21

24

27

28

21

27

23

21

20

19

22

27

22

24

21

19

24

22

22

23

21

19

22

24

27

23

24

30

23

19

19

22

21

24

21

24

31

25

39

24

22

22

26

19

21

21

21

24

21

30

22

19

23

26

27

21

21

21

22

23

21

19

21

24

21

22

23

21

24

21

21

25

22

30

25

21

21

21

21

31

21

22

26

24

23

29

24

21

21

24

21

21

21

24

21

24

21

25

24

21

21

21

22

24

17

21

21

24

24

22

21

21

21

27

21

25

26

30

27

23

23

17

23

24

21

27

21

24

21

21

21

27

33

24

24

21

21

22

21

21

32

17

18

24

20

18

18

17

21

17

17

21

14

14

14

14

14

17

16

14

16

13

16

13

19

13

13

19

37

19

42

44

47

34

31

28

35

38

41

31

31

31

19

26

16

16

22

31

16

26

16

22

22

26

22

20

28

22

29

33

37

25

16

19

22

13

19

29

13

25

25

28

13

19

16

30

13

31

26

22

16

13

19

13

31

34

13

19

22

32

22

16

31

23

13

21

16

22

22

19

19

16

19

28

19

16

16

17

23

22

13

16

16

19

19

16

16

28

22

34

24

22

29

19

35

28

22

19

22

19

32

25

18

29

16

27

14

14

21

27

17

13

14

16

16

19

26

16

29

24

34

39

34

31

31

13

31

16

36

37

37

27

30

13

32

28

40

20

34

14

25

36

25

16

19

18

20

23

34

20

28

22

19

16

19

22

16

26

31

16

30

14

16

29

16

25

24

26

23

26

16

13

13

16

16

19

20

17

17

16

16

16

19

17

16

17

13

13

13

19

13

16

16

19

13

17

16

16

16

20

23

16

16

19

20

16

17

17

14

11

16

14

16

19

20

19

22

19

16

16

19

16

16

13

17

19

16

16

13

17

19

18

19

20

16

16

16

16

16

16

16

16

16

16

19

16

16

16

16

16

16

16

19

19

16

16

16

16

37

28

25

31

25

19

22

19

27

39

24

22

25

27

28

19

28

25

31

24

26

19

46

25

27

22

16

19

22

25

28

31

19

16

19

22

38

19

19

19

33

25

25

22

22

20

22

22

22

26

16

17

20

23

23

22

25

40

18

18

17

17

15

17

17

15

20

15

17

20

18

15

17

20

17

15

17

15

15

15

15

15

15

17

15

17

15

15

15

15

17

20

15

18

20

20

15

20

17

17

15

17

15

20

17

17

17

15

17

17

17

17

20

15

15

17

17

17

17

17

17

17

17

17

15

17

17

17

16

17

17

17

17

17

17

17

17

13

17

17

20

17

17

17

20

20

17

20

16

13

13

13

13

13

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

17

14

17

14

14

14

14

17

14

14

19

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

14

12

14

14

14

14

14

14

17

17

17

14

12

17

14

12

14

14

14

14

14

30

21

21

27

24

25

24

27

30

24

25

30

18

18

27

21

22

25

27

31

22

16

27

19

41

56

44

42

41

38

43

43

40

43

40

40

31

23

23

23

23

23

23

26

23

12

12

15

16

15

16

12

15

23

31

27

30

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

15

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

10

12

12

12

12

12

12

12

12

12

12

12

12

10

12

12

12

12

12

12

12

12

12

12

10

10

12

12

12

12

12

12

12

12

12

12

10

12

10

12

12

10

12

12

12

10

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

10

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

10

12

10

12

12

12

12

12

12

12

12

12

12

12

12

12

10

12

12

12

12

12

12

12

12

12

12

12

10

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

12

15

18

18

18

21

18

18

18

18

27

15

18

18

27

21

15

18

15

21

18

21

21

21

29

16

13

18

28

21

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

15

18

12

15

15

12

13

15

18

15

12

15

15

15

15

15

18

15

15

15

13

In [None]:
print()
for key,value in details.items():
    if key == 1:
        break
word_count = {}
for item in value[2]:
    for string in data_keywords[item].split(" "):
        if string in word_count:
            word_count[string] += 1
        else:
            word_count[string] = 1
word_count_list = sorted(word_count.items(),
                         key=lambda item: item[1],
                         reverse=True)
for i in range(len(word_count_list)):
    print(word_count_list[i])