### dependence ###

In [6]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import hierarchical
from sklearn.cluster import estimate_bandwidth
from sklearn.metrics import calinski_harabaz_score
from sklearn.metrics import silhouette_score
from jieba.analyse import *
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# from Bio.Cluster import kcluster
# import synonyms
import pandas as pd
import jieba
jieba.add_word("报警人", freq=56, tag='nr')
jieba.add_word("系", freq=56, tag='r')
from math import isnan
import math
import numpy as np
import re
import csv
import jieba.posseg as pseg

### main function ###

In [7]:
n_clusters = 24

def deal_sklearn_model(model,
                       weight,
                       data,
                       df,
                       X,
                       print_details=0,
                       distance='euclidean'):
    kind = df[1].tolist()
    if print_details:
        print(model.labels_)
        print(len(model.labels_))
        details = {}
        for i in range(len(data)):
            if model.labels_[i] not in details:
                details[model.labels_[i]] = [[data[i]], [kind[i]]]
            else:
                details[model.labels_[i]][0].append(data[i])
                details[model.labels_[i]][1].append(kind[i])
        number = 0
        for key, list in details.items():
            this_kind = 0
            print("第%d类" % key)
            for i in range(len(list[0])):
                this_kind = this_kind + 1
                print("属于", list[1][i], ' ', list[0][i])
            print(this_kind)
            print()
            print()
            print()
            number += this_kind
        print("all_number == ", number)
    check_result(weight, model, distance)


    # K-Means 算法
def K_Means(weight, data, df, X, print_details=0):
    print("K_Means算法")
    iter_number = 700
    kmodel = KMeans(n_clusters=n_clusters,
                    n_init=10,
                    max_iter=iter_number,
                    init='k-means++',
                   n_jobs=-1).fit(weight)
    deal_sklearn_model(kmodel, weight, data, df, X, print_details)
    return kmodel.labels_


# DBSCAN 算法
def AAedion_DBSCAN(weight, data, df, X, print_details=0, distance='euclidean'):
    print("DBSCAN算法")
    DBmodel = DBSCAN(eps=0.5, min_samples=10, metric=distance,n_jobs=-1).fit(weight)
    deal_sklearn_model(DBmodel, weight, data, df, X, print_details, distance)
    print("DBSCAN算法完成")
    return DBmodel.labels_


def AAedion_Birch(weight, data, df, X, print_details=0, distance='euclidean'):
    print('Birch算法')
    Birch_model = Birch(n_clusters=n_clusters).fit(weight)
    deal_sklearn_model(
        Birch_model,
        weight,
        data,
        df,
        X,
        print_details,
    )
    return Birch_model.labels_


def AAedion_SpectralClustering(weight,
                               data,
                               df,
                               X,
                               print_details=0,
                               distance='euclidean',n_jobs=-1):
    print('SpectralClustering算法')
    Spe_model = SpectralClustering(n_clusters=n_clusters).fit(weight)
    deal_sklearn_model(
        Spe_model,
        weight,
        data,
        df,
        X,
        print_details,
    )
    return Spe_model.labels_


def AAedion_AffinityPropagation(weight,
                                data,
                                df,
                                X,
                                print_details=0,
                                distance='euclidean'):
    print('AffinityPropagatio算法')
    Aff_model = AffinityPropagation(damping=0.5,
                                    max_iter=200,
                                    convergence_iter=15,
                                    copy=True,
                                    preference=None,
                                    affinity='euclidean',
                                    verbose=False).fit(weight)
    deal_sklearn_model(
        Aff_model,
        weight,
        data,
        df,
        X,
        print_details,
    )
    return Aff_model.labels_


def AAedion_Meanshift(weight,
                      data,
                      df,
                      X,
                      print_details=0,
                      distance='euclidean'):
    print('MeanShift算法')
#     bandwidth = estimate_bandwidth(weight, quantile=0.3,n_jobs=-1)
#    print(bandwidth)
    MeanShift_model = MeanShift(bandwidth=1,
                                seeds=None,
                                bin_seeding=True,
                                min_bin_freq=1,
                                cluster_all=False,
                                n_jobs=-1).fit(weight)
    print('Meanshfit finish')
    deal_sklearn_model(
        MeanShift_model,
        weight,
        data,
        df,
        X,
        print_details,
    )
    return MeanShift_model.labels_


def check_result(weight, model, distance):
    #print("共%d类" %model.labels_)
    print("result of harabaz_score: ",
          calinski_harabaz_score(weight, model.labels_))
    print("result of silhouette_score: ",
          silhouette_score(weight, model.labels_, metric='euclidean'))


def print_TF_IDF_values():
    for i in range(len(weight)):  # 打印每类文本的tf-idf词语权重
        print("-------这里输出第", i, u"类文本的词语tf-idf权重------")
        # for j in range(len(word)):
        print(list(zip(all_word, weight[i])))

### stop word ###

In [8]:
    # 人工停用词表
    special_word = ['称', '，', '。', '在', '年', '月', '(', '（','民警','报警','观音桥','称有','称一','称车','称其','接警','人称']

    # 加载停用词表
    Stopword_FilePath = "../stopWord.txt"
    stopWordList = pd.read_csv(Stopword_FilePath, sep="\r")
    for word in special_word:
        stopWordList.loc[stopWordList.size] = word
    print("停用词表的大小是", stopWordList.size)

    # 加载数据
    FilePath = "../data.csv"
    df = pd.read_csv(FilePath,header=None)

    # 停用词过滤
    data = df[6]
    kind = df[2]
    X = []
    delete = []
    record = {}
    person_number = 0
    place_number = 0
    pronoun_number = 0
    beside_number = 0
    all_number = 0
    flag_record = {}
    flag_form = {}
    flag_result = {}
    data_keywords = []
    print(len(data))
    data_new = []
    kind_new = []
    k = 0
    for i in range(len(data)):
        try:
            if isnan(float(data[i])):
                k += 1
        except ValueError:    
            data_new.append(data[i])
            kind_new.append(kind[i])
    data = data_new
    kind = kind_new
    for i in range(len(data)):
        cut_result = []
        person_name = []
        place_name = []
        pronoun = []
        beside = []
        # 按词性过滤
        #print(data[i])
        k = pseg.cut(data[i])
        for word, flag in k:
            flag_form[word] = flag
            if flag not in flag_record:
                flag_record[flag] = 1
            else:
                flag_record[flag] += 1
                
            if flag == 'nr' or flag == 'nr1' or flag == 'nr2':
                person_name.append(word)
                person_number += 1
                continue
            if flag == 'ns' or flag == 'nz':
                place_name.append(word)
                place_number += 1
                continue
            if flag == 'r' or flag == 'rr' or flag == 'rz':
                pronoun.append(word)
                pronoun_number += 1
                continue
            if flag == 'w' or flag == 'h' or flag == 'k' or flag == 'xx' \
                    or flag == 'o' or flag == 'u' or flag == 'm' or flag == 'd' \
                    or flag == 'f':
                beside.append(word)
                beside_number += 1
                continue
        # 提取关键词列表
        for keyword, weight in extract_tags(data[i], withWeight=True):
            if keyword in person_name or keyword in place_name or keyword in pronoun or keyword in beside:
                all_number += 1
                delete.append(keyword)
                continue
            # if flag_form[keyword] not in flag_result:
            #     flag_result[flag_form[keyword]] = [keyword]
            # else:
            #     flag_result[flag_form[keyword]].append(keyword)
            cut_result.append(keyword)

        # 用停用词库筛选关键词列表
        temp = ""
        for j in cut_result:
            if j in stopWordList.values or re.search("[0-9]+", j) is not None \
                    or re.search("[a-z]+", j) is not None or re.search("[A-Z]+", j) is not None:
                if j not in delete:
                    delete.append(j)
            else:
                if temp == '':
                    temp = j
                else:
                    temp = temp + " " + j
                if j in record:
                    record[j] = record[j] + 1
                else:
                    record[j] = 1
        X.append(temp)
        data_keywords.append(temp)
    flag_record = sorted(flag_record.items(), key=lambda item: item[1], reverse=True)
    result_X = []
    for string in X:
        temp = ''
        for j in string.split(' '):
            if j == '':
                continue
            if record[j] < 5:
                continue
            temp = temp + " " + j
            # print(j)
        result_X.append(temp)
    X_str = np.array(result_X)
    record = sorted(record.items(), key=lambda item: item[1], reverse=True)
    
#     print("被清除的词有", delete)
#     print("最终词典", record)
#     print("过滤结果", X_str)
#     print('person_number: ', person_number)
#     print('place_number: ', place_number)
#     print('pronoun_number: ', pronoun_number)
#     print('beside_number: ', beside_number)
#     print('all_number: ', all_number)
#     print('flag_record: ', flag_record)

    # for w, f in flag_result.items():
    #     print('%s类' % w)
    #     for str in f:
    #         print(str)
    #     print()
    #     print()
    #     print()

停用词表的大小是 2085
6481


### construction TF-IDF ###

In [9]:
vectorizer = CountVectorizer()  # 该类会将文本中的词语转换为词频矩阵，矩阵元素a[i][j] 表示j词在i类文本下的词频
transformer = TfidfTransformer()  # 该类会统计每个词语的tf-idf权值
X = vectorizer.fit_transform(X_str)  # 将文本转为词频矩阵
tfidf = transformer.fit_transform(X)  # 计算tf-idf，
all_word = vectorizer.get_feature_names()  # 获取词袋模型中的所有词语
weight = tfidf.toarray()  # 将tf-idf矩阵抽取出来，元素a[i][j]表示j词在i类文本中的tf-idf权重

# print_TF_IDF_values()
for i in range(len(all_word)):
    print(all_word[i])
print("weight shape:", weight.shape)
print("word shape:", len(all_word))

一人
一村
一条街
一楼
一致意见
一辆车
七天
丈夫
三村
三轮车
上午
上当受骗
上报
上班
上网
上车
下午
下楼
下班
下车
下车时
不上
不予
不到
不合理
不愿
不慎
不接
不明
不清
不理
不用
不知去向
不见
不让
不详
不醒
不需
世纪
业主
丢失
两人
个人信息
中介
中心
中王
串串
串串香
串号
为准
为川
主人
主任
举报
久鼎
之星
乐园
乘坐
乘客
乘车
买于
买单
买成
买车
乱放
争吵
争执
争议
事发
事宜
事情
事故
二村
二楼
二维码
于桥
互殴
亚朵
交到
交巡
交巡警
交由
交给
交警
交通银行
产业园
产生分歧
产生纠纷
人员
人民币
人民法院
人用
人行道
人身安全
今早
付款
付钱
代买
代驾
价值
休息
优易购
会先
会员卡
伤情
伤痕
伤者
位置
住址
住宿
住户
住院部
体育
体育场
体育馆
侦查
便利店
保姆
保管
保险
保险公司
保险杠
信息
信用卡
修理
修理厂
修车
俱乐部
倒车
借款
借给
借贷
借钱
债务
债务纠纷
值班
假冒伪劣
假币
假钞
假钱
做好
做工
做生意
停发
停在逸创
停放
停机
停止
停电
停车
停车位
停车场
停车库
停车费
偷走
储蓄卡
催债
催款
儿子
元假
元后
元许
充值
充电
先到
先行
入住
入口处
入室
全责
八建
八街
公交站
公交车
公交车站
公共
公司
公司员工
公安机关
公寓
公社
六月
关机
关门
关闭
兴业银行
其于
其停
其到
其去
其向
其接
其来
其称
具体位置
具体地址
具体情况
养生
兼职
内包
内因
内存
内挡
内有
内环
冒充
冒用
农业银行
冰毒
凌晨
减小
几人
凯日
出具
出租
出租车
出行
出警
出面
分向
分手
分需
刑事案
刑事案件
刑警支队
刑警队
划伤
划拳
划线
刘皓
刚才
创意
到达
制止
刺伤
前去
前台
前夫
前女友
前妻
前往
前沿
前男友
劝导
劝开
劝解
劝说
办事处
办公室
办理
加油站
助力车
劳动局
劳动部门
劳资
劳资纠纷
勘验
包内
包包
包工头
包房
包被
区内
区域
区挡
医保卡
医生
医药费
医院
匿名
十字路口
协助
协商
协调
单元
单车
卖唱
卖家
卖方
卖淫
卖淫嫖娼
卖花
南汽城
占用
卡包
卡尔斯
印象
卸货
厕所
厨房
反锁
反馈
发来
发现
发现异常
发现自己
发生
发生争执
发

### Cluster ###

#### cluster ####

In [10]:
def clustering(print_details,weight,DB):
    KMeans_labels = K_Means(weight, data, df, X, print_details)
#     if DB == 1:
#         DS_labels = AAedion_DBSCAN(weight, data, df, X, print_details)
#     else:
#         DS_labels = AAedion_SpectralClustering(
#         weight, data, df, X, print_details)
    Birch_labels = AAedion_Birch(weight, data, df, X, print_details)
    all_labels = [
        KMeans_labels, #DS_labels, 
        Birch_labels
    ]
    #print(all_labels[0][1])
    return all_labels


# KMeans_labels = K_Means(weight, data, df, X, print_details)
# DBSCAN_labels = AAedion_DBSCAN(weight, data, df, X, print_details)
# Brich_labels = AAedion_Birch(weight, data, df, X, print_details)
# SpectralClustering_labels = AAedion_SpectralClustering(weight, data, df, X, print_details)

### can't use 

# AAedion_AffinityPropagation(weight, data, df, X, 1)
# AAedion_Meanshift(weight, data, df, X, 1)

#### change weight ####

In [17]:
def deal(all_labels):
    cluster_result = []
    for i in range(0, len(all_labels[0])):
        cluster_result.append('')
    for labels in all_labels:
        for i in range(0, len(labels)):
            cluster_result[i] = cluster_result[i] + " " + str(labels[i])
    return cluster_result


def Change(all_labels, weight):
    cluster_result = deal(all_labels)
    cluster_map = {}
    for i in range(len(cluster_result)):
        string = cluster_result[i]
        if string in cluster_map:
            cluster_map[string][0].append(data[i])
            cluster_map[string][1].append(i)
            cluster_map[string][2] += 1
        else:
            cluster_map[string] = [[data[i]], [i], 1]
    
    cluster_map = sorted(cluster_map.items(),
                         key=lambda item: item[1][2],
                         reverse=True)
    print("聚类总情况数: ", len(cluster_map))
    # for item in cluster_map:
    #     print("所属:  ", item[0], "数量： ", item[1][2])
    goal_words = {}
    for i in range(0, len(cluster_map) // 2):
        #         if cluster_map[i][1][2] < len(data)/40:
        #             continue
        print(i, cluster_map[i][1][2])
        wordcount_record = {}
        for item in cluster_map[i][1][1]:
            #     print(data_keywords[item])
            for word in data_keywords[item].split(" "):
                if word == ' ' or word == '':
                    continue
                if word in wordcount_record:
                    wordcount_record[word] += 1
                else:
                    wordcount_record[word] = 1
        wordcount_record = sorted(wordcount_record.items(),
                                  key=lambda item: item[1],
                                  reverse=True)
        # increase the first and second word
        alpha = 1.5
        for item in wordcount_record[0:2]:
            increase_rate = 1 + alpha * (item[1] / cluster_map[i][1][2])
            print(item[0], " ", item[1]," increase rate: ",increase_rate)
            if item[0] not in goal_words:
                weight = changeWeight(weight, all_word, item[0],
                                      1 + alpha * (item[1] / cluster_map[i][1][2]),
                                      0)
                goal_words[item[0]] = 1
    return weight

def Change_Chi(all_labels, weight):
    types = deal(all_labels) #类list
    corpus = data #数据类
    words = all_word #特征词
    
    cluster_result = types
    cluster_map = {}
    for i in range(len(cluster_result)):
        string = cluster_result[i]
        if string in cluster_map:
            cluster_map[string][0].append(data[i])
            cluster_map[string][1].append(i)
            cluster_map[string][2] += 1
        else:
            cluster_map[string] = [[data[i]], [i], 1]
    
    cluster_map = sorted(cluster_map.items(),
                         key=lambda item: item[1][2],
                         reverse=True)
    print("聚类总情况数: ", len(cluster_map))
    print("最大类的样本数为：", cluster_map[0][1][2])
    
    types_words_ratio = {}
    types_words_N = len(corpus)
    types_words_A = {} #属于某类别ci也含有特征词的文本数目
    types_words_B = {} #不属于某类别ci也含有特征词的文本数目
    types_words_C = {} #属于某类别ci但不含有特征词的文本数目
    types_words_D = {} #不属于某类别ci也不含有特征词的文本数目
    
    for i in range(len(cluster_map)):
        t = cluster_map[i][0]
        for w in words:
            pair = t + w;
            for i in range(len(corpus)):
                if types[i] == t and w in corpus[i]:
                    types_words_A[pair] = types_words_A.get(pair, 0) + 1
                    continue
                if types[i] != t and w in corpus[i]:
                    types_words_B[pair] = types_words_B.get(pair, 0) + 1
                    continue
                if types[i] == t and w not in corpus[i]:
                    types_words_C[pair] = types_words_C.get(pair, 0) + 1
                    continue
                if types[i] != t and w not in corpus[i]:
                    types_words_D[pair] = types_words_D.get(pair, 0) + 1
            
    words_suit_types = {} #CHI值所对应的类
    words_suit_marks = {} #CHI值
    words_types_marks = {} #不同类的CHI排序
    for i in range(len(cluster_map)):
        words_types_marks[cluster_map[i][0]] = {}
        
    for word in words:
        for i in range(len(cluster_map)):
            t = cluster_map[i][0]
            pair = t + word
            CHI = types_words_N * math.pow((types_words_A.get(pair, 0) * types_words_D.get(pair, 0) - types_words_B.get(pair, 0) * types_words_C.get(pair, 0)), 2) 
            CHI = CHI / ((types_words_A.get(pair, 0) + types_words_C.get(pair, 0)) * (types_words_B.get(pair, 0) + types_words_D.get(pair, 0)))
            CHI = CHI / ((types_words_A.get(pair, 0) + types_words_B.get(pair, 0)) * (types_words_C.get(pair, 0) + types_words_D.get(pair, 0)))
            if CHI > words_suit_marks.get(word, 0):
                words_suit_marks[word] = CHI
                words_suit_types[word] = t
        types = words_suit_types[word]
        marks = words_suit_marks[word]
        words_types_marks[types][word] = marks
            
    for key in words_types_marks.keys():
        temp = words_types_marks[key]
        if len(temp) == 0:
            continue
        wordcount_record = sorted(temp.items(),
                                  key=lambda item: item[1],
                                  reverse=True)
        # increase the first and second word
        alpha = 0.005
        if len(wordcount_record) >= 2:
            for item in wordcount_record[0:2]:
                increase_rate = 1 + alpha * item[1] / cluster_map[i][1][2]
                print(item[0], " ", item[1]," increase rate: ",increase_rate)
                weight = changeWeight(weight, all_word, item[0], 1 + alpha * (item[1] / cluster_map[i][1][2]),0)
        else:
            item = wordcount_record[0]
            increase_rate = 1 + alpha * item[1] / cluster_map[i][1][2]
            print(item[0], " ", item[1]," increase rate: ",increase_rate)
            weight = changeWeight(weight, all_word, item[0], 1 + alpha * (item[1] / cluster_map[i][1][2]),0)
            
    return weight
      
    

def changeWeight(weight, all_word, word, change_rate, print_details=0):
    change_array = np.eye(weight.shape[1])
    for i in range(len(all_word)):
        if all_word[i] == word:
            break
    change_array[i][i] = change_rate
    weight = np.dot(weight, change_array)
    return weight

#### training ####

In [18]:
print_details = 0
DB = 0
for i in range(0, 1):
    print('第%d次' % (i + 1))
    all_labels = clustering(print_details, weight, DB)
    weight = Change_Chi(all_labels, weight)

第1次
K_Means算法




result of harabaz_score:  34.398627896781136
result of silhouette_score:  0.027353240016782077
Birch算法




result of harabaz_score:  24.86988909316566
result of silhouette_score:  0.0028500975515811824
聚类总情况数:  247
最大类的样本数为： 648
死亡   163.08678767658392  increase rate:  1.8154339383829197
精神病   109.51301935789515  increase rate:  1.5475650967894756
出警   1176.3677290262879  increase rate:  6.881838645131439
不需   596.0213460251091  increase rate:  3.9801067301255455
拟立   2129.4033445386863  increase rate:  11.647016722693431
案件   2035.2379737064537  increase rate:  11.17618986853227
车上   2385.288005338138  increase rate:  12.926440026690692
下车   2339.2141511468662  increase rate:  12.69607075573433
车辆通行   2364.7413580627585  increase rate:  12.823706790313793
后经   1310.3257242580446  increase rate:  7.551628621290223
噪音   3392.7828564057368  increase rate:  17.963914282028686
扰民   3373.513189268558  increase rate:  17.86756594634279
派出所   958.7781132771189  increase rate:  5.793890566385595
旅客   749.0544239929046  increase rate:  4.745272119964524
透露   572.4087752210108  increase rate:  3.8620

平台   141.55514383019914  increase rate:  1.7077757191509957
借给   69.91994819574033  increase rate:  1.3495997409787017
包包   35.15745373391044  increase rate:  1.175787268669552
解决问题   283.6524673678742  increase rate:  2.418262336839371
抵押   175.88307154778792  increase rate:  1.8794153577389396
系汪   69.91994819574033  increase rate:  1.3495997409787017
路边   34.61608123866542  increase rate:  1.173080406193327
好利来   377.7369254048383  increase rate:  2.8886846270241913
叫醒   197.4302050247244  increase rate:  1.987151025123622
亚朵   610.285349032385  increase rate:  4.051426745161925
盗刷   430.9379428658527  increase rate:  3.1546897143292636
逃离现场   1171.6895258170834  increase rate:  6.8584476290854175
打人者   877.0748100886435  increase rate:  5.385374050443217
放学   491.85703684209494  increase rate:  3.459285184210475
喝完   290.925974516889  increase rate:  2.4546298725844453
户名   565.3994534122489  increase rate:  3.8269972670612447
账号   302.4711818087474  increase rate:  2.5123559090437

违停   271.42940461354226  increase rate:  2.3571470230677116
车子   223.38007000395035  increase rate:  2.116900350019752
合租房   271.42940461354226  increase rate:  2.3571470230677116
被划   189.50151102028437  increase rate:  1.9475075551014218
物业   150.2682469300006  increase rate:  1.751341234650003
印象   104.54039578558772  increase rate:  1.5227019789279386
称逸汇   380.66706333049103  increase rate:  2.903335316652455
修理厂   271.42940461354226  increase rate:  2.3571470230677116
系朱   237.2927238449973  increase rate:  2.1864636192249867
子女   189.50151102028437  increase rate:  1.9475075551014218
过户   316.94505683453264  increase rate:  2.584725284172663
误报   271.42940461354226  increase rate:  2.3571470230677116
当事人   85.23156365327492  increase rate:  1.4261578182663746
产生纠纷   54.565926352399714  increase rate:  1.2728296317619985
打开   81.4537998254126  increase rate:  1.407268999127063
公安机关   89.36913693469786  increase rate:  1.4468456846734894
卡包   316.94505683453264  increase rate:  2.

#### print result ####

In [8]:
min_value = 0 
for i in range(30,40):
    kmodel = KMeans(n_clusters=i,
                    n_init=10,
                    max_iter=700,
                    init='k-means++',
                   n_jobs=-1).fit(weight)
    harabaz_value = calinski_harabaz_score(weight,kmodel.labels_)
    silhouette_value = silhouette_score(weight,kmodel.labels_)
    print('final value: ',harabaz_value," n:",i)
    if harabaz_value > min_value:
        min_value = harabaz_value
        labels_ = kmodel.labels_
    
details = {}
for i in range(len(data)):
    if labels_[i] not in details:
        details[labels_[i]] = [[data[i]], [kind[i]],[i]]
    else:
        details[labels_[i]][0].append(data[i])
        details[labels_[i]][1].append(kind[i])
        details[labels_[i]][2].append(i)

        
with open("输出.csv","w",encoding="utf-8",newline="") as datacsv:
#dialect为打开csv文件的方式，默认是excel，delimiter="\t"参数指写入的时候的分隔符
    csvwriter = csv.writer(datacsv,dialect = ("excel"))
    #csv文件插入一行数据，把下面列表中的每一项放入一个单元格（可以用循环插入多行）
    for key, list in details.items():
        for i in range(len(list[0])):
            csvwriter.writerow([key, list[1][i], list[0][i]])
        



final value:  225.0407821039231  n: 30




final value:  221.04635472098659  n: 31




final value:  217.72154782468797  n: 32




final value:  216.11172229124563  n: 33




final value:  208.85776081777684  n: 34




final value:  210.44066063860683  n: 35




final value:  212.32033904504766  n: 36




final value:  204.09762939585852  n: 37




final value:  205.40876985538236  n: 38




final value:  202.1686626203936  n: 39


In [9]:
print()
for key,value in details.items():
    if key == 1:
        break
word_count = {}
for item in value[2]:
    for string in data_keywords[item].split(" "):
        if string in word_count:
            word_count[string] += 1
        else:
            word_count[string] = 1
word_count_list = sorted(word_count.items(),
                         key=lambda item: item[1],
                         reverse=True)
for i in range(len(word_count_list)):
    print(word_count_list[i])


('求助', 1053)
('纠纷', 745)
('手机', 192)
('出租车', 173)
('醉酒', 154)
('朋友', 141)
('电话', 126)
('', 113)
('闹事', 111)
('男子', 111)
('发生', 109)
('离开', 99)
('劳资纠纷', 91)
('未到', 90)
('消费', 88)
('派出所', 82)
('担心', 82)
('遗失在', 79)
('接到', 77)
('发现', 68)
('地址', 67)
('不清', 65)
('诈骗', 64)
('公司', 63)
('不让', 63)
('人系', 60)
('疑似', 60)
('受伤', 60)
('一人', 59)
('消防', 54)
('网上', 53)
('家中', 52)
('称一人', 51)
('女子', 51)
('酒店', 50)
('贷款', 50)
('骚扰电话', 49)
('骚扰', 49)
('人称', 48)
('租赁', 45)
('医院', 45)
('怀疑', 44)
('无人', 43)
('身份证', 42)
('酒吧', 41)
('老人', 40)
('转账', 39)
('工作人员', 38)
('信息', 38)
('老板', 38)
('责令', 38)
('称因', 37)
('不到', 37)
('收到', 37)
('八街', 37)
('殴打', 37)
('回拨', 36)
('举报', 36)
('车牌', 35)
('补充', 35)
('微信', 35)
('情况', 35)
('车上', 34)
('车位', 34)
('持械', 34)
('感情', 33)
('小孩', 33)
('昨晚', 32)
('警察', 32)
('拿走', 32)
('陌生', 32)
('家暴', 32)
('拟立', 31)
('经济纠纷', 31)
('短信', 31)
('自称', 31)
('被困', 30)
('凌晨', 30)
('赔偿', 29)
('现场', 29)
('儿子', 29)
('司机', 29)
('迷路', 29)
('车辆', 28)
('客人', 27)
('回复', 27)
('母亲', 27)
('女友', 27)
('金额', 2

('娱乐城', 1)
('忠恒', 1)
('魔方', 1)
('建东', 1)
('咖啡馆', 1)
('粉里', 1)
('派出', 1)
('生活馆', 1)
('标示', 1)
('必旺客', 1)
('诗城美', 1)
('鲜菜', 1)
('火锅', 1)
('切断', 1)
('电源', 1)
('标志', 1)
('聚强', 1)
('发型', 1)
('九炉匠', 1)
('商场', 1)
('物业管理', 1)
('食品店', 1)
('副食店', 1)
('果超', 1)
('某征', 1)
('圣诞树', 1)
('借贷', 1)
('进屋', 1)
('没退', 1)
('按揭', 1)
('不归坏', 1)
('地处', 1)
('被关', 1)
('搞错', 1)
('找人', 1)
('保险公司', 1)
('现来', 1)
('三份', 1)
('套路', 1)
('过户', 1)
('婚骗', 1)
('系张', 1)
('因张', 1)
('将店', 1)
('受了伤', 1)
('情侣', 1)
('冰箱', 1)
('故意伤害', 1)
('撞门', 1)
('金饰', 1)
('窗边', 1)
('身患', 1)
('寿命', 1)
('疾病', 1)
('撕掉', 1)
('来此', 1)
('案底', 1)
('老城', 1)
('赵家', 1)
('不带', 1)
('看病', 1)
('现抓', 1)
('所内', 1)
('另一方', 1)
('外伤', 1)
('带人', 1)
('称现', 1)
('现把', 1)
('前日', 1)
('隐私', 1)
('怀孕', 1)
('银地', 1)
('赖着', 1)
('手镯', 1)
('拖上', 1)
('上海大众', 1)
('银色', 1)
('恋爱', 1)
('猫眼', 1)
('骗财骗色', 1)
('关系', 1)
('劝离', 1)
('做法', 1)
('烟点', 1)
('确切', 1)
('烟越', 1)
('速核', 1)
('房顶', 1)
('这是', 1)
('大火', 1)
('烧毁', 1)
('那座', 1)
('住宅楼', 1)
('情况不明', 1)
('人员伤亡', 1)
('公寓楼', 1)
('更大', 1)
('