In [1]:
import json
import os
import sys

import jieba
import numpy as np
import openpyxl
import pandas as pd
import scipy as sp
import spacy
from docx import Document
from sentence_transformers import util, SentenceTransformer
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.cluster import DBSCAN, AgglomerativeClustering, HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score, v_measure_score
from sklearn.metrics.cluster import contingency_matrix

import config

# Path of the stop-word file; new_cut() reads it line by line (one entry per line).
stopwords_path = './stopwords-master/hit_stopwords.txt'

def cos_(ver,data_1):
    """Build the pairwise cosine-similarity matrix of ``ver`` and export it to Excel.

    Args:
        ver: Sparse feature matrix exposing ``.toarray()``; each row is one sample.
        data_1: Per-sample labels exposing ``.tolist()``; they are written as
            the header row and as the first column of the sheet.

    Returns:
        The similarity matrix as a ``scipy.sparse.csr_matrix``.

    Side effects:
        Saves a workbook named ``config.next_feature_ext.ext + ".xlsx"``.
    """
    labels = data_1.tolist()
    dense = ver.toarray()

    # A single batched call returns the whole row-vs-row similarity matrix.
    # The previous nested loop issued one util.cos_sim call per pair of rows,
    # which yields the same values (up to floating-point rounding) far more slowly.
    cos_sims = util.cos_sim(dense, dense).tolist()

    # Store the matrix in an Excel sheet: header row first, then one labelled row per sample.
    workbook = openpyxl.Workbook()
    try:
        worksheet = workbook.active
        worksheet.append([''] + labels)
        for label, row in zip(labels, cos_sims):
            worksheet.append([label] + row)
        workbook.save(config.next_feature_ext.ext+".xlsx")
    finally:
        # Release the workbook even if building or saving the sheet fails.
        workbook.close()

    return sp.sparse.csr_matrix(cos_sims)

def npeMethod(text):
    """Print the named entities that spaCy's ``en_core_web_sm`` pipeline finds in ``text``.

    One line is printed per entity: its text, start character offset,
    end character offset and entity label. Nothing is returned.
    The pipeline is loaded on every call.
    """
    pipeline = spacy.load('en_core_web_sm')
    for entity in pipeline(text).ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)

def data_post(data, cluster, file_name):
    """Group the clustered descriptions by label and save them as JSON.

    Args:
        data: DataFrame with a ``'discription'`` column. It is modified in
            place: a ``'labels_'`` column holding ``cluster.labels_`` is added.
        cluster: Fitted clustering object exposing ``labels_``; label ``-1``
            is treated as noise.
        file_name: Base name (no extension) of the file written under
            ``./result/``.

    Side effects:
        Writes ``./result/<file_name>.json`` (UTF-8, non-ASCII kept readable),
        creating the directory when it does not exist yet. Keys are
        ``"Cluster 1"``, ``"Cluster 2"``, ... for labels 0, 1, ... and
        ``"Noise:"`` for label -1.

    NOTE(review): the column name is spelled 'discription'; the data loaded
    elsewhere in this notebook uses other column names - verify against callers.
    """
    labels = cluster.labels_
    data['labels_'] = labels

    # default=-1 keeps an empty label array from raising in max().
    cluster_count = int(max(labels, default=-1)) + 1

    # Series.tolist() yields plain Python values, so json can always serialize them.
    json_data = {}
    for i in range(cluster_count):
        json_data[f"Cluster {i + 1}"] = data.loc[data['labels_'] == i, 'discription'].tolist()
    json_data["Noise:"] = data.loc[data['labels_'] == -1, 'discription'].tolist()

    out_path = './result/' + file_name + ".json"
    # open() does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

def purity_score(y_true, y_pred):
    """Return the purity of a clustering against ground-truth labels.

    ``contingency_matrix(y_true, y_pred)`` counts co-occurrences with one row
    per true class and one column per predicted cluster. For every cluster
    (column) the size of its most frequent true class is taken; these maxima
    are summed and divided by the total number of samples.
    """
    table = contingency_matrix(y_true, y_pred)
    # Largest true-class count inside each predicted cluster (max down each column).
    dominant_counts = np.amax(table, axis=0)
    return np.sum(dominant_counts) / np.sum(table)

In [2]:
# Load the data
data_path = "JayMe标注.xlsx"
data = pd.read_excel(data_path, sheet_name="Sheet1")  # , sep=',')
data_ = data["缺陷描述"]
# print(data_)
# TODO metrics
y_true = data['标签']  # ground-truth labels, taken from the Excel sheet

# TODO perform word segmentation

# Word segmentation and stop-word removal
def new_cut(data):
    """Tokenize each text with jieba and drop stop words.

    Args:
        data: pandas Series of strings (``.apply`` is called on it).

    Returns:
        A list with one entry per text, each entry being the list of tokens
        that are not stop words.

    Stop words are read from ``stopwords_path`` (one per line), extended with
    the tokens ``'\\n'``, ``' '`` and ``'_x000D_'``.
    """
    # First pass: segment each text and join the tokens with single spaces.
    data = data.apply(lambda x: ' '.join(jieba.lcut(x)))

    with open(stopwords_path, 'r', encoding='utf-8') as f:
        # A set gives constant-time membership tests in the filter below.
        stop_words = {line.strip() for line in f}
    stop_words.update(('\n', ' ', '_x000D_'))

    # Second pass: re-segment the space-joined text; the space tokens this
    # produces are removed by the stop-word filter.
    # Iterate over the values themselves rather than data[i]: integer
    # indexing on a Series is label-based and fails when the index is not 0..n-1.
    return [
        [token for token in jieba.cut(text) if token not in stop_words]
        for text in data
    ]

# Tokenize every description; new_cut returns one list of tokens per text.
data_1 = new_cut(data_)
# print(data_1)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\17143\AppData\Local\Temp\jieba.cache
Loading model cost 0.366 seconds.
Prefix dict has been built successfully.


In [3]:
# Load the BERT-based sentence-embedding model
model = SentenceTransformer(
    'LaBSE',  # name of the pretrained model to use
    cache_folder=r"./model"  # local cache directory for this model
)

In [4]:
# sentence_bert
def bert_exr(data):
    """Encode tokenized texts with the module-level ``model``.

    Each ``data[i]`` (a sequence of tokens) is joined with single spaces into
    one sentence, all sentences are passed to ``model.encode`` in one call,
    and the result is returned wrapped in a ``scipy.sparse.csr_matrix``.
    """
    joined = [' '.join(data[idx]) for idx in range(len(data))]
    embeddings = model.encode(joined)
    return sp.sparse.csr_matrix(embeddings)

# Sentence embeddings of the tokenized texts, as returned by bert_exr (sparse matrix).
ver_sb = bert_exr(data_1)

In [5]:
# HDBSCAN
def hdb_cluster(data):
    """Fit scikit-learn's HDBSCAN on ``data`` with this notebook's fixed settings.

    Returns whatever ``HDBSCAN(...).fit(data)`` returns, i.e. the fitted
    estimator, whose ``labels_`` attribute is read by the evaluation code.
    """
    settings = dict(
        min_cluster_size=2,
        min_samples=1,
        cluster_selection_epsilon=0.71,
        alpha=1,
        cluster_selection_method='eom',
        metric='euclidean',
        allow_single_cluster=False,
    )
    estimator = HDBSCAN(**settings)
    return estimator.fit(data)
# Cluster the sentence embeddings with HDBSCAN.
hdb_cluster_sb = hdb_cluster(ver_sb)
def output(y_true,hdb_cluster_sb):
    """Print evaluation metrics of a fitted clustering against ``y_true``.

    Reports, in this order, purity (via ``purity_score``), adjusted mutual
    information and V-measure, each computed from ``hdb_cluster_sb.labels_``.
    Nothing is returned.
    """
    print('================HDB-Sentence-Bert==============')

    # Purity
    print('======================纯度======================')
    predicted = hdb_cluster_sb.labels_
    purity = purity_score(y_true, predicted)
    print("Purity HDBSCAN Clusters (sb):", purity)

    # Adjusted mutual information
    ami = adjusted_mutual_info_score(y_true, hdb_cluster_sb.labels_)
    print('====================调整后互信息====================')
    print("Adjusted Mutual Info Score HDBSCAN Clusters (sb):", ami)

    # V-measure
    v_measure = v_measure_score(y_true, hdb_cluster_sb.labels_)
    print('=====================V-measure====================')
    print("V-measure Score HDBSCAN Clusters (sb):", v_measure)

# Print purity, adjusted mutual information and V-measure for the HDBSCAN result.
output(y_true,hdb_cluster_sb)

Purity HDBSCAN Clusters (sb): 0.5561959654178674
Adjusted Mutual Info Score HDBSCAN Clusters (sb): 0.4606449282882575
V-measure Score HDBSCAN Clusters (sb): 0.6316505944578188
