In [None]:
import json
import sys
from sklearn.base import BaseEstimator, ClusterMixin
import jieba
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.cluster import DBSCAN, AgglomerativeClustering, HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, adjusted_rand_score
import spacy
from sklearn.metrics.cluster import contingency_matrix

from docx import Document

import config
from sentence_transformers import util, SentenceTransformer
from sklearn.metrics import adjusted_mutual_info_score, v_measure_score

import openpyxl

# Path of the stop-word file that read_data() loads line by line.
stopwords_path = "./stopwords-master/hit_stopwords.txt"

def cos_(ver,data_1):
    """Compute the pairwise cosine-similarity matrix of ``ver`` and export it to Excel.

    Args:
        ver: Sparse matrix exposing ``.toarray()``; every row is treated as one vector.
        data_1: Object exposing ``.tolist()``; its items are used as the row and
            column headers of the exported sheet (presumably one item per row
            of ``ver`` -- verify against the caller).

    Returns:
        scipy.sparse.csr_matrix: similarity of every row of ``ver`` with every
        other row.

    Side effects:
        Writes ``config.next_feature_ext.ext + ".xlsx"`` in the working directory.
    """
    labels = data_1.tolist()
    dense = ver.toarray()

    # One batched call yields the full n x n matrix; the previous version made
    # n * n separate cos_sim calls on single row pairs.
    cos_sims = util.cos_sim(dense, dense).tolist() if len(dense) else []

    workbook = openpyxl.Workbook()
    try:
        worksheet = workbook.active
        # Header row: empty corner cell followed by the labels.
        worksheet.append([''] + labels)
        for label, row in zip(labels, cos_sims):
            worksheet.append([label] + row)
        workbook.save(config.next_feature_ext.ext + ".xlsx")
    finally:
        # Release the workbook even when saving fails.
        workbook.close()

    return sp.sparse.csr_matrix(cos_sims)

def npeMethod(text):
    """Print the named entities that spaCy's ``en_core_web_sm`` pipeline finds in ``text``.

    One line is printed per entity: its text, start character offset, end
    character offset and label. Nothing is returned.
    """
    # The pipeline is loaded on every call.
    pipeline = spacy.load('en_core_web_sm')
    parsed = pipeline(text)
    for entity in parsed.ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)

def data_post(data, cluster, file_name):
    """Group descriptions by cluster label and dump them to ``./result/<file_name>.json``.

    Args:
        data: DataFrame with a ``'discription'`` column. A ``'labels_'`` column
            is added (or overwritten) in place with ``cluster.labels_``.
        cluster: Object exposing ``labels_``, one integer label per row of
            ``data``. Label ``-1`` is collected under the ``"Noise:"`` key.
        file_name: Output file name without the ``.json`` extension.

    The JSON maps ``"Cluster k"`` (1-based) to the descriptions carrying label
    ``k - 1``, plus a ``"Noise:"`` entry for label ``-1``.
    """
    import os  # local import keeps this block self-contained

    labels = cluster.labels_
    data['labels_'] = labels

    # default=-1 means an empty label array produces zero clusters instead of
    # making max() raise.
    n_clusters = int(max(labels, default=-1)) + 1

    json_data = {
        f"Cluster {i + 1}": list(data.loc[data['labels_'] == i, 'discription'])
        for i in range(n_clusters)
    }
    json_data["Noise:"] = list(data.loc[data['labels_'] == -1, 'discription'])

    # Create the output directory on first use rather than failing in open().
    os.makedirs('./result', exist_ok=True)
    with open('./result/' + file_name + ".json", 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

def purity_score(y_true, y_pred):
    """Return the purity of a clustering.

    Purity is computed as the sum of the column-wise maxima of the contingency
    matrix of ``y_true`` versus ``y_pred``, divided by the sum of all its
    entries.
    """
    table = contingency_matrix(y_true, y_pred)
    best_per_column = table.max(axis=0)
    return best_per_column.sum() / table.sum()

In [None]:

def read_data(data_path="JayMe标注.xlsx", sheet_name="Sheet1"):
    """Load the annotated workbook and tokenize its defect descriptions.

    Args:
        data_path: Excel workbook to read. Defaults to the file this notebook
            was written against.
        sheet_name: Sheet to load from the workbook.

    Returns:
        tuple: ``(sentences, y_true)``. ``sentences`` is a list holding one
        list of tokens per row of column ``"缺陷描述"``, with stop words
        removed. ``y_true`` is column ``"标签"`` as read from the sheet.
    """
    data = pd.read_excel(data_path, sheet_name=sheet_name)
    descriptions = data["缺陷描述"]
    y_true = data['标签']

    def new_cut(texts):
        """Segment every text with jieba and drop stop words."""
        # A set gives constant-time membership tests; the list used before
        # was scanned once per token.
        with open(stopwords_path, 'r', encoding='utf-8') as f:
            stop_words = {line.strip() for line in f}
        stop_words.update(('\n', ' ', '_x000D_'))

        sentences = []
        # Iterate over values, so the result does not depend on the Series
        # index being 0..n-1.
        for text in texts:
            # Both segmentation passes are kept so the produced tokens stay
            # the same as before: cut, join with spaces, then cut again.
            spaced = ' '.join(jieba.lcut(text))
            tokens = [tok for tok in jieba.cut(spaced) if tok not in stop_words]
            sentences.append(tokens)
        return sentences

    return new_cut(descriptions), y_true

# Tokenized descriptions (data_1) and the label column (y_true) returned by read_data().
data_1,y_true = read_data()

In [None]:

# LaBSE sentence-embedding model; cache_folder points at ./model.
model = SentenceTransformer('LaBSE', cache_folder=r"./model")

In [None]:
# sentence_bert
def bert_exr(data):
    """Encode token lists with the module-level ``model`` and return sparse features.

    Each ``data[i]`` is joined with single spaces into one sentence, the
    sentences are passed to ``model.encode``, and the result is wrapped in a
    ``scipy.sparse.csr_matrix``.
    """
    joined = [' '.join(data[idx]) for idx in range(len(data))]
    embeddings = model.encode(joined)
    return sp.sparse.csr_matrix(embeddings)

# Sparse embedding features for every tokenized description in data_1.
ver_sb = bert_exr(data_1)

In [None]:
# HC
def hc_cluster(data, n, distance_threshold=1.4):
    """Fit agglomerative (hierarchical) clustering on ``data``.

    scikit-learn requires exactly one of ``n_clusters`` and
    ``distance_threshold`` to be set, so the threshold is only forwarded when
    ``n`` is ``None``.

    Args:
        data: Sparse feature matrix; it is densified with ``.toarray()``
            before fitting.
        n: Number of clusters, or ``None`` to cut the tree at
            ``distance_threshold`` instead.
        distance_threshold: Linkage distance cut-off, used only when ``n`` is
            ``None``. Defaults to the value previously hard-coded here.

    Returns:
        The fitted ``AgglomerativeClustering`` estimator.
    """
    # NOTE(review): with a distance threshold, scikit-learn needs
    # compute_full_tree to be True or "auto" -- confirm config.hc provides that.
    hc = AgglomerativeClustering(
        n_clusters=n,
        # `affinity` is no longer passed: it was removed in scikit-learn 1.4
        # and `metric` replaces it.
        metric=config.hc.metric,
        memory=config.hc.memory,
        connectivity=config.hc.connectivity,
        compute_full_tree=config.hc.compute_full_tree,
        linkage=config.hc.linkage,
        distance_threshold=distance_threshold if n is None else None,
        compute_distances=config.hc.compute_distances,
    )
    return hc.fit(data.toarray())

# No fixed cluster count is passed (n is None), so hc_cluster relies on its distance threshold.
n = None
hc_cluster_sb = hc_cluster(ver_sb,n)
def output(y_true,hc_cluster_sb):
    """Print purity, adjusted mutual information and V-measure for a fitted clustering.

    Every score compares ``y_true`` with ``hc_cluster_sb.labels_``. Nothing is
    returned.
    """
    predicted = hc_cluster_sb.labels_
    reports = (
        ("Purity HC Clusters (sb):", purity_score),
        ("Adjusted Mutual Info Score HC Clusters (sb):", adjusted_mutual_info_score),
        ("V-measure Score HC Clusters (sb):", v_measure_score),
    )
    for caption, scorer in reports:
        print(caption, scorer(y_true, predicted))

# Print the evaluation scores of the hierarchical clustering against y_true.
output(y_true,hc_cluster_sb)