In [None]:
import csv
import itertools
import json
import locale

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from sklearn.metrics import silhouette_score


def set_chinese_collation(candidates=('chinese', 'zh_CN.UTF-8', 'zh_CN.utf8', 'zh_CN')):
    """Set LC_COLLATE to the first Chinese locale this system accepts.

    The collation locale drives ``locale.strxfrm``, which later cells use as a
    sort key for Chinese labels. Locale names are platform specific, so several
    spellings are tried in order: 'chinese' is the name this notebook used
    originally, the ``zh_CN`` variants are the usual POSIX spellings.

    Args:
        candidates: Locale names to try, in order of preference.

    Returns:
        The locale string reported by ``locale.setlocale`` for the first name
        that worked, or ``None`` when every candidate was rejected (the current
        collation is then left untouched and a warning is issued).
    """
    import warnings

    for name in candidates:
        try:
            return locale.setlocale(locale.LC_COLLATE, name)
        except locale.Error:
            # This name is not installed/recognised here; try the next one.
            continue

    warnings.warn(
        'No Chinese collation locale is available '
        f'(tried: {", ".join(candidates)}); locale-based sorting keeps the current locale.',
        RuntimeWarning,
    )
    return None


set_chinese_collation()

# Figure text in this notebook is Chinese: use the SimHei font for sans-serif
# text and turn off the Unicode minus sign for negative tick labels.
matplotlib.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Maps every source-specific field name to the standard field name used in the
# rest of the notebook. Each tuple is (standard name, source aliases).
label_mapping = {
    alias: standard_name
    for standard_name, aliases in (
        ('PLAD', ['省级行政区']),
        ('theme', ['数据主题', '领域', '主题分类', '数据领域', '主题', '行业领域']),
        ('format', ['格式', '文件格式', '数据格式', '文件类型']),
        ('accessLevel', ['开放条件', '开放状态', '开放类型', '开放属性', '开放模式']),
        ('accrualPeriodicity', ['更新周期', '更新频率']),
        ('resourceType', ['服务类型', '资源类型']),
    )
    for alias in aliases
}

def standardized(data, mapping=None):
    """Return copies of the records with their keys renamed to standard names.

    Args:
        data: Iterable of dict-like records.
        mapping: Optional ``{source key: standard key}`` dictionary. When
            omitted, the module-level ``label_mapping`` is used.

    Returns:
        A new list with one new dict per record. Keys found in the mapping are
        replaced by their standard name; all other keys are kept as they are.
        Values are passed through unchanged and the input is not modified.
    """
    if mapping is None:
        # Resolved at call time so the default always follows label_mapping.
        mapping = label_mapping

    return [
        {mapping.get(key, key): value for key, value in record.items()}
        for record in data
    ]

# Read the vocabulary records from JSON and rename their keys to the standard
# field names in one go.
with open(r'output/json/vocabulary.json', encoding='utf-8') as file:
    vocabulary = standardized(json.load(file))

# The update-frequency and resource-type fields are not used by the analysis below.
df = pd.DataFrame(vocabulary).drop(['accrualPeriodicity', 'resourceType'], axis='columns')

In [None]:
# Print the column dtypes and non-null counts, then render the frame itself
# as the cell output.
df.info()
df

In [None]:
# Flatten the per-record theme collections into one sorted list of terms.
all_themes = sorted(itertools.chain.from_iterable(df['theme']))

# Term frequency in the space-joined text drives the word sizes; collocations
# (bigrams) are disabled so each theme is counted on its own.
wordcloud = WordCloud(
    font_path='simhei.ttf',
    width=2400,
    height=1200,
    background_color='white',
    collocations=False,
)
wordcloud = wordcloud.generate(' '.join(all_themes))

plt.figure(dpi=160, figsize=(16, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout()
plt.savefig(r'output/wordcloud/vocabulary.png')
plt.show()

In [None]:
# Count how often each theme occurs per PLAD: one row per PLAD, one column per
# theme, zero where a PLAD never uses a theme.
heatmap_df = (
    df.explode('theme')
      .groupby(['PLAD', 'theme'])
      .size()
      .unstack(fill_value=0)
)

# Order the theme columns from most to least frequent overall.
theme_order = heatmap_df.sum(axis=0).sort_values(ascending=False).index
heatmap_df = heatmap_df.loc[:, theme_order]
del theme_order

In [None]:
from sklearn.cluster import KMeans

def plot_kmeans_clusters(data, n_clusters=3, random_state=42):
    """Cluster the rows of ``data`` with K-Means and draw one heatmap per cluster.

    The cluster labels are kept in a local array rather than stored on
    ``data``, so the caller's frame is not modified and can be clustered
    again without a stray label column.

    Args:
        data: Numeric DataFrame; each row is clustered on all of its columns.
        n_clusters: Number of K-Means clusters (default 3).
        random_state: Seed forwarded to ``KMeans`` for reproducible results.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(data)

    # np.unique returns the distinct labels in ascending order.
    for cluster in np.unique(labels):
        cluster_data = data[labels == cluster]
        # Hide columns that are zero for every row of this cluster.
        cluster_data = cluster_data.loc[:, (cluster_data != 0).any(axis=0)]

        # Most frequent columns first.
        sorted_columns = cluster_data.sum(axis=0).sort_values(ascending=False).index
        cluster_data = cluster_data[sorted_columns]

        plt.figure(figsize=(15, 10))
        sns.heatmap(cluster_data, cmap='YlGnBu', annot=True, fmt='d')
        plt.xlabel('主题')
        plt.ylabel('省级行政区')
        plt.title(f'聚类 {cluster} 内的主题词热图')
        plt.xticks(rotation=45, ha='right')
        plt.show()

# Cluster a copy so that heatmap_df itself stays unchanged for the cells below.
plot_kmeans_clusters(heatmap_df.copy())

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster


def plot_hierarchical_clusters(data, method='ward', metric='euclidean', num_clusters=4):
    """Draw a dendrogram of the rows of ``data`` and one heatmap per flat cluster.

    The flat cluster labels are kept in a local array rather than stored on
    ``data``, so the caller's frame is not modified.

    Args:
        data: Numeric DataFrame; rows are the observations being clustered.
        method: Linkage method passed to ``scipy.cluster.hierarchy.linkage``.
        metric: Distance metric passed to ``linkage``.
        num_clusters: Upper bound on the number of flat clusters
            (``fcluster`` with ``criterion='maxclust'``).
    """
    Z = linkage(data, method=method, metric=metric)

    plt.figure(figsize=(12, 6))
    # Pass the row labels as a plain list rather than a pandas Index.
    dendrogram(Z, labels=list(data.index), leaf_rotation=90)
    plt.xlabel('省级行政区')
    plt.ylabel('距离')
    plt.title('省级行政区层次聚类树状图')
    plt.show()

    labels = fcluster(Z, t=num_clusters, criterion='maxclust')
    # np.unique returns the distinct labels in ascending order.
    for cluster in np.unique(labels):
        cluster_data = data[labels == cluster]
        # Hide columns that are zero for every row of this cluster.
        cluster_data = cluster_data.loc[:, (cluster_data != 0).any(axis=0)]
        # Most frequent columns first.
        sorted_columns = cluster_data.sum(axis=0).sort_values(ascending=False).index
        cluster_data = cluster_data[sorted_columns]

        plt.figure(figsize=(15, 10))
        sns.heatmap(cluster_data, cmap='inferno', cbar=False, square=True)
        plt.xlabel('主题')
        plt.ylabel('省级行政区')
        plt.title(f'聚类 {cluster} 内的主题词热图')
        plt.xticks(rotation=45, ha='right')
        plt.show()

def find_optimal_clusters(data, method='ward', metric='euclidean', max_clusters=10):
    """Plot the silhouette score of hierarchical clusterings for 2..max_clusters.

    The linkage is computed once and cut at each candidate cluster count. The
    input frame is only read, never modified.

    Args:
        data: Numeric DataFrame; rows are the observations being clustered.
        method: Linkage method passed to ``scipy.cluster.hierarchy.linkage``.
        metric: Distance metric used for both the linkage and the silhouette.
        max_clusters: Largest cluster count to evaluate (inclusive).
    """
    Z = linkage(data, method=method, metric=metric)
    n_samples = len(data)
    candidate_counts = range(2, max_clusters + 1)

    silhouette_scores = []
    for num_clusters in candidate_counts:
        clusters = fcluster(Z, t=num_clusters, criterion='maxclust')
        # silhouette_score only accepts between 2 and n_samples - 1 distinct
        # labels; 'maxclust' is an upper bound, so the cut can fall outside
        # that range. Record NaN (a gap in the curve) instead of raising.
        n_labels = len(set(clusters))
        if 2 <= n_labels <= n_samples - 1:
            score = silhouette_score(data, clusters, metric=metric)
        else:
            score = float('nan')
        silhouette_scores.append(score)

    plt.figure(figsize=(10, 6))
    plt.plot(candidate_counts, silhouette_scores, marker='o')
    plt.xlabel('聚类数')
    plt.ylabel('轮廓系数')
    plt.title('不同聚类数的轮廓系数')
    plt.show()

# Cluster a copy so that heatmap_df itself stays unchanged for the cells below.
plot_hierarchical_clusters(data=heatmap_df.copy(), method='ward', metric='euclidean', num_clusters=4)

In [None]:
from sklearn.cluster import DBSCAN

def dbscan_grid_search(data, eps_range, min_samples_range):
    """Pick the DBSCAN (eps, min_samples) pair with the highest silhouette score.

    Every combination of the two ranges is fitted on ``data``. A combination
    that yields a single distinct label is scored -1; otherwise the silhouette
    score is computed over all labels exactly as DBSCAN returned them. On ties
    the combination that comes first in ``itertools.product`` order wins.

    Args:
        data: Feature matrix to cluster.
        eps_range: Iterable of candidate ``eps`` values.
        min_samples_range: Iterable of candidate ``min_samples`` values.

    Returns:
        The best ``(eps, min_samples)`` tuple.
    """
    def evaluate(eps, min_samples):
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)
        if len(set(labels)) <= 1:
            # Silhouette is undefined for a single label; use the worst score.
            return -1
        return silhouette_score(data, labels)

    scored_grid = (
        (evaluate(eps, min_samples), (eps, min_samples))
        for eps, min_samples in itertools.product(eps_range, min_samples_range)
    )
    best_entry = max(scored_grid, key=lambda entry: entry[0])
    return best_entry[1]

# Search grid: eps from 0.5 to 5.0 in steps of 0.5, min_samples from 2 to 5.
eps_range = np.arange(0.5, 5.5, 0.5)
min_samples_range = np.arange(2, 6)
best_eps, best_min_samples = dbscan_grid_search(
    data=heatmap_df,
    eps_range=eps_range,
    min_samples_range=min_samples_range,
)
print(f'最佳参数: eps={best_eps}, min_samples={best_min_samples}')


def plot_dbscan_clusters(data, cluster_data):
    """Export the DBSCAN noise rows to CSV and draw one heatmap per cluster.

    Rows labelled -1 are treated as noise: their non-zero column names are
    written to ``output/clusters/noises.csv``. Every remaining cluster gets a
    heatmap that is shown and saved under ``output/clusters``.

    Args:
        data: Numeric DataFrame that was clustered.
        cluster_data: DataFrame on the same index as ``data`` with a
            'cluster' column holding the label of each row.
    """
    def drop_all_zero_columns(frame):
        # Keep only the columns that are non-zero in at least one row.
        return frame.loc[:, (frame != 0).any(axis=0)]

    def export_noise(noise_rows):
        # Nothing is written when there are no noise rows.
        if noise_rows.empty:
            return
        present = drop_all_zero_columns(noise_rows).copy()
        # One comma-separated, locale-sorted list of non-zero columns per row.
        theme_lists = present.apply(
            lambda row: ', '.join(sorted(row[row != 0].index, key=locale.strxfrm)),
            axis=1,
        )
        noise_table = pd.DataFrame(theme_lists, index=present.index, columns=['themes'])
        noise_table.to_csv(r'output/clusters/noises.csv',
                           encoding='utf-8-sig', quoting=csv.QUOTE_ALL)

    def draw_heatmap(cluster, rows):
        present = drop_all_zero_columns(rows)
        totals = present.sum(axis=0)
        # Largest column total first; ties are broken by locale collation.
        ordered_columns = sorted(
            present.columns,
            key=lambda name: (-totals[name], locale.strxfrm(name)),
        )

        plt.figure(figsize=(15, 9), dpi=160)
        sns.heatmap(present[ordered_columns], cmap='magma', square=True, cbar=False)
        plt.xlabel('主题')
        plt.ylabel('省级行政区')
        plt.title(f'聚类 {cluster} 内的主题词热图')
        plt.xticks(rotation=30, ha='right')
        plt.tight_layout()
        plt.savefig(rf'output/clusters/cluster_{cluster}_heatmap.png')
        plt.show()

    noise_mask = cluster_data['cluster'] == -1
    export_noise(data[noise_mask])

    # Only the non-noise rows are plotted.
    kept_rows = data[~noise_mask]
    kept_labels = cluster_data.loc[~noise_mask, 'cluster']

    for cluster in sorted(kept_labels.unique()):
        draw_heatmap(cluster, kept_rows[kept_labels == cluster])


# Fit DBSCAN once with the parameters chosen by the grid search and keep the
# labels in a one-column frame that shares heatmap_df's index.
clusters = DBSCAN(eps=best_eps, min_samples=best_min_samples).fit(heatmap_df).labels_
cluster_df = pd.DataFrame({'cluster': clusters}, index=heatmap_df.index)
plot_dbscan_clusters(data=heatmap_df, cluster_data=cluster_df)

In [None]:
import geopandas as gpd

# Read the GeoJSON boundaries and attach each region's DBSCAN label by matching
# the 'name' field against PLAD. A left join keeps regions that have no label.
China = pd.merge(
    gpd.read_file('中华人民共和国.geojson'),
    cluster_df,
    how='left',
    left_on='name',
    right_on='PLAD',
)

In [None]:
_, ax = plt.subplots(figsize=(15, 15), dpi=160)

# Build the legend labels from the cluster ids that are actually present, so
# the legend stays correct when DBSCAN finds a different number of clusters.
# NOTE(review): assumes GeoPandas lists the categories in ascending order with
# the missing-value entry last, the same order the previous fixed list
# ('噪声点', '聚类0', '聚类1', '聚类2', '未知') relied on.
cluster_ids = sorted(China['cluster'].dropna().unique())
legend_labels = [
    '噪声点' if cluster_id == -1 else f'聚类{int(cluster_id)}'
    for cluster_id in cluster_ids
]
if China['cluster'].isna().any():
    # Regions without a cluster label are drawn via missing_kwds.
    legend_labels.append('未知')

China.boundary.plot(ax=ax, linewidth=1)
China.plot(column='cluster', ax=ax, legend=True, cmap='Dark2', categorical=True,
           legend_kwds={'labels': legend_labels},
           missing_kwds={'color': 'lightgray'})

plt.title('中国各省级行政区主题词汇表聚类图')
plt.tight_layout()
plt.savefig(rf'output/clusters/cluster_cnmap.png')
plt.show()