In [23]:
import os
import ast
import math
import json
import torch
import openai
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from langchain.embeddings import OpenAIEmbeddings
from transformers import BertTokenizer, BertModel

In [24]:
# Load the raw theme tags (one entry per occurrence, so a tag may repeat).
# ast.literal_eval only accepts Python literals, so the file cannot execute
# arbitrary code the way eval() would.
# NOTE(review): assumes the file holds a plain literal such as a list of strings — confirm.
with open('data/theme_map_list.txt', 'r', encoding='utf-8') as f:
    theme_tags = ast.literal_eval(f.read())
# Load the tag -> embedding lookup.
with open('data/thememap_embeddings_ada002.json', 'r', encoding='utf-8') as f:
    theme_embeddings_dict = json.load(f)
unique_theme_tags = list(theme_embeddings_dict.keys())  # tags that have an embedding
theme_embeddings = list(theme_embeddings_dict.values())  # embedding vectors, same order as unique_theme_tags

# Count every tag in a single pass (list.count per unique tag is quadratic).
# Counter preserves first-occurrence order, so the row order of df_tag is
# reproducible across kernel restarts, unlike iteration over a set of strings.
tag_counts = Counter(theme_tags)

df_tag = pd.DataFrame()
df_tag['tag'] = pd.Series(list(tag_counts))
df_tag['count'] = df_tag['tag'].map(dict(tag_counts))
# df_tag['filter_tag'] = df_tag['tag']
# Tags that occur more than 5 times AND are at most 6 characters long are kept
# in `filter_tag` as the initial cluster centers; every other row gets NaN.
is_initial_center = (df_tag['count'] > 5) & (df_tag['tag'].str.len() <= 6)
df_tag['filter_tag'] = df_tag['tag'].where(is_initial_center)
df_tag.to_csv('tags.csv')

In [25]:
# Alias the tag table as `data` (same object, not a copy) and display it.
data = df_tag
data

Unnamed: 0,tag,count,filter_tag
0,宽容和包容,3,
1,忠诚与正义,1,
2,家族矛盾,1,
3,反腐斗争,8,反腐斗争
4,人与自然的关系,22,
...,...,...,...
422,乡村文化,1,
423,道德与选择,2,
424,军事题材,1,
425,政治阴谋,1,


In [26]:
# Seed tags: the non-null entries of `filter_tag`, each used as one initial center.
initial_cluster_centers_tags = data['filter_tag'].dropna().unique()
initial_cluster_centers_embeddings = np.array([theme_embeddings_dict[tag] for tag in initial_cluster_centers_tags])

# Use the filtered tags as centers for the initial clustering.
# With an explicit `init` array a single run (n_init=1) is all that is needed.
num_initial_clusters = len(initial_cluster_centers_embeddings)
kmeans = KMeans(n_clusters=num_initial_clusters, init=initial_cluster_centers_embeddings, n_init=1, max_iter=300)

# Keep only tags that actually have an embedding, and build the embedding
# matrix from that same filtered list so that row i of `all_embeddings`
# always belongs to `all_tags[i]` (otherwise the two can differ in length and
# the DataFrame constructor below fails).
all_tags = np.array([tag for tag in data['tag'].unique() if tag in theme_embeddings_dict], dtype=object)
all_embeddings = np.array([theme_embeddings_dict[tag] for tag in all_tags])
kmeans.fit(all_embeddings)

labels = kmeans.labels_
tag_cluster_mapping = pd.DataFrame({'tag': all_tags, 'cluster': labels})
tag_cluster_mapping  # display the tag -> cluster table to verify the mapping

Unnamed: 0,tag,cluster
0,宽容和包容,122
1,忠诚与正义,94
2,家族矛盾,101
3,反腐斗争,0
4,人与自然的关系,80
...,...,...
422,乡村文化,33
423,道德与选择,63
424,军事题材,85
425,政治阴谋,75


In [None]:
# # calculate distances
# distances = cdist(all_embeddings, kmeans.cluster_centers_[labels], metric='euclidean').diagonal()
# # map tags with distances
# tag_cluster_mapping['distance_to_center'] = distances
# # get threshold base on distances
# threshold = np.percentile(distances, 90) # set threshold at the 90th percentile of distances
# # map new clusters with distance
# tags_for_new_clusters = tag_cluster_mapping[tag_cluster_mapping['distance_to_center'] > threshold]
# tags_for_new_clusters

In [27]:
def create_new_clusters(tags_df, embeddings, threshold, base_centers=None):
    """Promote far-away tags to new cluster centers until all tags are close enough.

    Repeatedly takes the tag with the largest ``distance_to_center``, appends
    its embedding as an additional center, and re-assigns every tag to its
    nearest center (existing centers first, then the new ones in creation
    order). Stops once no distance exceeds ``threshold``.

    Parameters
    ----------
    tags_df : pandas.DataFrame
        Must contain ``'cluster'`` and ``'distance_to_center'`` columns. Row
        ``i`` must correspond to ``embeddings[i]``. The frame is not modified.
    embeddings : array-like, shape (n_tags, dim)
        One embedding per row of ``tags_df``.
    threshold : float
        Largest distance a tag may have to its center without triggering a
        new cluster.
    base_centers : array-like, shape (n_centers, dim), optional
        Centers that already exist. Defaults to ``kmeans.cluster_centers_`` of
        the module-level ``kmeans`` model, which is what this function always
        used before the parameter was added.

    Returns
    -------
    tuple
        ``(tags_df_updated, new_cluster_centers)``: a copy of ``tags_df`` with
        refreshed ``'cluster'`` / ``'distance_to_center'`` columns, and the
        list of embeddings that were added as centers. New centers get labels
        starting at ``len(base_centers)``.
    """
    embeddings = np.asarray(embeddings)
    if base_centers is None:
        # Backward-compatible default: fall back to the notebook-level model.
        base_centers = kmeans.cluster_centers_
    base_centers = np.asarray(base_centers)

    new_cluster_centers = []  # embeddings promoted to centers, in creation order
    tags_df_updated = tags_df.copy()  # never touch the caller's frame

    while True:
        current_distances = tags_df_updated['distance_to_center'].to_numpy(dtype=float)
        if current_distances.size == 0 or np.all(np.isnan(current_distances)):
            break

        # Row position (not index label) of the farthest tag, so the lookup in
        # `embeddings` stays correct for any index and for repeated tag names.
        farthest_pos = int(np.nanargmax(current_distances))
        farthest_distance = current_distances[farthest_pos]

        # Done when everything is within the threshold. The `<= 0` check stops
        # the loop for a negative threshold, where every tag would otherwise
        # be re-added as a center forever.
        if not farthest_distance > threshold or farthest_distance <= 0:
            break

        new_cluster_centers.append(embeddings[farthest_pos])

        # Recalculate all distances from tags to the full set of centers
        all_cluster_centers = np.vstack([base_centers, np.asarray(new_cluster_centers)])
        all_distances = cdist(embeddings, all_cluster_centers, metric='euclidean')

        # Nearest center per tag (ties resolve to the lowest center index)
        new_labels = np.argmin(all_distances, axis=1)
        new_distances_to_center = all_distances[np.arange(all_distances.shape[0]), new_labels]

        tags_df_updated['cluster'] = new_labels
        tags_df_updated['distance_to_center'] = new_distances_to_center

    return tags_df_updated, new_cluster_centers


def get_reclustering(mapping, p):
    """Re-assign tags after splitting off those far from their KMeans center.

    Relies on the notebook-level ``all_embeddings``, ``kmeans`` and ``labels``
    produced by the initial clustering; the rows of ``mapping`` must be in the
    same order as ``all_embeddings``.

    Parameters
    ----------
    mapping : pandas.DataFrame
        Tag -> cluster table. Side effect: a ``'distance_to_center'`` column
        is written onto this frame in place.
    p : float
        Percentile (0-100) of the tag-to-center distances used as the distance
        threshold; tags farther than this trigger new clusters.

    Returns
    -------
    pandas.DataFrame
        Copy of ``mapping`` with the ``'cluster'`` and ``'distance_to_center'``
        columns updated by ``create_new_clusters``.
    """
    # Distance from every tag to its own assigned centroid, computed row-wise
    # (a full pairwise matrix is not needed just to read its diagonal).
    assigned_centers = kmeans.cluster_centers_[labels]
    distances = np.linalg.norm(np.asarray(all_embeddings) - assigned_centers, axis=1)
    # map tags with distances (mutates the caller's frame on purpose)
    mapping['distance_to_center'] = distances
    # threshold = p-th percentile of the distances
    threshold = np.percentile(distances, p)
    # Run the custom iterative clustering process
    tag_cluster_mapping_updated, _ = create_new_clusters(mapping, all_embeddings, threshold)

    return tag_cluster_mapping_updated

# Percentile of the tag-to-center distances above which tags are split off
# into their own clusters.
RECLUSTER_PERCENTILE = 50

current_df = get_reclustering(tag_cluster_mapping, RECLUSTER_PERCENTILE)

# Pick and rename the re-assignment columns by name rather than by position,
# so the cell does not fail with a length mismatch when the frame already
# carries extra columns (for example when the cell is run a second time).
current_df2 = current_df[['tag', 'cluster', 'distance_to_center']].rename(
    columns={'cluster': 'new_cluster', 'distance_to_center': 'new_distance'}
)
# get_reclustering has written 'distance_to_center' onto tag_cluster_mapping;
# keep only the original three columns on the left side of the merge.
tag_cluster_mapping = pd.merge(
    tag_cluster_mapping[['tag', 'cluster', 'distance_to_center']],
    current_df2,
    on='tag',
)

tag_cluster_mapping


Unnamed: 0,tag,cluster,distance_to_center,new_cluster,new_distance
0,宽容和包容,122,0.181514,122,0.181514
1,忠诚与正义,94,0.260433,94,0.260433
2,家族矛盾,101,0.202677,101,0.202677
3,反腐斗争,0,0.223768,0,0.223768
4,人与自然的关系,80,0.267835,80,0.267835
...,...,...,...,...,...
422,乡村文化,33,0.284854,318,0.000000
423,道德与选择,63,0.206565,63,0.206565
424,军事题材,85,0.297091,295,0.000000
425,政治阴谋,75,0.239292,75,0.239292


In [28]:
# Alias, not a copy: the columns added below also appear on tag_cluster_mapping.
data = tag_cluster_mapping

# Tags that create_new_clusters promoted to centers coincide with their
# center, so their 'new_distance' is 0.0; those tags seed the re-fit.
new_centers_indices = data[data['new_distance'] == 0.0].index
new_centers_tags = data.loc[new_centers_indices, 'tag']
new_centers_embeddings = np.array([theme_embeddings_dict[tag] for tag in new_centers_tags])
if len(new_centers_embeddings) == 0:
    raise ValueError("No tag has new_distance == 0.0, so there are no centers to re-fit KMeans with.")

# Re-run KMeans with the new cluster centers.
# NOTE: this rebinds the notebook-level `kmeans` and `all_embeddings`, which
# get_reclustering reads together with the older `labels`; re-run the initial
# clustering cell before calling get_reclustering again.
kmeans = KMeans(n_clusters=len(new_centers_embeddings), init=new_centers_embeddings, n_init=1)
all_embeddings = np.array([theme_embeddings_dict[tag] for tag in data['tag']])
kmeans.fit(all_embeddings)

# Assign new clusters and calculate distances: one vectorised pairwise
# distance matrix, then the minimum over centers for each tag.
data['re_cluster'] = kmeans.labels_
data['re_distance_to_center'] = cdist(all_embeddings, kmeans.cluster_centers_, metric='euclidean').min(axis=1)



In [29]:
# Display the tag table, now including the re_cluster / re_distance_to_center columns.
data

Unnamed: 0,tag,cluster,distance_to_center,new_cluster,new_distance,re_cluster,re_distance_to_center
0,宽容和包容,122,0.181514,122,0.181514,196,2.629895e-01
1,忠诚与正义,94,0.260433,94,0.260433,3,2.447959e-01
2,家族矛盾,101,0.202677,101,0.202677,165,2.279138e-01
3,反腐斗争,0,0.223768,0,0.223768,185,2.820302e-01
4,人与自然的关系,80,0.267835,80,0.267835,104,1.891462e-01
...,...,...,...,...,...,...,...
422,乡村文化,33,0.284854,318,0.000000,210,2.189964e-17
423,道德与选择,63,0.206565,63,0.206565,195,2.784742e-01
424,军事题材,85,0.297091,295,0.000000,211,2.849175e-01
425,政治阴谋,75,0.239292,75,0.239292,103,2.571449e-01


In [22]:
# Export the tag table, ordered by re-fitted cluster id, to an Excel workbook
# without the row index.
data.sort_values(by='re_cluster').to_excel(excel_writer='data_emomap_95p.xlsx', index=False)

In [30]:
# Run the standalone script theme_cluster.py in a shell; whatever it prints is shown as this cell's output.
! python3 theme_cluster.py

       tag cluster
1    控制与解脱       0
2     警匪对决       1
210  生死与和平       2
245  生命与死亡       2
412  生死与牺牲       2
..     ...     ...
75   调查与解谜     208
420  智慧与谜题     208
421   抗日救国     209
423   金钱欲望     210
426   时间错乱     211

[427 rows x 2 columns]
