向量化

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import torch

# Load the event names that will be embedded and clustered.
df_cluster = pd.read_csv('list.csv')
event_list = df_cluster['事件名称'].tolist()
print(df_cluster['事件名称'])
print(event_list)
print(len(event_list))

model_path = 'models/bge-large-zh-v1.5'  # change this to your local model path
tokenizer = BertTokenizer.from_pretrained(model_path)
model_config = BertConfig.from_pretrained(model_path)
# NOTE(review): hidden states and attentions are requested but never read in
# this cell, so here they only cost memory -- confirm nothing else needs them.
model_config.output_hidden_states = True
model_config.output_attentions = True
bert_model = BertModel.from_pretrained(model_path, config=model_config)

# padding=True pads every sentence to the longest one in the batch. The
# attention mask marks which positions are real tokens; it has to reach the
# model and the pooling step, otherwise the padding leaks into the vectors.
encoded = tokenizer(event_list, padding=True, truncation=True, return_tensors='pt')
input_ids = encoded['input_ids']
attention_mask = encoded['attention_mask']

with torch.no_grad():
    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    token_states = outputs.last_hidden_state
    # Masked mean pooling: sum the hidden states of real tokens only and
    # divide by the number of real tokens in each sentence.
    mask = attention_mask.unsqueeze(-1).to(token_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = ((token_states * mask).sum(dim=1) / token_counts).numpy()

实现层次聚类

In [None]:
# Imported here because no earlier cell brings these two names into scope.
import numpy as np
from sklearn.cluster import KMeans

# Number of clusters at each level, from finest to coarsest.
CLUSTER_LEVELS = [2000, 500, 100, 20, 2]
cluster_data = pd.DataFrame({'事件名称': event_list, '向量': list(embeddings)})

for iteration, current_k in enumerate(CLUSTER_LEVELS):
    print(f"正在执行第 {iteration} 层聚类，K={current_k}")

    vectors = np.vstack(cluster_data['向量'].values)
    kmeans = KMeans(n_clusters=current_k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(vectors)
    cluster_data['聚类类别'] = cluster_labels

    # Persist this level's assignment; later cells read cluster_level_{i}.csv
    # and expect the event name and cluster id columns. The vector column is
    # left out on purpose: it is not needed downstream.
    file_name = f"cluster_level_{iteration}.csv"
    cluster_data[['事件名称', '聚类类别']].to_csv(file_name, index=False)

    # Replace every event's vector with the centre of its cluster, so the
    # next (coarser) level clusters the centres of this level.
    cluster_centers = kmeans.cluster_centers_
    cluster_data['向量'] = cluster_data['聚类类别'].map(lambda c: cluster_centers[c])

In [None]:
from mpl_toolkits.mplot3d import Axes3D
# Project the embeddings onto their first three principal components and
# draw them as a 3D scatter plot coloured by the current cluster labels.
pca = PCA(n_components=3)
reduced_embeddings = pca.fit_transform(embeddings)

fig = plt.figure(figsize=(18, 16))
ax = fig.add_subplot(111, projection='3d')

# One coordinate array per principal component.
pc1 = reduced_embeddings[:, 0]
pc2 = reduced_embeddings[:, 1]
pc3 = reduced_embeddings[:, 2]
scatter = ax.scatter(pc1, pc2, pc3, c=cluster_labels, cmap='viridis', alpha=0.7)
plt.colorbar(scatter, label="Cluster Labels")

ax.set_title('BERT Embeddings - KMeans Clustering')
for set_axis_label, axis_text in (
    (ax.set_xlabel, 'PCA Component 1'),
    (ax.set_ylabel, 'PCA Component 2'),
    (ax.set_zlabel, 'PCA Component 3'),
):
    set_axis_label(axis_text)

plt.show()

In [None]:
import pandas as pd
# Build one summary file per clustering level: one row per cluster with all
# of its event names joined into a single comma-separated string.
for i in range(5):
    file_path = f"cluster_level_{i}.csv"
    df = pd.read_csv(file_path)

    # Missing names are skipped and the rest coerced to str, because
    # str.join raises TypeError on NaN or numeric values.
    agg_df = (
        df.groupby('聚类类别')['事件名称']
        .apply(lambda names: ', '.join(names.dropna().astype(str)))
        .reset_index()
    )
    agg_df.rename(columns={'聚类类别': '主题', '事件名称': '主题词'}, inplace=True)

    # Order topics numerically. The earlier string cast + lexicographic sort
    # was dropped: it was immediately overridden by this integer sort.
    agg_df['主题'] = agg_df['主题'].astype(int)
    agg_df = agg_df.sort_values(by='主题')

    output_path = f"{i}_cluster_summary.csv"
    agg_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print("转换完成，结果已保存至:", output_path)


为每一个类别加标签

In [None]:
from zhipuai import ZhipuAI
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
from openai import OpenAI
import os

# Which clustering level (number of clusters) is being labelled; it selects
# the clusters_label_{count}.csv file used below.
count = 500
# Take the API key from the ZHIPUAI_API_KEY environment variable when it is
# set, so the secret does not have to be committed with the notebook. The
# placeholder is kept as the fallback: replace it with your own key if you
# are not using the environment variable.
client = ZhipuAI(api_key=os.environ.get("ZHIPUAI_API_KEY", "Your_key"))
def generate_event(i, content):
    """Ask the chat model for a single short topic label covering ``content``.

    Args:
        i: Identifier of the item being labelled; returned unchanged so the
            caller can match the answer to its row.
        content: The event names to summarise. A list is joined with ", ";
            any other value is converted with ``str``.

    Returns:
        ``(i, label)`` where ``label`` is the message text of the first
        choice in the response, or ``(i, None)`` if the request raised.
    """
    if isinstance(content, list):
        content_str = ', '.join(content)
    else:
        content_str = str(content)

    # The leading whitespace on the continuation lines is part of the prompt.
    # NOTE(review): the final quote of the example output is consumed by the
    # closing triple-quote delimiter, so the example ends without its closing
    # quotation mark -- confirm whether that is intended.
    prompt = f"""给出以下社会活动或气象活动事件的共同主题，只给出一个主题：{content_str}。要求输出主题标签，主题标签也是事件名称，只输出一个标签，字数要求少于10个字，不要解释，不要有特殊字符
                 示例：
                 输入："滑雪场客流量大, 部分桥区节点行驶缓慢, 景区被评为5A级景区, 城区环路、联络线交通流量大, 医院是交通热点, 景点周边大车流, 游客不文明拍照行为, 清明小长假游客增多, 看玻璃栈道游客车辆拥堵, 黄牛炒卖门票, 部分商圈周边车流量大"
                 输出："游客增多"""
    user_message = {"role": "user", "content": prompt}

    try:
        response = client.chat.completions.create(
            model="GLM-4-Air",
            messages=[user_message],
        )
        first_choice = response.choices[0]
        output_text = first_choice.message.content
        print(output_text)
        return i, output_text
    except Exception as e:
        # Best effort: report the failure and let the caller skip this item.
        print(f"Error processing index {i}: {e}")
        return i, None

# Read the cluster summaries that need a label.
input_path = f'clusters_label_{count}.csv'
output_path = f'clusters_label_{count}.csv'
# output_path is the same file as input_path, so progress snapshots go to a
# separate file; writing them to output_path would replace the input with a
# partial result if the run is interrupted.
checkpoint_path = f'clusters_label_{count}_partial.csv'

df_label = pd.read_csv(input_path)
# Because the result overwrites the input, a re-run finds the 'label' and
# 'index' columns of the previous run. Drop them so the merge below does not
# produce suffixed duplicates (label_x / label_y) and break the selection.
df_label = df_label.drop(columns=['label', 'index'], errors='ignore')
event_list = df_label['主题词'].tolist()

results = []
with ThreadPoolExecutor(max_workers=1) as executor:
    futures = [
        executor.submit(generate_event, idx, content)
        for idx, content in zip(df_label.index, event_list)
    ]
    for future in as_completed(futures):
        idx, output_text = future.result()
        if output_text is not None:
            results.append({'index': idx, 'label': output_text})
            # Snapshot after every successful answer so finished work
            # survives a crash.
            pd.DataFrame(results).to_csv(checkpoint_path, index=False)

# Explicit columns keep the merge key present even when every request failed
# and `results` is empty.
results_df = pd.DataFrame(results, columns=['index', 'label'])
results_df['index'] = results_df['index'].astype('int64')
df_label = df_label.merge(results_df, left_index=True, right_on='index', how='left')
df_label = df_label[['数量', 'label_id', 'label', '主题词', 'index']]
df_label.to_csv(output_path, index=False)

In [None]:
import pandas as pd

base_path = "cluster/"
file_names = [f"cluster_level_{i}.csv" for i in range(5)]
output_file = base_path + "merged_clusters.csv"
# NOTE(review): the first label table is read from the working directory
# while the other two come from base_path -- confirm this is intended.
df_label = pd.read_csv('clusters_label_20.csv')
df_label_ = pd.read_csv('cluster/clusters_label_100.csv')
df_label_500 = pd.read_csv('cluster/clusters_label_500.csv')

# Load every clustering level and give its cluster column a level-specific
# name (聚类类别1 .. 聚类类别5) so the levels can sit side by side.
dfs = []
for i, file_name in enumerate(file_names, start=1):
    file_path = base_path + file_name
    df = pd.read_csv(file_path)
    df = df.rename(columns={"聚类类别": f"聚类类别{i}"})
    dfs.append(df)

# NOTE(review): the join key is the event name; if names repeat within a
# file, each inner merge multiplies the matching rows -- confirm names are
# unique.
merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = pd.merge(merged_df, df, on="事件名称", how="inner")

merged_df['聚类类别5'] = merged_df['聚类类别5'].map({0: "社会活动事件", 1: "气象活动事件"})

# Replace numeric cluster ids with their text labels. A vectorised map is
# used instead of `merged_df[col][i] = ...`: that chained assignment is not
# guaranteed to write into merged_df (and never does under pandas
# copy-on-write), and it loops over rows in Python.
label_tables = {
    '聚类类别4': df_label,
    '聚类类别3': df_label_,
    '聚类类别2': df_label_500,
}
for column, label_table in label_tables.items():
    # First row wins when a label_id occurs more than once.
    lookup = label_table.drop_duplicates(subset='label_id').set_index('label_id')['label']
    unknown_ids = merged_df.loc[~merged_df[column].isin(lookup.index), column].unique()
    if len(unknown_ids):
        # Fail loudly instead of writing empty labels.
        raise ValueError(f"{column}: no label found for cluster ids {list(unknown_ids)}")
    merged_df[column] = merged_df[column].map(lookup)

# Save the final merged file.
merged_df.to_csv(output_file, index=False)

print(f"合并完成，文件已保存至 {output_file}")


计算每类事件的概率

In [None]:
import pandas as pd
count = 20
df = pd.read_csv(f'cluster/news_lable_{count}.csv')

# For every label, count the number of distinct time values it appears at.
event_time_counts = df.groupby('标签')['时间'].nunique().reset_index()

# df_label is not loaded in this cell; it must already exist in the session.
# Drop any '数量' / '标签' columns it carries over from an earlier run:
# otherwise renaming '时间' to '数量' yields two columns with the same name
# (the sort then fails), and a duplicate '标签' would be suffixed by merge.
label_table = df_label.drop(columns=['数量', '标签'], errors='ignore')
event_time_counts = label_table.merge(
    event_time_counts, left_on='label', right_on='标签', how='left'
)
event_time_counts.rename(columns={'时间': '数量'}, inplace=True)
# A label with no matching row in the news file has a count of zero, not a
# missing value.
event_time_counts['数量'] = event_time_counts['数量'].fillna(0).astype(int)

event_time_counts = event_time_counts.sort_values(by='数量', ascending=False)
event_time_counts = event_time_counts[['数量', 'label', '主题词', '标签']]
event_time_counts.to_csv(f'cluster/clusters_label_{count}.csv', index=False, encoding='utf-8-sig')
print("统计结果已保存")

In [None]:
import pandas as pd
import itertools
count = 20
# Labelled news rows; rows without a label are discarded.
df_lable = pd.read_csv(f'cluster/news_lable_{count}.csv').dropna(subset=['标签'])

unique_times = df_lable['时间'].unique()
unique_events = df_lable['标签'].unique()

# Every (time, label) combination, so that combinations with no event still
# get a row.
time_event_pairs = [(when, label) for when in unique_times for label in unique_events]

# Number of rows observed for each (time, label) pair.
event_counts = (
    df_lable.groupby(['时间', '标签'])
    .size()
    .rename('事件数量')
    .reset_index()
)

# Attach the observed counts to the full grid; pairs that never occurred
# come back empty from the left join and are set to 0.
df_time_event = pd.merge(
    pd.DataFrame(time_event_pairs, columns=['时间', '标签']),
    event_counts,
    how='left',
    on=['时间', '标签'],
)
df_time_event['事件数量'] = df_time_event['事件数量'].fillna(0)

# Wide layout: one row per time value, one column per label.
pivot_table = df_time_event.pivot(index='时间', columns='标签', values='事件数量')

print(pivot_table)
pivot_table.to_csv(f'cluster/hourly_{count}.csv')
print(pivot_table.shape)


In [None]:
import pandas as pd
count = 20
df_time = pd.read_csv(f'cluster/hourly_{count}.csv')
df_qx = pd.read_csv('获取的气象数据.csv')  # change this to your weather-data file path

# The pivot table saved to this path uses '时间' as its index, so the file
# has no 'date' column unless it was renamed by hand. Fall back to the first
# column, which the original column slicing also assumed is the time column.
if 'date' not in df_time.columns:
    df_time = df_time.rename(columns={df_time.columns[0]: 'date'})

# Keep only the YYYY-MM-DD part of the time value; rows without one are dropped.
df_time['date'] = pd.to_datetime(
    df_time['date'].astype(str).str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False),
    errors='coerce'
)
df_time = df_time.dropna(subset=['date']).copy()

# Event columns are everything except the date and the weather columns. This
# cell writes its result back to the file it reads, so on a re-run the file
# already contains the weather columns; excluding them keeps them from being
# treated (and clipped) as events.
weather_cols = set(df_qx.columns)
event_cols = [c for c in df_time.columns if c != 'date' and c not in weather_cols]

# One row per calendar date holding 0/1 indicators: counts are capped at 1,
# and rows that share a date after truncation are collapsed with max so the
# left join below cannot multiply weather rows.
df_time = df_time.groupby('date')[event_cols].max().clip(upper=1.0)

print(df_qx.columns)
df_qx['date'] = pd.to_datetime(df_qx['date'], errors='coerce')
df_qx = df_qx.dropna(subset=['date']).copy()

# Join key: the weather timestamp truncated to midnight.
df_qx['date_key'] = df_qx['date'].dt.normalize()

merged = df_qx.merge(
    df_time,
    left_on='date_key',
    right_index=True,
    how='left',
    suffixes=('', '_event')
)

# Dates with no event record mean "no event happened".
merged[event_cols] = merged[event_cols].fillna(0)

merged.drop(columns=['date_key'], inplace=True)

# NOTE(review): this overwrites the input file read at the top of the cell.
output_path = f'cluster/hourly_{count}.csv'
merged.to_csv(output_path, index=False)
print(len(merged.columns))
print(f"合并后的数据已保存到 {output_path}")