In [13]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 加载SBERT模型
model_path = '/root/data/NewsAthm/sentence-transformers/distiluse-base-multilingual-cased-v2'
# model_path = 'distiluse-base-multilingual-cased-v2'
sbert_model = SentenceTransformer(model_path)


# 加载数据
data = pd.read_csv('Data231202-231211.csv')

# 将日期转换为日期时间格式
data['pub_time'] = pd.to_datetime(data['pub_time'])

# 获取唯一日期列表
dates = data['pub_time'].dt.date.unique()

# 设置阈值
threshold = 0.8

# 定义簇列表
clusters = []

# 定义聚类中心更新函数
def update_cluster_center(cluster):
    cluster_embeddings = sbert_model.encode(cluster)
    return np.mean(cluster_embeddings, axis=0)

# 对于每个日期
for date in dates:
    # 获取该日期的新闻数据
    news_data = data[data['pub_time'].dt.date == date]['body'].tolist()
    
    # 使用SBERT模型获取语义向量
    embeddings = sbert_model.encode(news_data)
    
    # 对于每个新闻数据
    for i, embedding in enumerate(embeddings):
        # 如果簇列表为空，则新开一个簇
        if not clusters:
            clusters.append({'center': embedding, 'members': [news_data[i]]})
            continue
        
        # 计算当前数据点与各个簇中心的相似度
        similarities = [cosine_similarity([embedding], [cluster['center']])[0][0] for cluster in clusters]
        
        # 找到最大相似度及其对应的簇索引
        max_similarity = max(similarities)
        max_index = similarities.index(max_similarity)
        
        # 如果最大相似度大于阈值，则将当前数据点加入对应簇，并更新簇中心
        if max_similarity > threshold:
            clusters[max_index]['members'].append(news_data[i])
            clusters[max_index]['center'] = update_cluster_center(clusters[max_index]['members'])
        # 否则新开一个簇
        else:
            clusters.append({'center': embedding, 'members': [news_data[i]]})

# 输出聚类结果
for i, cluster in enumerate(clusters):
    print(f"Cluster {i + 1}:")
    print(f"Number of news articles: {len(cluster['members'])}")
    print("News articles:")
    for news_article in cluster['members']:
        print(news_article)
    print()


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 加载SBERT模型
model_path = '/root/data/NewsAthm/sentence-transformers/distiluse-base-multilingual-cased-v2'
# model_path = 'distiluse-base-multilingual-cased-v2'
sbert_model = SentenceTransformer(model_path)


# 加载数据
data = pd.read_csv('Data231202-231211.csv')

# 将日期转换为日期时间格式
data['pub_time'] = pd.to_datetime(data['pub_time'])

# 获取唯一日期列表
dates = data['pub_time'].dt.date.unique()

# 设置阈值
threshold = 0.8

# 定义簇列表
clusters = []

# 定义聚类中心更新函数
def update_cluster_center(cluster):
    cluster_embeddings = sbert_model.encode(cluster)
    return np.mean(cluster_embeddings, axis=0)

# 定义写入文件函数
def write_to_file(file_path, clusters):
    with open(file_path, 'w') as file:
        for cluster_info in clusters:
            file.write(f"Cluster {cluster_info['date']}:\n")
            file.write(f"Number of clusters: {len(cluster_info['clusters'])}\n")
            for i, cluster in enumerate(cluster_info['clusters']):
                file.write(f"Cluster {i + 1}:\n")
                file.write(f"Number of news articles: {len(cluster['members'])}\n")
                file.write("News articles:\n")
                for news_article in cluster['members']:
                    file.write(news_article + '\n')
                file.write('\n')

# 对于每个日期
cluster_results = []
for date in dates:
    # 获取该日期的新闻数据
    news_data = data[data['pub_time'].dt.date == date]['body'].tolist()
    
    # 使用SBERT模型获取语义向量
    embeddings = sbert_model.encode(news_data)
    
    # 定义当天的簇列表
    daily_clusters = []
    
    # 对于每个新闻数据
    for i, embedding in enumerate(embeddings):
        # 如果簇列表为空，则新开一个簇
        if not daily_clusters:
            daily_clusters.append({'center': embedding, 'members': [news_data[i]]})
            continue
        
        # 计算当前数据点与各个簇中心的相似度
        similarities = [cosine_similarity([embedding], [cluster['center']])[0][0] for cluster in daily_clusters]
        
        # 找到最大相似度及其对应的簇索引
        max_similarity = max(similarities)
        max_index = similarities.index(max_similarity)
        
        # 如果最大相似度大于阈值，则将当前数据点加入对应簇，并更新簇中心
        if max_similarity > threshold:
            daily_clusters[max_index]['members'].append(news_data[i])
            daily_clusters[max_index]['center'] = update_cluster_center(daily_clusters[max_index]['members'])
        # 否则新开一个簇
        else:
            daily_clusters.append({'center': embedding, 'members': [news_data[i]]})
    
    # 将当天的簇信息添加到结果列表中
    cluster_results.append({'date': date, 'clusters': daily_clusters})

# 将聚类结果写入到新文件中
write_to_file('cluster_results.txt', cluster_results)
