In [None]:
import os

# Register the xmnlp model directory in the process environment
os.environ.update({'XMNLP_MODEL': 'xmnlp-onnx-models'})

# Echo the value back to confirm that it is set
print(os.getenv('XMNLP_MODEL'))

In [None]:
# MLP训练模型
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from geopy.distance import geodesic
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the POI attribute table and the table of labelled POI pairs
shurupoi = pd.read_csv('shurupoi.csv')
biaozhupoi = pd.read_csv('biaozhupoi.csv')

# Later lookups select a POI by 'id' and expect exactly one row. Raise
# explicitly instead of using `assert`, which is stripped under `python -O`.
if not shurupoi['id'].is_unique:
    duplicated_ids = shurupoi.loc[shurupoi['id'].duplicated(), 'id'].unique().tolist()
    raise ValueError(f"shurupoi.csv contains duplicate POI ids: {duplicated_ids}")

# Distance feature
def calculate_distance(p1, p2):
    """Return a proximity score derived from the geodesic distance of two POIs.

    The score is ``(3000 - metres) / 3000``: 1 when the points coincide,
    0 at 3000 m and negative beyond that. Both arguments must expose
    ``'lat'`` and ``'lng'`` entries.
    """
    origin = (p1['lat'], p1['lng'])
    target = (p2['lat'], p2['lng'])
    metres_apart = geodesic(origin, target).meters
    return (3000 - metres_apart) / 3000

# Category feature
def calculate_category_similarity(p1, p2):
    """Score how closely the categories of two POIs agree.

    Returns 1 when both the ``'t'`` and ``'c'`` entries match, 0.7 when only
    ``'t'`` matches, and 0.3 otherwise.
    """
    same_t = p1['t'] == p2['t']
    if not same_t:
        return 0.3
    if p1['c'] == p2['c']:
        return 1
    return 0.7

# Name feature
def calculate_name_similarity(p1, p2):
    """Return the Jaccard overlap of the element sets of the two ``'name'`` values.

    For string names the elements are individual characters. Returns 0 when
    both names are empty.
    """
    # NOTE: 'name' must be iterable (a string); a list would have to be
    # converted to a string first, as the original author's comment points out.
    chars_a = set(p1['name'])
    chars_b = set(p2['name'])
    combined = chars_a | chars_b
    if not combined:
        return 0
    return len(chars_a & chars_b) / len(combined)

# Comment-count feature
def calculate_comment_similarity(p1, p2):
    """Return ``2 * min(q1, q2) / (q1 + q2)`` for the ``'q'`` entries of two POIs.

    Returns 0 when the sum of the two values is not positive.
    """
    total = p1['q'] + p2['q']
    if total > 0:
        return (2 * min(p1['q'], p2['q'])) / total
    return 0

class PoiDataset(Dataset):
    """Feature/label pairs for every labelled POI pair in ``biaozhupoi``.

    Each sample has five features: distance score, category similarity, name
    similarity, comment-count similarity and the pair's rank among the rows
    that share the same source ``id``. The label is the pair's ``ss`` value.
    """

    def __init__(self, shurupoi, biaozhupoi):
        """Pre-compute the features and labels of all labelled pairs.

        Args:
            shurupoi: DataFrame of POIs, looked up through its ``id`` column
                and passed row-wise to the similarity helpers.
            biaozhupoi: DataFrame of labelled pairs with ``id``, ``pid`` and
                ``ss`` columns. It is not modified.

        Raises:
            ValueError: if an ``id`` or ``pid`` does not match exactly one
                row of ``shurupoi``.
        """
        self.features = []
        self.labels = []

        # Rank of each pair among the rows with the same source id (0 for the
        # first). Kept in a local list so the caller's DataFrame is not
        # silently given an extra column.
        sort_priorities = biaozhupoi.groupby('id').cumcount().tolist()

        for (_, row), sort_priority in zip(biaozhupoi.iterrows(), sort_priorities):
            p1 = self._single_poi(shurupoi, row['id'], "ID")
            p2 = self._single_poi(shurupoi, row['pid'], "PID")

            distance = calculate_distance(p1, p2)
            category_similarity = calculate_category_similarity(p1, p2)
            name_similarity = calculate_name_similarity(p1, p2)
            comment_similarity = calculate_comment_similarity(p1, p2)

            # Lower the name similarity of later-ranked pairs by a tiny amount
            # so the ordering inside a group is reflected in the features.
            if sort_priority != 0:
                name_similarity += -0.001 * sort_priority

            self.features.append([distance, category_similarity, name_similarity, comment_similarity, sort_priority])
            self.labels.append(row['ss'])

    @staticmethod
    def _single_poi(pois, poi_id, role):
        """Return the single row of ``pois`` whose ``id`` equals ``poi_id``."""
        matches = pois[pois['id'] == poi_id]
        if len(matches) != 1:
            raise ValueError(
                f"There should be exactly one match for {role}. "
                f"(found {len(matches)} for {poi_id!r})"
            )
        return matches.iloc[0]

    def __len__(self):
        """Return the number of labelled pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` as a float32 tensor and an int64 tensor."""
        # NOTE(review): the label is stored as torch.long, which truncates any
        # fractional 'ss' value -- confirm that 'ss' is integer-valued.
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

# Build the dataset of labelled POI pairs and a shuffling loader that yields
# mini-batches of 32 samples
dataset = PoiDataset(shurupoi, biaozhupoi)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Multi-layer perceptron used as the similarity regressor
class MLP(nn.Module):
    """Three-layer perceptron mapping 5 input features to one continuous value."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(5, 64)
        self.fc2 = nn.Linear(64, 32)
        # Single output unit: the network predicts a continuous score
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        """Apply two ReLU-activated hidden layers followed by a linear output layer."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Instantiate the model, the loss function and the optimizer
model = MLP()
criterion = nn.MSELoss()  # mean-squared-error loss: the model is trained as a regressor
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Evaluation
def evaluate(model, dataloader):
    """Return ``(mse, mae)`` of ``model`` over every batch of ``dataloader``.

    The model is switched to evaluation mode and gradients are disabled while
    predictions are collected.
    """
    model.eval()
    targets, predictions = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            # Flatten the output so it lines up with the label vector
            batch_outputs = model(batch_inputs).view(-1)
            targets += batch_labels.tolist()
            predictions += batch_outputs.tolist()

    return mean_squared_error(targets, predictions), mean_absolute_error(targets, predictions)

# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs).view(-1)  # flatten so the output shape matches the label shape
        loss = criterion(outputs, labels.float())  # MSE needs float targets
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so the epoch loss below is a
        # per-sample average
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

# Evaluate the model once training has finished
# NOTE(review): this evaluates on the same dataloader that was used for
# training, so the reported errors are training errors, not held-out errors.
mse, mae = evaluate(model, dataloader)
print(f'Mean Squared Error: {mse:.4f}, Mean Absolute Error: {mae:.4f}')

In [None]:
# 景点类选择KNN图示
# 不去重和近义词
import pandas as pd
import torch
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from word_similarity import WordSimilarity2010
import matplotlib.pyplot as plt
import os

# Name clean-up helper: removes parenthesised groups
def clean_name(name):
    """Return ``name`` without its ASCII-parenthesised groups, stripped of outer whitespace.

    Each ``(...)`` group is removed together with its content. Groups are
    removed innermost-first and the pass is repeated until none is left, so
    nested groups disappear completely while the text *between* two separate
    groups is kept (a single greedy ``\\(.*\\)`` would delete it as well).
    Unbalanced brackets are left untouched.
    """
    # NOTE(review): full-width brackets are not matched -- confirm whether the
    # POI names use them.
    while True:
        name, removed = re.subn(r'\([^()]*\)', '', name)
        if not removed:
            return name.strip()

# Revised name-similarity function
def calculate_name_similarity(p1, p2):
    """Return the character-level Jaccard overlap of the two cleaned POI names.

    Each ``'name'`` is first passed through ``clean_name``. Returns 0 when
    both cleaned names are empty.
    """
    chars1, chars2 = (set(clean_name(poi['name'])) for poi in (p1, p2))
    all_chars = chars1 | chars2
    if not all_chars:
        return 0
    return len(chars1 & chars2) / len(all_chars)

# The remaining code is unchanged...
# Load the POI table used for the similarity search (the variable is called
# shurupoi1, but the file that is read is shurupoi5.csv)
shurupoi1 = pd.read_csv('shurupoi5.csv')

# Rank every other POI of shurupoi1 against one query POI
def get_similar_pois_within_shurupoi1(model, poi_id):
    """Score every other POI in the module-level ``shurupoi1`` table against ``poi_id``.

    Args:
        model: callable that maps a 5-element float32 tensor to a
            single-element tensor (the trained network).
        poi_id: value of the ``id`` column identifying the query POI.

    Returns:
        A list of tuples ``(id, distance, category_similarity,
        name_similarity, comment_similarity, score)``, one per other POI,
        sorted by ``score`` in descending order.

    Raises:
        IndexError: if no row of ``shurupoi1`` has ``id == poi_id``.
    """
    input_poi = shurupoi1.loc[shurupoi1['id'] == poi_id].iloc[0]

    similarity_scores = []

    # Pure inference: disable autograd so no graph is recorded per prediction
    with torch.no_grad():
        for _, candidate in shurupoi1.iterrows():
            if candidate['id'] == poi_id:
                continue  # never compare the POI with itself

            distance = calculate_distance(input_poi, candidate)
            category_similarity = calculate_category_similarity(input_poi, candidate)
            name_similarity = calculate_name_similarity(input_poi, candidate)
            comment_similarity = calculate_comment_similarity(input_poi, candidate)
            # No ranking information exists at prediction time, so the fifth
            # feature is fixed to 0
            sort_priority = 0

            input_tensor = torch.tensor(
                [distance, category_similarity, name_similarity, comment_similarity, sort_priority],
                dtype=torch.float32,
            )
            # The single output value is used directly as the similarity score
            similarity_score = model(input_tensor).item()
            similarity_scores.append(
                (candidate['id'], distance, category_similarity, name_similarity, comment_similarity, similarity_score)
            )

    # Highest predicted similarity first
    similarity_scores.sort(key=lambda x: x[5], reverse=True)

    return similarity_scores

# Example: score every other POI of shurupoi1 against the POI with this id
poi_id = 11
similar_pois = get_similar_pois_within_shurupoi1(model, poi_id)

# Print the 15 best-scoring candidates
for poi in similar_pois[:15]:
    print(f"Poi ID: {poi[0]}, Distance: {poi[1]:.2f}, Category Similarity: {poi[2]:.2f}, Name Similarity: {poi[3]:.2f}, Comment Similarity: {poi[4]:.2f}, Final Score: {poi[5]:.2f}")

# IDs of the (up to) 10 POIs most similar to poi_id
top_poi_ids = [poi[0] for poi in similar_pois[:10]]

# Aspect terms of each neighbour: second column of its ASTE5 CSV, empty cells
# dropped, duplicates and synonyms deliberately kept
aspect_sets = []

for neighbor_id in top_poi_ids:  # not named `id`: that would shadow the builtin
    filepath = f"ASTE5/{neighbor_id}.csv"
    df = pd.read_csv(filepath, dtype=str)  # read every column as text
    aspects = df.iloc[:, 1].dropna().tolist()
    aspect_sets.append(aspects)

# Aspect terms of the target POI, read the same way
target_filepath = f"ASTE5/{poi_id}.csv"
target_df = pd.read_csv(target_filepath, dtype=str)  # read every column as text
aspects = target_df.iloc[:, 1].dropna().tolist()
target_aspects = aspects
target_aspect_set = set(target_aspects)

# SIP of each neighbour: Jaccard overlap of its aspect terms with the target's
sips = []
for neighbor_id, aspect_set in zip(top_poi_ids, aspect_sets):
    neighbor_terms = set(aspect_set)
    intersection = len(neighbor_terms & target_aspect_set)
    union = len(neighbor_terms | target_aspect_set)
    sip = intersection / union if union > 0 else 0
    sips.append((neighbor_id, sip))

# Highest SIP first; this order also drives the K sweep below
sips.sort(key=lambda x: x[1], reverse=True)

# Print the SIP values
for neighbor_id, sip in sips:
    print(f"ID: {neighbor_id}, SIP: {sip}")

# First aspect list recorded for each neighbour id
aspects_by_id = {}
for neighbor_id, aspect_set in zip(top_poi_ids, aspect_sets):
    aspects_by_id.setdefault(neighbor_id, aspect_set)

# K sweep: for K = 1..10 (or fewer when fewer neighbours exist) pool the aspect
# terms of the K highest-SIP neighbours and compute the SIP of that pool
k_sips = []
current_aspect_set = set()

for k, (top_poi_id, _) in enumerate(sips[:10], start=1):
    # The pool for K is the pool for K-1 plus the K-th neighbour's terms
    current_aspect_set |= set(aspects_by_id[top_poi_id])

    # SIP(pi, P) = |A_i ∩ A| / |A_i ∪ A|
    intersection = len(current_aspect_set & target_aspect_set)
    union = len(current_aspect_set | target_aspect_set)
    sip = intersection / union if union > 0 else 0
    k_sips.append((k, sip))

if not k_sips:
    raise ValueError(f"No neighbouring POIs were found for POI ID {poi_id}")

# K with the highest pooled SIP
max_k, max_sip = max(k_sips, key=lambda x: x[1])

# Report the best K and the POI ids that make up its pool
print(f"The highest SIP value is {max_sip} at K={max_k}")
print(f"The POI IDs corresponding to K={max_k} are:")
for top_poi_id, _ in sips[:max_k]:
    print(top_poi_id)

# Plot the pooled SIP as a function of K
k_values, sip_values = zip(*k_sips)
plt.plot(k_values, sip_values)
plt.xlabel("K")
plt.ylabel("SIP")
plt.title(f"SIP Changes with K (Ordered by SIP) for POI ID: {poi_id}")
plt.show()

In [None]:
#景点类计算分数示例
import pandas as pd
import torch
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from word_similarity import WordSimilarity2010
import matplotlib.pyplot as plt
import os
import numpy as np
import xmnlp
from xmnlp.sv import SentenceVector
import time

# Name clean-up helper: removes parenthesised groups
def clean_name(name):
    """Return ``name`` without its ASCII-parenthesised groups, stripped of outer whitespace.

    Each ``(...)`` group is removed together with its content. Groups are
    removed innermost-first and the pass is repeated until none is left, so
    nested groups disappear completely while the text *between* two separate
    groups is kept (a single greedy ``\\(.*\\)`` would delete it as well).
    Unbalanced brackets are left untouched.
    """
    # NOTE(review): full-width brackets are not matched -- confirm whether the
    # POI names use them.
    while True:
        name, removed = re.subn(r'\([^()]*\)', '', name)
        if not removed:
            return name.strip()

# Revised name-similarity function
def calculate_name_similarity(p1, p2):
    """Return the character-level Jaccard overlap of the two cleaned POI names.

    Each ``'name'`` is first passed through ``clean_name``. Returns 0 when
    both cleaned names are empty.
    """
    chars1, chars2 = (set(clean_name(poi['name'])) for poi in (p1, p2))
    all_chars = chars1 | chars2
    if not all_chars:
        return 0
    return len(chars1 & chars2) / len(all_chars)

# The remaining code is unchanged...
# Load the POI table used for the similarity search (the variable is called
# shurupoi1, but the file that is read is shurupoi5.csv)
shurupoi1 = pd.read_csv('shurupoi5.csv')

# Rank every other POI of shurupoi1 against one query POI
def get_similar_pois_within_shurupoi1(model, poi_id):
    """Score every other POI in the module-level ``shurupoi1`` table against ``poi_id``.

    Args:
        model: callable that maps a 5-element float32 tensor to a
            single-element tensor (the trained network).
        poi_id: value of the ``id`` column identifying the query POI.

    Returns:
        A list of tuples ``(id, distance, category_similarity,
        name_similarity, comment_similarity, score)``, one per other POI,
        sorted by ``score`` in descending order.

    Raises:
        IndexError: if no row of ``shurupoi1`` has ``id == poi_id``.
    """
    input_poi = shurupoi1.loc[shurupoi1['id'] == poi_id].iloc[0]

    similarity_scores = []

    # Pure inference: disable autograd so no graph is recorded per prediction
    with torch.no_grad():
        for _, candidate in shurupoi1.iterrows():
            if candidate['id'] == poi_id:
                continue  # never compare the POI with itself

            distance = calculate_distance(input_poi, candidate)
            category_similarity = calculate_category_similarity(input_poi, candidate)
            name_similarity = calculate_name_similarity(input_poi, candidate)
            comment_similarity = calculate_comment_similarity(input_poi, candidate)
            # No ranking information exists at prediction time, so the fifth
            # feature is fixed to 0
            sort_priority = 0

            input_tensor = torch.tensor(
                [distance, category_similarity, name_similarity, comment_similarity, sort_priority],
                dtype=torch.float32,
            )
            # The single output value is used directly as the similarity score
            similarity_score = model(input_tensor).item()
            similarity_scores.append(
                (candidate['id'], distance, category_similarity, name_similarity, comment_similarity, similarity_score)
            )

    # Highest predicted similarity first
    similarity_scores.sort(key=lambda x: x[5], reverse=True)

    return similarity_scores

# Sentiment helper
def sentiment_analysis(text):
    """Return the result of ``xmnlp.sentiment`` for ``text``."""
    polarity = xmnlp.sentiment(text)
    return polarity

# Sentiment-aware similarity
def adjusted_similarity(sv, query, doc):
    """Return the similarity of ``query`` and ``doc``, penalised for opposite sentiment.

    ``sv.similarity(query, doc)`` provides the raw value; when it is an array
    with more than one element its maximum is used. Only if that value exceeds
    0.6 is the sentiment of both texts analysed, and if exactly one of them
    has a negative probability above 0.5 the value is reduced by 0.6.
    """
    raw_similarities = sv.similarity(query, doc)

    has_many_values = isinstance(raw_similarities, np.ndarray) and raw_similarities.size > 1
    max_similarity = np.max(raw_similarities) if has_many_values else raw_similarities

    # Below the threshold the sentiment check is skipped entirely
    if not max_similarity > 0.6:
        return max_similarity

    query_sentiment = sentiment_analysis(query)
    doc_sentiment = sentiment_analysis(doc)

    # The first element of the sentiment result is used as the negative probability
    query_neg_prob = query_sentiment[0]
    doc_neg_prob = doc_sentiment[0]

    query_is_negative = query_neg_prob > 0.5
    query_is_positive = query_neg_prob <= 0.5
    doc_is_negative = doc_neg_prob > 0.5
    doc_is_positive = doc_neg_prob <= 0.5

    if (query_is_negative and doc_is_positive) or (query_is_positive and doc_is_negative):
        # Opposite polarity: lower the similarity
        return max_similarity - 0.6
    return max_similarity

# Shared SentenceVector instance used by the similarity helpers below
sv = SentenceVector(genre='通用')

# Synonym lookup used for the representativeness score
def find_synonyms(word, other_words, threshold_sv=0.75):
    """Return the entries of ``other_words`` that are similar to ``word``.

    An entry is kept when any value of ``sv.similarity(word, entry)`` exceeds
    ``threshold_sv``; ``sv`` is the module-level SentenceVector instance.
    """
    return [
        candidate
        for candidate in other_words
        if np.any(sv.similarity(word, candidate) > threshold_sv)
    ]

# Synonym lookup used for the uniqueness score
def find_synonymsU(word, other_words, threshold_sv=0.75):
    """Return the entries of ``other_words`` that are similar to ``word``.

    Same as ``find_synonyms`` except that the similarity comes from
    ``adjusted_similarity``, i.e. it is lowered when the two texts have
    opposite sentiment.
    """
    return [
        candidate
        for candidate in other_words
        if np.any(adjusted_similarity(sv, word, candidate) > threshold_sv)
    ]

# Directories holding the per-POI sentence files and the per-POI CSV files
text_dir, csv_dir = 'chulipoi5', 'ASTE5'

# Load one sentence file
def load_target_text_file(filename):
    """Read ``text_dir/filename`` and return ``(sentences, topics)``.

    Every line must have the form ``<sentence>$$$<tail>``; the topic is the
    integer after the last ``##`` of the tail. A missing file is reported on
    stdout and yields two empty lists.
    """
    filepath = os.path.join(text_dir, filename)
    sentences, topics = [], []
    if not os.path.exists(filepath):
        print(f"File {filepath} does not exist.")
        return sentences, topics

    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            text_part, topic_part = raw_line.strip().split('$$$')
            sentences.append(text_part.strip())
            topics.append(int(topic_part.split('##')[-1]))
    return sentences, topics

# Load one CSV file
def load_target_csv_file(filename):
    """Return ``csv_dir/filename`` as a DataFrame, or ``None`` if the file is missing.

    A missing file is reported on stdout.
    """
    filepath = os.path.join(csv_dir, filename)
    if os.path.exists(filepath):
        return pd.read_csv(filepath)

    print(f"File {filepath} does not exist.")
    return None

def _load_distinct_aspects(csv_path, ws_tool, threshold=0.65):
    """Return the aspect terms (second CSV column) without duplicates and near-synonyms."""
    frame = pd.read_csv(csv_path, dtype=str)  # read every column as text
    distinct = []
    for aspect in set(frame.iloc[:, 1].dropna().tolist()):
        # Keep a term only if it is not too similar to one that is already kept
        if not any(ws_tool.similarity(aspect, kept) > threshold for kept in distinct):
            distinct.append(aspect)
    return set(distinct)


# Process a single POI id
def process_poi(poi_id):
    """Score the sentences of one POI for representativeness, uniqueness and diversity.

    Steps:
        1. Rank the other POIs with the module-level ``model`` and keep the
           ten best.
        2. Among those, pick the (up to) two whose de-duplicated aspect terms
           overlap most with this POI's (SIP, a Jaccard index).
        3. Load the sentences/topics from ``<text_dir>/<poi_id>.txt`` and the
           aspect/opinion rows from ``<csv_dir>/<poi_id>.csv``.
        4. Keep only the sentences of the five most frequent topics.
        5. Compute the normalised REP and UNQ scores and the DIV score.

    The ``id`` column of the CSV refers to the 1-based line number of the
    sentence in the text file, so each sentence keeps its original line number
    through the topic filter.

    Args:
        poi_id: id of the POI to process.

    Returns:
        ``(top_sip_ids, all_scores)`` where ``top_sip_ids`` lists the selected
        neighbour ids and ``all_scores`` maps each kept sentence to a dict
        with the keys ``ID`` (1-based position among the kept sentences),
        ``sentence``, ``REP``, ``UNQ``, ``DIV`` and ``ZT`` (topic).

    Raises:
        ValueError: if the POI's CSV file cannot be loaded or its text file
            yields no sentences.
    """
    global shurupoi1
    if shurupoi1 is None:
        shurupoi1 = pd.read_csv('shurupoi5.csv')

    similar_pois = get_similar_pois_within_shurupoi1(model, poi_id)

    # Print the ten most similar POIs
    for poi in similar_pois[:10]:
        print(f"Poi ID: {poi[0]}, Distance: {poi[1]:.2f}, Category Similarity: {poi[2]:.2f}, Name Similarity: {poi[3]:.2f}, Comment Similarity: {poi[4]:.2f}, Final Score: {poi[5]:.2f}")

    top_poi_ids = [poi[0] for poi in similar_pois[:10]]

    # De-duplicated aspect terms of every neighbour and of the target POI
    ws_tool = WordSimilarity2010()
    aspect_sets = [
        _load_distinct_aspects(f"ASTE5/{neighbor_id}.csv", ws_tool)
        for neighbor_id in top_poi_ids
    ]
    target_aspects = _load_distinct_aspects(f"ASTE5/{poi_id}.csv", ws_tool)

    # SIP of each neighbour: Jaccard overlap with the target's aspect terms
    sips = []
    for neighbor_id, aspect_set in zip(top_poi_ids, aspect_sets):
        union = len(aspect_set | target_aspects)
        sip = len(aspect_set & target_aspects) / union if union > 0 else 0
        sips.append((neighbor_id, sip))
    sips.sort(key=lambda x: x[1], reverse=True)

    # Keep the two neighbours with the highest SIP (fewer if fewer exist)
    top_sip_ids = []
    print("Top 2 POI IDs and their SIP scores:")
    for neighbor_id, sip in sips[:2]:
        top_sip_ids.append(neighbor_id)
        print(f"POI ID: {neighbor_id}, SIP score: {sip:.2f}")

    # Sentences/topics and aspect-opinion rows of the target POI
    sentences, topics = load_target_text_file(f"{poi_id}.txt")
    target_df = load_target_csv_file(f"{poi_id}.csv")

    if target_df is None:
        raise ValueError(f"Failed to load the CSV file for POI ID {poi_id}.")
    if not sentences:
        raise ValueError(f"No sentences were loaded for POI ID {poi_id}.")

    # Five most frequent topics
    topic_counts = pd.Series(topics).value_counts()
    print("Topic counts:", topic_counts)
    top_topics = topic_counts.head(5).index.tolist()
    print("Top 5 topics:", top_topics)
    top_topic_set = set(top_topics)

    # (line number, sentence, topic) of every sentence in a top topic. The line
    # number is taken BEFORE filtering because the CSV 'id' refers to it.
    numbered_sentences = [
        (row_num, sentence, topic)
        for row_num, (sentence, topic) in enumerate(zip(sentences, topics), start=1)
        if topic in top_topic_set
    ]

    def calculate_representativeness(numbered_sentences, df):
        """Return {sentence: REP}, normalised by the largest REP."""
        representativeness_scores = {}

        # Ignore rows without an aspect term
        df = df[~df['a'].isna() & (df['a'].str.strip() != '')]

        for position, (row_num, sentence, topic) in enumerate(numbered_sentences, start=1):
            sentence_clean = sentence.split('$$$')[0].strip()

            # Rows extracted from this sentence, and all rows of its topic
            related_rows = df[(df['id'] == row_num) & (df['ZT'] == topic)]
            same_topic_rows = df[df['ZT'] == topic]

            rep_score = 0
            for _, row in related_rows.iterrows():
                opinion_synonyms = find_synonyms(row['o'], same_topic_rows['o'].unique(), threshold_sv=0.7)
                aspect_synonyms = find_synonyms(row['a'], same_topic_rows['a'].unique(), threshold_sv=0.7)
                has_similar_aspect = same_topic_rows['a'].isin(aspect_synonyms)

                # Occurrences of the (synonymous) opinion with / without a
                # synonymous aspect, within the same topic
                count_with_aspect = 0
                count_without_aspect = 0
                for synonym in opinion_synonyms:
                    has_opinion = same_topic_rows['o'] == synonym
                    count_with_aspect += len(same_topic_rows[has_opinion & has_similar_aspect])
                    count_without_aspect += len(same_topic_rows[has_opinion & ~has_similar_aspect])

                # The 0.5 in the denominator keeps the ratio finite
                rep_score += count_with_aspect / (0.5 + count_without_aspect)

            representativeness_scores[sentence_clean] = rep_score
            print(f"Final REP Score for Sentence {position}: {representativeness_scores[sentence_clean]}")

        max_rep_score = max(representativeness_scores.values(), default=0)
        if max_rep_score == 0:
            # Every score is zero: nothing to normalise (avoids 0 / 0)
            return dict(representativeness_scores)
        return {sentence: score / max_rep_score for sentence, score in representativeness_scores.items()}

    def calculate_ss(numbered_sentences):
        """Return {sentence: share of same-topic sentences with adjusted similarity > 0.6}."""
        ss_scores = {}

        for position, (_, sentence, topic) in enumerate(numbered_sentences, start=1):
            sentence_clean = sentence.split('$$$')[0].strip()

            # Sentences of the same topic, the sentence itself included
            same_topic_sentences = [other for _, other, other_topic in numbered_sentences if other_topic == topic]

            similarity_counts = sum(
                1
                for other_sentence in same_topic_sentences
                if adjusted_similarity(sv, sentence, other_sentence) > 0.6
            )
            ss_value = similarity_counts / len(same_topic_sentences)

            print(f"Final SS Value for Sentence {position}: {ss_value}")
            ss_scores[sentence_clean] = ss_value

        return ss_scores

    def calculate_uniqueness(numbered_sentences, df, top_sip_ids):
        """Return {sentence: {'UNQ': normalised uniqueness, 'SS': ss value}}."""
        uniqueness_scores = {}
        ss_scores = calculate_ss(numbered_sentences)

        # Aspect/opinion rows of the selected neighbours, rows without an
        # aspect term removed
        combined_df = pd.concat(
            [load_target_csv_file(f"{top_id}.csv") for top_id in top_sip_ids],
            ignore_index=True,
        )
        combined_df = combined_df[~combined_df['a'].isna() & (combined_df['a'].str.strip() != '')]

        for position, (row_num, sentence, topic) in enumerate(numbered_sentences, start=1):
            sentence_clean = sentence.split('$$$')[0].strip()

            related_rows = df[(df['id'] == row_num) & (df['ZT'] == topic)]
            topic_rows = combined_df[combined_df['ZT'] == topic]

            SAO_sum = 0
            for _, row in related_rows.iterrows():
                opinion = row['o']

                # SS(a): neighbour rows of this topic with a synonymous aspect
                aspect_synonyms = find_synonymsU(row['a'], topic_rows['a'].unique(), threshold_sv=0.7)
                SS_a = len(topic_rows[topic_rows['a'].isin(aspect_synonyms)])

                # SS(o): among those, rows whose opinion is synonymous as well
                SS_o = 0
                for syn_aspect in aspect_synonyms:
                    aspect_rows = topic_rows[topic_rows['a'] == syn_aspect]
                    opinion_synonyms = find_synonymsU(opinion, aspect_rows['o'].unique(), threshold_sv=0.7)
                    SS_o += len(aspect_rows[aspect_rows['o'].isin(opinion_synonyms)])

                # SS(o) / SS(a), defined as 0 when SS(a) is 0
                SAO_sum += SS_o / SS_a if SS_a != 0 else 0

            uniq = ss_scores[sentence_clean] * (1 / (1 + np.log10(1 + SAO_sum)))
            print(f"Final Uniq Score for Sentence {position}: {uniq}")

            uniqueness_scores[sentence_clean] = {
                'UNQ': uniq,
                'SS': ss_scores[sentence_clean]
            }

        max_uniq_score = max((score['UNQ'] for score in uniqueness_scores.values()), default=0)
        if max_uniq_score == 0:
            # Every score is zero: nothing to normalise (avoids 0 / 0)
            return uniqueness_scores
        return {
            sentence: {'UNQ': score['UNQ'] / max_uniq_score, 'SS': score['SS']}
            for sentence, score in uniqueness_scores.items()
        }

    def calculate_diversity(numbered_sentences, df):
        """Return one DIV value per entry of ``numbered_sentences``, in the same order."""
        # Largest number of rows that share one sentence id
        max_count = df['id'].value_counts().max()

        diversity_scores = []
        for row_num, _, topic in numbered_sentences:
            related_rows = df[(df['id'] == row_num) & (df['ZT'] == topic)]
            diversity_scores.append(related_rows.shape[0] / max_count)
        return diversity_scores

    representativeness_scores = calculate_representativeness(numbered_sentences, target_df)

    uniqueness_scores = calculate_uniqueness(numbered_sentences, target_df, top_sip_ids)

    diversity_scores = calculate_diversity(numbered_sentences, target_df)

    # Collect the scores of every kept sentence, rounded to four decimals
    all_scores = {}

    for index, (_, sentence, topic) in enumerate(numbered_sentences):
        rep_score = round(representativeness_scores.get(sentence, 0), 4)
        unq_score = round(uniqueness_scores.get(sentence, {'UNQ': 0})['UNQ'], 4)
        div_score = round(diversity_scores[index], 4)

        all_scores[sentence] = {
            'ID': index + 1,
            'sentence': sentence,
            'REP': rep_score,
            'UNQ': unq_score,
            'DIV': div_score,
            'ZT': topic
        }

    return top_sip_ids, all_scores

# Entry point
def main():
    """Score POIs 1 to 20 and write one ``fenshu5/<poi_id>.csv`` file per POI.

    Each file has the columns ID, sentence, REP, UNQ, DIV and ZT and is sorted
    by topic, then by ID. The elapsed time per POI is printed.
    """
    # Range of POI ids to process (both ends included)
    start_poi_id = 1
    end_poi_id = 20

    # Make sure the output directory exists before the first file is written
    output_dir = 'fenshu5'
    os.makedirs(output_dir, exist_ok=True)

    for poi_id in range(start_poi_id, end_poi_id + 1):

        start_time = time.time()
        top_sip_ids, all_scores = process_poi(poi_id)

        # One row per sentence
        output_df = pd.DataFrame(all_scores).T
        output_df.columns = ['ID', 'sentence', 'REP', 'UNQ', 'DIV', 'ZT']

        output_df.sort_values(by=['ZT', 'ID'], inplace=True)

        output_filename = os.path.join(output_dir, f"{poi_id}.csv")
        output_df.to_csv(output_filename, index=False)
        print(f"已完成 {poi_id}.csv")

        end_time = time.time()
        execution_time = end_time - start_time
        print(f"{poi_id}.csv took {execution_time:.4f} seconds")

# Run the scoring pipeline when this code is executed as the main module
if __name__ == "__main__":
    main()

In [None]:
#输出摘要示例
import os
import pandas as pd
import numpy as np
from xmnlp.sv import SentenceVector

# Folders: sentence scores (input), aspect CSV files (input), summaries (output)
input_folder = 'fenshu5'
aste_folder = 'ASTE5'
output_folder = 'xin5'

# Create the output folder; exist_ok avoids the check-then-create race
os.makedirs(output_folder, exist_ok=True)

# Sentence similarity
def calculate_similarity(sentence1, sentence2, sv):
    """Return the cosine similarity of the ``sv.transform`` vectors of two sentences."""
    first_vec, second_vec = sv.transform(sentence1), sv.transform(sentence2)
    norm_product = np.linalg.norm(first_vec) * np.linalg.norm(second_vec)
    return np.dot(first_vec, second_vec) / norm_product

# MMV (score minus redundancy) of one candidate sentence
def calculate_mmv(sentence, sentences_set, sv):
    """Return ``(mmv, score_part, similarity_part)`` for a candidate row.

    Args:
        sentence: candidate row exposing ``'sentence'`` (text) and ``'score'``.
        sentences_set: already selected rows, each exposing ``'sentence'``.
        sv: sentence encoder handed on to ``calculate_similarity``.

    Returns:
        ``mmv = 0.35 * score - 0.65 * max_similarity`` together with its two
        terms, where ``max_similarity`` is the largest similarity between the
        candidate and any selected sentence (0 if there is none above 0).
    """
    # Compare the sentence TEXTS: the encoder works on strings, not on whole rows
    candidate_text = sentence['sentence']

    max_similarity = 0
    for chosen in sentences_set:
        sim = calculate_similarity(candidate_text, chosen['sentence'], sv)
        if sim > max_similarity:
            max_similarity = sim

    score_part = 0.35 * sentence['score']
    similarity_part = 0.65 * max_similarity
    return score_part - similarity_part, score_part, similarity_part

# The sentence encoder does not depend on the file being processed: build it once
sv = SentenceVector(genre='通用')


def _select_best_by_mmv(theme_df, selected_sentences, sv):
    """Return the row of ``theme_df`` with the highest MMV, or ``None`` if it has no rows.

    The returned row carries an extra ``'mmv'`` entry: its MMV shifted by the
    largest absolute MMV of the theme so that the stored value is non-negative
    (the shift does not change which row wins).
    """
    mmv_scores = []
    for _, row in theme_df.iterrows():
        mmv, score_part, similarity_part = calculate_mmv(row, selected_sentences, sv)
        print(f"MMV for sentence {row['sentence']}: {mmv}, score part: {score_part}, similarity part: {similarity_part}")
        mmv_scores.append((row, mmv))

    if not mmv_scores:
        return None

    max_abs_mmv = max(abs(mmv) for _, mmv in mmv_scores)
    best_row, best_mmv = max(mmv_scores, key=lambda x: x[1])
    best_row['mmv'] = best_mmv + max_abs_mmv
    return best_row


# Build a summary of up to five sentences for every score file
for file_number in range(1, 21):
    filename = f"{file_number}.csv"
    print(f"Processing {filename}")

    # Sentence scores
    fenshu_path = os.path.join(input_folder, filename)
    df_fenshu = pd.read_csv(fenshu_path)
    print(f"Read {df_fenshu.shape[0]} rows from {fenshu_path}")

    # Matching aspect file (only its row count is reported here)
    aste_path = os.path.join(aste_folder, f"{file_number}.csv")
    df_aste = pd.read_csv(aste_path)
    print(f"Read {df_aste.shape[0]} rows from {aste_path}")

    # Overall score of every sentence
    df_fenshu['score'] = 0.35 * df_fenshu['REP'] + 0.55 * df_fenshu['UNQ'] + 0.1 * df_fenshu['DIV']
    print("Calculated scores for all sentences.")

    # Number of sentences per theme, largest theme first
    theme_counts = df_fenshu['ZT'].value_counts().sort_values(ascending=False)
    print(f"Theme counts: {theme_counts}")

    # Sentences grouped by theme, best score first inside each theme
    sorted_df = df_fenshu.sort_values(by=['ZT', 'score'], ascending=[True, False])
    print("Sorted DataFrame by themes and scores.")

    if sorted_df.shape[0] < 5:
        # Fewer than five sentences: use all of them. They are kept as a list
        # of rows, the same shape the selection below produces, so the final
        # extraction works for both branches.
        print("Less than 5 sentences in the document, using all available sentences.")
        selected_sentences = [row for _, row in sorted_df.iterrows()]
    else:
        selected_sentences = []

        # First pass: one sentence per theme, largest theme first
        for theme in theme_counts.index:
            if len(selected_sentences) == 5:
                break
            print(f"Processing theme {theme}")
            theme_df = sorted_df[sorted_df['ZT'] == theme]

            if not selected_sentences:
                # Nothing to compare with yet: take the best-scoring sentence
                best_sentence = theme_df.iloc[0].copy()
                best_sentence['mmv'] = best_sentence['score']
            else:
                best_sentence = _select_best_by_mmv(theme_df, selected_sentences, sv)

            if best_sentence is not None:
                selected_sentences.append(best_sentence)
                print(f"Selected sentence: {best_sentence['sentence']}, score: {best_sentence['score']}")

        # Fewer than five themes: keep cycling over the themes until five
        # sentences are selected
        # NOTE(review): a sentence that is already selected can be picked
        # again here when its theme has no other candidate -- confirm whether
        # repeated sentences are acceptable in the summary.
        while len(selected_sentences) < 5:
            count_before_pass = len(selected_sentences)
            for theme in theme_counts.index:
                print(f"Processing theme {theme} again")
                theme_df = sorted_df[sorted_df['ZT'] == theme]

                best_row = _select_best_by_mmv(theme_df, selected_sentences, sv)
                if best_row is not None:
                    selected_sentences.append(best_row)
                    print(f"Selected sentence: {best_row['sentence']}, score: {best_row['score']}")

                if len(selected_sentences) == 5:
                    break

            if len(selected_sentences) == count_before_pass:
                # No theme produced a candidate: stop instead of looping forever
                break

    # Final summary: at most five sentences
    final_sentences = [row['sentence'] for row in selected_sentences[:5]]
    print(f"Final sentences: {final_sentences}")

    # Write the summary
    output_filename = os.path.join(output_folder, f"{file_number}.csv")
    pd.DataFrame({'sentence': final_sentences}).to_csv(output_filename, index=False)
    print(f"Output saved to {output_filename}")

In [None]:
#实验示例
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math

def read_data(txt_folder, csv_folder, id):
    """Load the sentences and aspect annotations of one numbered file.

    Reads ``<txt_folder>/<id>.txt`` (UTF-8) line by line, keeping the text
    before the first ``$$$`` separator of each line as the sentence, then
    reads ``<csv_folder>/<id>.csv`` for the aspect rows.

    Args:
        txt_folder: Directory containing the ``<id>.txt`` sentence files.
        csv_folder: Directory containing the ``<id>.csv`` annotation files.
        id: File stem shared by the text file and the CSV file.

    Returns:
        A tuple ``(sentences, sentence_to_row, aspects_dict)``:

        * ``sentences`` -- stripped sentences in file order.
        * ``sentence_to_row`` -- sentence -> 1-based line number; when a
          sentence occurs more than once the last line number wins.
        * ``aspects_dict`` -- CSV ``id`` value -> ``{'a', 'ZT', 'f'}`` dict;
          when an ``id`` occurs more than once the last row wins.
    """
    source_txt = os.path.join(txt_folder, f'{id}.txt')
    source_csv = os.path.join(csv_folder, f'{id}.csv')

    with open(source_txt, 'r', encoding='utf-8') as handle:
        sentences = [raw.strip().split('$$$')[0].strip() for raw in handle]

    # Line numbers start at 1 so they can be compared with the CSV ids.
    sentence_to_row = {
        text: number for number, text in enumerate(sentences, 1)
    }

    aspects_dict = {
        record["id"]: {'a': record['a'], 'ZT': record['ZT'], 'f': record['f']}
        for _, record in pd.read_csv(source_csv).iterrows()
    }

    return sentences, sentence_to_row, aspects_dict

def get_aspects_for_summary_sentences(chulipoi_data, aste_data, summary_sentences):
    """Collect the aspect annotations attached to each summary sentence.

    Args:
        chulipoi_data: Mapping from sentence text to its row number.
        aste_data: DataFrame with ``id``, ``a``, ``ZT`` and ``f`` columns.
            A row belongs to a sentence when ``str(id)`` equals the string
            form of that sentence's row number.
        summary_sentences: Iterable of summary sentence strings.

    Returns:
        A flat list of ``{'a': ..., 'ZT': ..., 'f': ...}`` dicts, ordered by
        summary sentence first and by ``aste_data`` row order second.
        Sentences without any matching row contribute nothing.

    Raises:
        KeyError: If a summary sentence is not a key of ``chulipoi_data``.
    """
    # Index the annotation rows once by their stringified id instead of
    # rescanning the whole frame for every summary sentence.
    aspects_by_row = {}
    for _, row_data in aste_data.iterrows():
        aspects_by_row.setdefault(str(row_data['id']), []).append(
            {'a': row_data['a'], 'ZT': row_data['ZT'], 'f': row_data['f']}
        )

    summary_aspects = []
    for sentence in summary_sentences:
        row = chulipoi_data[sentence]  # row number of this sentence
        # Copy each dict so a repeated sentence never yields aliased entries.
        summary_aspects.extend(
            dict(aspect) for aspect in aspects_by_row.get(str(row), ())
        )

    return summary_aspects

def calculate_coverage(text, summary_text):
    """Return the TF-IDF cosine similarity between a summary and its source.

    The vectorizer is fitted on ``text`` alone (as a single document) and the
    summary is then projected with that fitted vectorizer.

    Args:
        text: The full source text as one string.
        summary_text: The summary as one string.

    Returns:
        The largest entry of the first row of the summary-vs-source cosine
        similarity matrix.
    """
    tfidf = TfidfVectorizer()
    source_vector = tfidf.fit_transform([text])
    summary_vector = tfidf.transform([summary_text])

    first_row = cosine_similarity(summary_vector, source_vector)[0]
    return max(first_row)

def calculate_diversity(summary_aspects):
    """Score how varied the aspect terms of a summary are, theme by theme.

    Aspect terms are counted per theme (``ZT``); entries whose aspect ``a`` is
    missing (``pd.isna``) are ignored. For each theme with counts ``c_i`` and
    total ``T = sum(c_i)`` the contribution is::

        0.2 * -sum((c_i + 1) / T * log((c_i + 0.1) / T))

    and the result is the sum of the contributions of all themes.

    NOTE(review): this is an entropy-like score with additive offsets
    (+1 / +0.1), not the plain Shannon entropy ``-sum(p * log p)``; it can be
    negative (e.g. a theme with a single aspect term). The formula is kept
    as-is -- confirm with the author whether the offsets are intended.

    Args:
        summary_aspects: Iterable of dicts providing at least the keys
            ``'a'`` (aspect term) and ``'ZT'`` (theme).

    Returns:
        The accumulated diversity score as a float; ``0.0`` when there are no
        usable aspects.
    """
    # theme -> {aspect term -> occurrence count}
    aspect_counts_by_ZT = {}
    for aspect in summary_aspects:
        ZT = aspect['ZT']
        a = aspect['a']
        # Skip entries without an aspect term.
        if pd.isna(a):
            continue
        theme_counts = aspect_counts_by_ZT.setdefault(ZT, {})
        theme_counts[a] = theme_counts.get(a, 0) + 1

    total_diversity = 0.0
    for theme_counts in aspect_counts_by_ZT.values():
        # Always >= 1: a theme is only created when a term is counted.
        total_aspects = sum(theme_counts.values())
        diversity = -sum([
            (count + 1) / total_aspects * math.log((count + 0.1) / total_aspects)
            for count in theme_counts.values()
        ])
        total_diversity += diversity * 0.2
    return total_diversity

# Main program: average the coverage and diversity scores over files 1..20.
txt_folder = 'chulipoi5'
csv_folder = 'ASTE5'
summary_folder = 'zhaiyao5'

# Running totals used to compute the averages at the end.
total_coverage = 0.0
total_diversity = 0.0
num_files = 0

for i in range(1, 21):
    # Load the source sentences and the sentence -> line-number mapping.
    sentences, sentence_to_row, aspects_dict = read_data(txt_folder, csv_folder, i)

    # Join all source sentences into one text.
    text = " ".join(sentences)

    # Load the summary sentences produced for this file.
    summary_csv_path = os.path.join(summary_folder, f'{i}.csv')
    summary_csv_data = pd.read_csv(summary_csv_path)
    summary_sentences = summary_csv_data['sentence'].tolist()

    # Join all summary sentences into one text.
    summary_text = " ".join(summary_sentences)

    # Information coverage.
    coverage = calculate_coverage(text, summary_text)
    total_coverage += coverage

    # Look up the aspect annotations of the summary sentences. Previously
    # `summary_aspects` was never assigned here, so the diversity call below
    # raised NameError (or reused a stale value left in the notebook session).
    # NOTE(review): this raises KeyError if a summary sentence is not present
    # verbatim in the source text file -- confirm the summaries are extractive.
    aste_data = pd.read_csv(os.path.join(csv_folder, f'{i}.csv'))
    summary_aspects = get_aspects_for_summary_sentences(
        sentence_to_row, aste_data, summary_sentences
    )

    # Information diversity.
    diversity = calculate_diversity(summary_aspects)
    total_diversity += diversity

    num_files += 1

# Averages over all processed files.
average_coverage = total_coverage / num_files
average_diversity = total_diversity / num_files

print(f'所有文件的平均信息覆盖率: {average_coverage}')
print(f'所有文件的平均信息多样性: {average_diversity}')