In [1]:
import time
import random
import pandas as pd
from content_cleaning import *
from content_scoring import *

2025-04-14 15:15:25,497 - datasets - INFO - PyTorch version 2.3.1+cu118 available.


In [5]:
# Initialise the mixed-language text processor.
processor = XHSMixedLanguageProcessor(cache_size=2000, max_workers=12)
# Load the cooked contents data.
# Forward slashes are used on purpose: the previous '..\..\Data\...' literal
# relied on invalid backslash escapes and only resolved on Windows, whereas
# this form works on every platform (Windows included).
cont = pd.read_json('../../Data/processed/contents_cooked.json')

In [None]:
# combine the title and note_body into a single string
def process_text(note):
    """Combine a note's title and body into a single string.

    Args:
        note: A mapping-like record (for example a DataFrame row) that
            provides 'title' and 'note_body' entries.

    Returns:
        str: The title and the body joined by a single space. A missing
        field (None, NaN or pd.NA) contributes an empty string instead of
        raising TypeError; any other non-string value is converted with
        str().
    """
    def _as_text(value):
        # Strings pass through unchanged so the result for complete records
        # is exactly title + ' ' + note_body.
        if isinstance(value, str):
            return value
        # `value != value` is only true for NaN (np.float64 subclasses float).
        if value is None or value is pd.NA or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    return _as_text(note['title']) + ' ' + _as_text(note['note_body'])

# Build the combined title + body text for every note.
cont['text'] = cont.apply(process_text, axis=1).astype(str)

# Run the batch processor over the combined texts and time it.
# perf_counter is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments during a long run.
start_time = time.perf_counter()
processed_texts = processor.batch_process(cont['text'].tolist(), enable_translation=True)
end_time = time.perf_counter()

print(f"批量处理耗时: {end_time - start_time:.2f}秒")

# Store the processed texts in a new column of the DataFrame.
cont['semantic_proc_text'] = processed_texts
# Remove the temporary combined-text column (reassignment instead of
# inplace=True, which has no performance benefit and hinders chaining).
cont = cont.drop(columns=['text'])
# Save the processed DataFrame as JSON Lines. Forward slashes replace the
# former '..\..\Data\...' literal, which relied on invalid backslash escapes
# and only resolved on Windows.
cont.to_json('../../Data/processed/contents_cooked_semantic.json', orient='records', lines=True, force_ascii=False)

In [2]:
# Load the semantically processed data (JSON Lines).
# Forward slashes replace the former '..\..\Data\...' literal, which relied on
# invalid backslash escapes and only resolved on Windows; this also matches
# the style of the paths below.
cont_semantic = pd.read_json('../../Data/processed/contents_cooked_semantic.json', lines=True)
# Parameters
output_path = '../../Data/processed/dim3_scores.json'
plots_dir = '../../Figs/dim3_plots'
n_workers = 8  # number of parallel workers passed to the scoring functions
max_notes = 1000  # maximum number of notes processed per user

In [3]:
# Component score columns that are summed into the content-quality total.
component_cols = ['score3a_originality', 'score3b_vertical',
                  'score3c_sentiment', 'score3d_keyword']
total_col = 'score3_content_quality'

# 1. Load models
print("加载模型...")
models = load_models()

# 2. Preprocess data
print("预处理数据...")
df, user_notes, dim3_score = preprocess_data(cont_semantic)

# 3. Originality score
print("计算创作者原创性评分...")
dim3_score = calculate_creator_originality(df, dim3_score, models, n_workers=n_workers)

# 4. Vertical-domain score
print("计算垂直领域评分...")
dim3_score = calculate_vertical_score(df, dim3_score, VERTICAL_DOMAIN_TAGS, n_workers=n_workers)

# 5. Sentiment score
print("计算情感评分...")
dim3_score = calculate_sentiment_score(df, dim3_score, models, max_notes_per_user=max_notes, n_workers=n_workers)

# 6. Keyword-coverage score
print("计算关键词覆盖评分...")
dim3_score = calculate_keyword_score(df, dim3_score, CORE_KEYWORDS, n_workers=n_workers)

# 7. Total content-quality score
print("计算内容质量总分...")
# Sanity check before summing: a component that is identical (or missing) for
# every user carries no information and usually means its scoring step failed
# without raising (errors may only appear in the content_scoring log). Warn
# instead of aborting so the remaining scores are still produced.
for col in component_cols:
    col_values = dim3_score[col]
    if len(col_values) > 1 and (
        col_values.isna().all() or (col_values.max() - col_values.min()) < 1e-9
    ):
        print(f"警告: {col} 对所有用户取值相同或缺失，该评分步骤可能失败，请检查日志")
dim3_score[total_col] = dim3_score[component_cols].sum(axis=1)

# 8. Plot score distributions
print("生成分数分布图...")
plot_score_distributions(dim3_score, plots_dir)

# 9. Save results
print(f"保存结果到 {output_path}...")
dim3_score.to_json(output_path, orient='records', lines=True)

# 10. Show summary statistics for every score column
print("分数统计信息:")
for col in component_cols + [total_col]:
    print(f"\n{col}:")
    print(dim3_score[col].describe())

# 11. Show the highest- and lowest-scoring users
print("\n得分最高的10个用户:")
print(dim3_score.sort_values(total_col, ascending=False).head(10)[['user_id', total_col]])

print("\n得分最低的10个用户:")
print(dim3_score.sort_values(total_col).head(10)[['user_id', total_col]])

# 12. Merge the total score back onto the note-level data
merged_data = pd.merge(cont_semantic,
                       dim3_score[['user_id', total_col]],
                       on='user_id',
                       how='left')

# Inspect the merged data
print(f"\n合并后的数据形状: {merged_data.shape}")

加载模型...


2025-04-14 15:15:27,509 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: paraphrase-multilingual-MiniLM-L12-v2
2025-04-14 15:15:30,740 - content_scoring - INFO -   - 计算创作者原创性...
2025-04-14 15:15:30,741 - content_scoring - INFO -   - 使用 6 个CPU进程进行计算
2025-04-14 15:15:30,763 - content_scoring - INFO -   - 计算创作者内部多样性...


预处理数据...
计算创作者原创性评分...


内部多样性计算: 100%|██████████| 102/102 [00:00<00:00, 101958.77it/s]
2025-04-14 15:58:37,715 - content_scoring - INFO -   - 计算创作者之间的原创性差异...
2025-04-14 15:58:38,410 - content_scoring - INFO -   - 对代表性文本进行语义编码...
编码代表性文本: 100%|██████████| 83/83 [21:52<00:00, 15.82s/it] 
2025-04-14 16:20:36,945 - content_scoring - INFO -   - 组合评分并应用分布校正...
  dim3_score.loc[idx, 'score3a_originality'] = score
2025-04-14 16:20:37,030 - content_scoring - INFO -   - 计算垂直领域分数... (目标标签: {'小吃', '火锅', '外卖', '奶茶', '探店', '中餐', '甜品', 'food', '美食'})
2025-04-14 16:20:37,031 - content_scoring - INFO -   - 使用 6 个CPU进程进行计算


计算垂直领域评分...


处理标签: 100%|██████████| 80/80 [00:00<00:00, 184061.61it/s]
2025-04-14 16:20:37,501 - content_scoring - INFO -   - Sigmoid转换后分数: 均值=11.83, 标准差=6.45
2025-04-14 16:20:37,501 - content_scoring - INFO -   - 最终垂直领域分数: 均值=11.64, 标准差=6.75
2025-04-14 16:20:37,502 - content_scoring - INFO -   - 分数分布:
2025-04-14 16:20:37,502 - content_scoring - INFO -     0-5: 20 人
2025-04-14 16:20:37,503 - content_scoring - INFO -     5-10: 31 人
2025-04-14 16:20:37,504 - content_scoring - INFO -     10-15: 15 人
2025-04-14 16:20:37,505 - content_scoring - INFO -     15-20: 23 人
2025-04-14 16:20:37,505 - content_scoring - INFO -     20-25: 17 人
  dim3_score.loc[idx, 'score3b_vertical'] = score
2025-04-14 16:20:37,591 - content_scoring - INFO -   - 计算情感强度得分...
2025-04-14 16:20:37,592 - content_scoring - INFO -   - 每个用户最多处理 1000 篇笔记...
2025-04-14 16:20:37,592 - content_scoring - INFO -   - 使用 6 个CPU进程进行计算
2025-04-14 16:20:37,593 - content_scoring - INFO -   - 为每位创作者选择笔记...
2025-04-14 16:20:37,638 - content_scoring - 

计算情感评分...


2025-04-14 16:20:37,857 - content_scoring - INFO -   - 情感分析使用 4 个CPU工作进程处理
处理情感批次:   0%|          | 0/1987 [00:00<?, ?it/s]2025-04-14 16:20:37,909 - content_scoring - ERROR - 处理情感批次出错: name 'models' is not defined
2025-04-14 16:20:37,910 - content_scoring - ERROR - 处理情感批次出错: name 'models' is not defined
2025-04-14 16:20:37,911 - content_scoring - ERROR - 处理情感批次出错: name 'models' is not defined
2025-04-14 16:20:37,912 - content_scoring - ERROR - 处理情感批次出错: name 'models' is not defined
2025-04-14 16:20:37,913 - content_scoring - ERROR - 处理情感批次出错: name 'models' is not defined
2025-04-14 16:20:37,914 - content_scoring - ERROR - 处理情感批次出错: name 'models' is not defined
2025-04-14 16:20:37,914 - content_scoring - ERROR - 处理情感批次出错: name 'models' is not defined
2025-04-14 16:20:37,915 - content_scoring - ERROR - 处理情感批次出错: name 'models' is not defined
2025-04-14 16:20:37,915 - content_scoring - ERROR - 处理情感批次出错: name 'models' is not defined
2025-04-14 16:20:37,916 - content_scoring - ERROR - 处理情感批次

计算关键词覆盖评分...


关键词匹配: 100%|██████████| 32/32 [00:00<00:00, 32063.48it/s]
2025-04-14 16:20:41,113 - content_scoring - INFO -   - 应用时间衰减系数...
2025-04-14 16:20:41,117 - content_scoring - INFO -   - 聚合用户级关键词得分...
2025-04-14 16:20:41,161 - content_scoring - INFO -   - Sigmoid转换后分数: 均值=7.69, 标准差=3.78
2025-04-14 16:20:41,162 - content_scoring - INFO -   - 最终关键词分数: 均值=7.02, 标准差=4.53
2025-04-14 16:20:41,163 - content_scoring - INFO -   - 分数分布:
2025-04-14 16:20:41,164 - content_scoring - INFO -     0-5: 30 人
2025-04-14 16:20:41,164 - content_scoring - INFO -     5-10: 55 人
2025-04-14 16:20:41,165 - content_scoring - INFO -     10-15: 14 人
2025-04-14 16:20:41,166 - content_scoring - INFO -     15-20: 7 人
2025-04-14 16:20:41,166 - content_scoring - INFO -     20-25: 0 人
  dim3_score.loc[idx, 'score3d_keyword'] = score


计算内容质量总分...
生成分数分布图...


2025-04-14 16:20:42,654 - content_scoring - INFO - 分数分布图保存到 ../../Figs/dim3_plots


保存结果到 ../../Data/processed/dim3_scores.json...
分数统计信息:

score3a_originality:
count    106.000000
mean      17.132871
std        4.578355
min        5.284921
25%       13.832813
50%       17.710605
75%       20.373373
max       24.000000
Name: score3a_originality, dtype: float64

score3b_vertical:
count    106.000000
mean      11.635029
std        6.786750
min        0.000000
25%        5.835521
50%       10.877450
75%       18.394019
max       23.760039
Name: score3b_vertical, dtype: float64

score3c_sentiment:
count    1.060000e+02
mean     2.980073e+00
std      4.461989e-16
min      2.980073e+00
25%      2.980073e+00
50%      2.980073e+00
75%      2.980073e+00
max      2.980073e+00
Name: score3c_sentiment, dtype: float64

score3d_keyword:
count    106.000000
mean       7.018374
std        4.548398
min        0.000000
25%        4.745508
50%        6.280316
75%        8.641144
max       19.135805
Name: score3d_keyword, dtype: float64

score3_content_quality:
count    106.000000
mean  