In [1]:
import polars as pl
import pickle

#### 1. 读取doc_info，只保留 article_id 和 keywords 两列

In [2]:
# Directory holding the raw input files; doc_info.arrow is read from here.
data_path = "/data3/zxh/news_rec/raw_data"

In [3]:
# Only article_id and keywords are used below, so project them while reading
# instead of loading every column of the file first. The trailing select()
# pins the column order to exactly [article_id, keywords].
doc_info_columns = ["article_id", "keywords"]
doc_info = pl.read_ipc(
    f"{data_path}/doc_info.arrow", columns=doc_info_columns
).select(doc_info_columns)
doc_info

article_id,keywords
i64,str
349635709,"""上班族:8.469502,买车:8.137443,二手车:9…"
361653323,"""医生:14.760494,吸烟:16.474872,板蓝根:…"
426732705,"""155n:8.979802,polo:7.951116,中控…"
430221183,"""etc:12.055207,代表:8.878175,内饰:5…"
441756326,"""丰田凯美瑞:12.772149,充电器:8.394001,品…"
…,…
467277215,"""人因:6.528161,健康:6.471880,儿童:7.5…"
467277413,"""体力:5.064578,吕钦:14.577604,唐丹:24…"
467277503,"""伊朗:7.489791,刷釉:18.557439,北宋:8.…"
467278115,"""cj:16.306715,三巨头:9.342195,争冠:8…"


#### 2. 查看相关信息

In [4]:
# Per-column summary statistics; the null_count row shows how many keywords are missing.
doc_info.describe()

statistic,article_id,keywords
str,f64,str
"""count""",633388.0,"""622407"""
"""null_count""",0.0,"""10981"""
"""mean""",464620000.0,
"""std""",1576900.0,
"""min""",325279629.0,"""#^^0000ff:10.539391,0000ff:10.…"
"""25%""",463448490.0,
"""50%""",464618782.0,
"""75%""",465814182.0,
"""max""",467278131.0,"""龙泉:20.366828"""


#### 3. 过滤掉 keywords 为 null 或空字符串的行

In [5]:
# Keep only the rows whose keywords value is present and not the empty string.
keywords_col = pl.col("keywords")
doc_info = doc_info.filter(
    keywords_col.is_not_null() & (keywords_col != "")
)
doc_info

article_id,keywords
i64,str
349635709,"""上班族:8.469502,买车:8.137443,二手车:9…"
361653323,"""医生:14.760494,吸烟:16.474872,板蓝根:…"
426732705,"""155n:8.979802,polo:7.951116,中控…"
430221183,"""etc:12.055207,代表:8.878175,内饰:5…"
441756326,"""丰田凯美瑞:12.772149,充电器:8.394001,品…"
…,…
467277215,"""人因:6.528161,健康:6.471880,儿童:7.5…"
467277413,"""体力:5.064578,吕钦:14.577604,唐丹:24…"
467277503,"""伊朗:7.489791,刷釉:18.557439,北宋:8.…"
467278115,"""cj:16.306715,三巨头:9.342195,争冠:8…"


#### 4. 将文章ID和关键字的DataFrame转换为归一化字典

In [9]:
def build_normalized_dict(doc_info_df):
    """
    Convert a DataFrame of article IDs and keyword strings into normalized keyword weights.

    Each ``keywords`` cell is parsed as ``"kw1:score1,kw2:score2,..."`` (surrounding
    double quotes are stripped). Every score is divided by the sum of all scores
    parsed from the same cell, so the weights of one article add up to 1 as long
    as its keywords are unique.

    Parameters:
        doc_info_df : pd.DataFrame with the two columns [article_id, keywords]

    Returns:
        dict, shaped {article_id: {keyword: normalized_score, ...}, ...};
              an article maps to {} when its cell is missing, has no parsable
              entry, or its scores do not sum to a positive number
        set, every keyword that was parsed from any row
        int, the largest number of parsed keyword entries found in a single row
    """
    result = {}
    all_keywords = set()    # every keyword seen across all rows
    max_keywords_count = 0  # largest number of parsed entries in one row

    # Walk the two columns in lockstep. iterrows() would build a full Series
    # object per row, which is needlessly slow on large frames.
    for article_id, raw_keywords in zip(doc_info_df['article_id'], doc_info_df['keywords']):
        # A missing cell (None / NaN) has nothing to parse; treat it as empty
        # instead of failing on .strip().
        keywords_str = raw_keywords.strip('"') if isinstance(raw_keywords, str) else ''

        pairs = []
        total = 0.0
        for pair in keywords_str.split(','):
            if ':' not in pair:
                continue

            # Split on the LAST colon: the score is the trailing field, while
            # the keyword itself may contain ':' characters.
            key, value = pair.rsplit(':', 1)
            try:
                num = float(value)
            except ValueError:
                # Entry without a numeric score: skip it.
                continue

            key = key.strip()
            pairs.append((key, num))
            total += num
            all_keywords.add(key)

        max_keywords_count = max(max_keywords_count, len(pairs))

        # Normalize only when the sum is positive (avoids division by zero).
        # NOTE: if a keyword is repeated within one row, every occurrence
        # counts towards the total but only the last one is kept in the dict.
        result[article_id] = {k: v / total for k, v in pairs} if total > 0 else {}

    return result, all_keywords, max_keywords_count

In [10]:
# Build the normalized dict (the helper iterates a pandas DataFrame, hence to_pandas())
normalized_dict, all_keywords_set, maxlen = build_normalized_dict(doc_info.to_pandas())

In [9]:
# Convert to a polars DataFrame
def convert_to_polars(doc_keyword_dict):
    """
    Turn {article_id: {keyword: weight}} into a two-column polars DataFrame.

    Each article's weights are joined into one string of the form
    "keyword:weight,keyword:weight,...", with weights printed to six decimals.
    The resulting columns are named article_id and keywords.
    """
    def _serialize(weights):
        # One "keyword:weight" entry per item, comma-separated.
        return ",".join(f"{name}:{weight:.6f}" for name, weight in weights.items())

    rows = [
        (article, _serialize(weights))
        for article, weights in doc_keyword_dict.items()
    ]
    return pl.DataFrame(rows, schema=["article_id", "keywords"], orient="row")
# Serialize the normalized weights back into a polars DataFrame and display it.
doc_keywords = convert_to_polars(normalized_dict)
doc_keywords

article_id,keywords
i64,str
349635709,"""上班族:0.052498,买车:0.050440,二手车:0…"
361653323,"""医生:0.133734,吸烟:0.149266,板蓝根:0.…"
426732705,"""155n:0.033340,polo:0.029521,中控…"
430221183,"""etc:0.038040,代表:0.028015,内饰:0.…"
441756326,"""丰田凯美瑞:0.089051,充电器:0.058525,品牌…"
…,…
467277215,"""人因:0.021596,健康:0.021410,儿童:0.0…"
467277413,"""体力:0.017298,吕钦:0.049790,唐丹:0.0…"
467277503,"""伊朗:0.021034,刷釉:0.052117,北宋:0.0…"
467278115,"""cj:0.048329,三巨头:0.027688,争冠:0.…"


In [10]:
# Save the normalized keyword features as an Arrow IPC file
public_path = "/data3/zxh/news_rec/public_data" 
doc_keywords.write_ipc(f"{public_path}/doc_keywords_feature.ipc")