### 处理物料离散特征

#### 1. 读取数据

In [28]:
import polars as pl
import pandas as pd
# Directory that holds the raw input files.
data_path = "/data3/zxh/news_rec/raw_data"

# Read the article table and drop the free-text columns, which this notebook does not use.
doc_info = (
    pl.read_ipc(f"{data_path}/doc_info.arrow")
    .drop(["title", "keywords"])
)
doc_info

article_id,publish_time,image_count,category_level1,category_level2
i64,i64,i64,str,str
349635709,1572519971000,9,"""汽车""","""汽车/用车"""
361653323,1624522285000,1,"""健康""","""健康/疾病防护治疗及西医用药"""
426732705,1610808303000,9,"""汽车""","""汽车/买车"""
430221183,1612581556000,2,"""汽车""","""汽车/买车"""
441756326,1618825835000,23,"""汽车""","""汽车/买车"""
…,…,…,…,…
467277215,1625667021000,8,"""生活""","""生活/家居"""
467277413,1625668717000,2,"""体育""","""体育/棋牌"""
467277503,1625663360000,7,"""文化艺术""","""文化艺术/文玩收藏"""
467278115,1625672111000,10,"""体育""","""体育/NBA"""


In [29]:
# Per-column summary statistics (including null counts) of the loaded frame.
doc_info.describe()

statistic,article_id,publish_time,image_count,category_level1,category_level2
str,f64,f64,f64,str,str
"""count""",633388.0,633146.0,633146.0,"""633101""","""633100"""
"""null_count""",0.0,242.0,242.0,"""287""","""288"""
"""mean""",464620000.0,1625000000000.0,5.907097,,
"""std""",1576900.0,448080000.0,7.068902,,
"""min""",325279629.0,1563400000000.0,0.0,"""两性""","""两性/两性健康"""
"""25%""",463448490.0,1624700000000.0,2.0,,
"""50%""",464618782.0,1625000000000.0,4.0,,
"""75%""",465814182.0,1625300000000.0,8.0,,
"""max""",467278131.0,1625700000000.0,194.0,"""颜值才艺""","""颜值才艺/男神"""


#### 2. 对图片数量进行分桶

In [30]:
def bucketize_image_count(doc_info: pl.DataFrame) -> pl.DataFrame:
    """
    Replace `image_count` with a quantile-bucket label (no one-hot encoding).

    Bucket edges are the distinct 0% / 25% / 50% / 75% quantiles of the
    non-null counts, followed by `max + 1` so the last interval is closed on
    the data. A count that falls in the i-th half-open interval
    `[edge_i, edge_{i+1})` is labelled `str(i)`, with numbering starting at
    "1". Missing counts are labelled "".

    Args:
        doc_info: Article frame; must contain an `image_count` column.

    Returns:
        A new frame whose `image_count` column holds the string bucket label.
        Because of the frame-wide `fill_null("")` at the end, nulls in the
        other string columns are replaced with "" as well.
    """
    counts = pl.col("image_count")

    # Quantiles are computed over the non-null counts only, using pandas'
    # default (linear) interpolation.
    observed = doc_info.filter(counts.is_not_null())["image_count"].to_pandas()

    # First branch: missing counts get the "" label. Range branches are
    # chained on below, so everything stays a native polars expression
    # instead of a per-row Python callback.
    label = pl.when(counts.is_null()).then(pl.lit(""))

    # With no observed values there are no meaningful edges; every row is
    # then covered by the null branch / fallback.
    if len(observed) > 0:
        lower_edges = sorted(set(observed.quantile([0, 0.25, 0.5, 0.75]).tolist()))
        # max + 1 as the final upper edge so the maximum itself lands in the last bucket.
        edges = lower_edges + [float(observed.max()) + 1]
        for bucket_no, (low, high) in enumerate(zip(edges[:-1], edges[1:]), start=1):
            in_range = (counts >= low) & (counts < high)
            label = label.when(in_range).then(pl.lit(str(bucket_no)))

    # Anything that matched no interval falls back to "", same as a missing value.
    # The trailing fill_null("") is frame-wide on purpose: it keeps the other
    # string columns free of nulls, matching the previous output of this step.
    return doc_info.with_columns(
        label.otherwise(pl.lit("")).alias("image_count")
    ).fill_null("")

In [31]:
# Bucket image_count by calling the function defined above.
# NOTE: this rebinds doc_info, so re-running the cell without reloading the data
# would pass the already-bucketed string column back into the function.
doc_info = bucketize_image_count(doc_info)

#### 3. 检查结果并保存数据

In [32]:
# Summary statistics after bucketing, to check the dtype and null count of image_count.
doc_info.describe()

statistic,article_id,publish_time,image_count,category_level1,category_level2
str,f64,f64,str,str,str
"""count""",633388.0,633146.0,"""633388""","""633388""","""633388"""
"""null_count""",0.0,242.0,"""0""","""0""","""0"""
"""mean""",464620000.0,1625000000000.0,,,
"""std""",1576900.0,448080000.0,,,
"""min""",325279629.0,1563400000000.0,"""""","""""",""""""
"""25%""",463448490.0,1624700000000.0,,,
"""50%""",464618782.0,1625000000000.0,,,
"""75%""",465814182.0,1625300000000.0,,,
"""max""",467278131.0,1625700000000.0,"""4""","""颜值才艺""","""颜值才艺/男神"""


In [33]:
# Display the processed frame for a visual spot check.
doc_info

article_id,publish_time,image_count,category_level1,category_level2
i64,i64,str,str,str
349635709,1572519971000,"""4""","""汽车""","""汽车/用车"""
361653323,1624522285000,"""1""","""健康""","""健康/疾病防护治疗及西医用药"""
426732705,1610808303000,"""4""","""汽车""","""汽车/买车"""
430221183,1612581556000,"""2""","""汽车""","""汽车/买车"""
441756326,1618825835000,"""4""","""汽车""","""汽车/买车"""
…,…,…,…,…
467277215,1625667021000,"""4""","""生活""","""生活/家居"""
467277413,1625668717000,"""2""","""体育""","""体育/棋牌"""
467277503,1625663360000,"""3""","""文化艺术""","""文化艺术/文玩收藏"""
467278115,1625672111000,"""4""","""体育""","""体育/NBA"""


In [34]:
# Write the processed frame out as an Arrow IPC file.
public_path = "/data3/zxh/news_rec/public_data"
doc_info.write_ipc(
    f"{public_path}/doc_sparse_feature.ipc"
)