In [17]:
import polars as pl
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from typing import List
import pandas as pd

### 1. 读取数据

In [18]:
# Directory layout used throughout this notebook.
# raw_path     : source tables (user_info / doc_info)
# offline_path : offline train / validation splits and offline feature outputs
# online_path  : online test split and online feature outputs
raw_path = "/data3/zxh/news_rec/raw_data"
offline_path = "/data3/zxh/news_rec/offline_data"
online_path = "/data3/zxh/news_rec/online_data"

In [19]:
# Article and user attribute tables (joined onto the interaction logs later).
doc_info = pl.read_ipc(f"{raw_path}/doc_info.arrow")
user_info = pl.read_ipc(f"{raw_path}/user_info.arrow")

# User interaction logs: offline train / validation splits and the online test split.
test_data_online = pl.read_ipc(f"{online_path}/test_data_online.ipc")
val_data_offline = pl.read_ipc(f"{offline_path}/val_data_offline.ipc")
train_data_offline = pl.read_ipc(f"{offline_path}/train_data_offline.ipc")

### 2. 合并数据

In [20]:
# Offline training computes the statistical features from train_data only,
# whereas the online test run needs train_data + val_data.
offline = False

if offline:
    train_data = train_data_offline
    val_data = val_data_offline
else:
    # Online mode: stack the validation rows under the training rows.
    train_data = pl.concat([train_data_offline, val_data_offline], how="vertical")
test_data = test_data_online

# 1. Attach user attributes, then article attributes, to every interaction row.
# 2. Drop the columns treated as irrelevant for the statistics computed below.
merged_df = (
    train_data
    .join(user_info, on="user_id", how="left")
    .join(doc_info, on="article_id", how="left")
    .drop(
        ["expose_time", "network_env", "duration", "refresh_count", "refresh_count_transformed_box", 
        "expose_pos","device_name","os","province","city","age","gender","title","publish_time","image_count","keywords"]
    )
)


### 3. 计算基于 docid（文章）的点击率

In [21]:
def min_max_scale(df: pl.DataFrame, columns: List[str], scaler=None) -> pl.DataFrame:
    """
    Min-max scale the given columns of a DataFrame.

    Parameters:
    - df: input frame containing every column listed in `columns`
    - columns: names of the columns to scale
    - scaler: optional scaler exposing `transform`; when None, a new
      MinMaxScaler is fitted on `df[columns]` and used instead

    Returns:
    - a DataFrame in which each listed column is replaced (same name) by its
      scaled values; the scaler itself is not returned
    """
    feature_matrix = df.select(columns).to_numpy()

    if scaler is None:
        # No scaler supplied: learn the min / max from this very frame.
        scaler = MinMaxScaler().fit(feature_matrix)

    scaled_matrix = scaler.transform(feature_matrix)

    # One replacement Series per column, taken from the matching matrix column.
    replacements = [
        pl.Series(name, scaled_matrix[:, position])
        for position, name in enumerate(columns)
    ]
    return df.with_columns(replacements)

def compute_doc_ctr(merged_df: pl.DataFrame, columns_to_scale: List[str], z: float = 1.96) -> pl.DataFrame:
    """
    Compute per-article click count, exposure count, CTR and Wilson-smoothed CTR,
    Box-Cox transform both counts, and min-max scale the requested columns.

    Parameters:
    - merged_df: merged interaction data; must contain `article_id` and `is_clicked`
    - columns_to_scale: columns of the result to min-max scale (replaced in place)
    - z: z-score used by the Wilson smoothing (default 1.96, i.e. a 95% interval)

    Returns:
    - one row per article with the columns `article_id`, `docid_history_count`,
      `docid_expose_count`, `docid_ctr`, `docid_wilson_ctr`,
      `docid_expose_count_transformed_box` and `docid_history_count_transformed_box`
    """

    # 1. Per-article click total, exposure total and raw CTR
    doc_ctr = (
        merged_df
        .group_by("article_id")
        .agg([
            pl.sum("is_clicked").alias("docid_history_count"),  # total clicks of the article
            pl.count("is_clicked").alias("docid_expose_count")  # total exposures of the article
        ])
        .with_columns(
            (pl.col("docid_history_count") / pl.col("docid_expose_count")).alias("docid_ctr")
        )
    )

    # 2. Wilson-smoothed CTR (lower bound of the Wilson score interval).
    # The exposure count is an unsigned 32-bit integer, and an integer power keeps
    # the integer dtype, so squaring it wraps around once an article has 65,536
    # or more exposures. Doing the arithmetic on a Float64 view avoids that while
    # leaving the stored count column unchanged.
    exposures = pl.col("docid_expose_count").cast(pl.Float64)
    ctr = pl.col("docid_ctr")
    z_squared = z ** 2

    centre = ctr + z_squared / (2 * exposures)
    margin = z * (ctr * (1 - ctr) / exposures + z_squared / (4 * exposures ** 2)).sqrt()

    doc_ctr = doc_ctr.with_columns(
        ((centre - margin) / (1 + z_squared / exposures)).alias("docid_wilson_ctr")
    )

    # 3. Box-Cox transform of both counts (shifted by 1 so zero counts stay positive)
    doc_ctr = doc_ctr.with_columns(
        pl.Series(boxcox(doc_ctr["docid_expose_count"].to_numpy() + 1)[0]).alias("docid_expose_count_transformed_box")
    )

    doc_ctr = doc_ctr.with_columns(
        pl.Series(boxcox(doc_ctr["docid_history_count"].to_numpy() + 1)[0]).alias("docid_history_count_transformed_box")
    )

    # 4. Min-max scale the requested columns
    doc_ctr = min_max_scale(doc_ctr, columns_to_scale)

    return doc_ctr

In [22]:
# Per-article CTR features; the two Box-Cox transformed count columns are min-max scaled.
doc_ctr = compute_doc_ctr(
    merged_df,
    ["docid_expose_count_transformed_box", "docid_history_count_transformed_box"],
)

# Save under the directory matching the current mode (offline / online).
doc_ctr.write_ipc(
    f"{offline_path}/doc_ctr_offline.ipc" if offline else f"{online_path}/doc_ctr_online.ipc"
)

### 4. **计算基于 category1（一级分类）的历史点击次数、ctr和热度**

In [23]:
def compute_category1_stats(merged_df: pl.DataFrame) -> pl.DataFrame:
    """
    Aggregate click statistics per first-level category.

    Parameters:
    - merged_df: merged interaction data containing `category_level1`,
      `is_clicked` and `duration_transformed_box`

    Returns:
    - one row per first-level category with `category1_history_count` (total
      clicks), `category1_ctr` (clicks / rows), `category1_popularity` (share
      of all clicks) and `category1_history_duration_mean` (mean transformed
      duration over clicked rows only), sorted by popularity in descending order
    """
    key = "category_level1"
    clicks = pl.col("is_clicked")

    # Click total and CTR per category.
    click_stats = merged_df.group_by(key).agg([
        clicks.sum().alias("category1_history_count"),
        (clicks.sum() / pl.len()).alias("category1_ctr"),
    ])

    # Popularity: the category's clicks as a fraction of all clicks.
    # Missing categories are represented by "" afterwards.
    all_clicks = pl.col("category1_history_count").sum()
    click_stats = click_stats.with_columns(
        (pl.col("category1_history_count") / all_clicks).alias("category1_popularity")
    ).fill_null("")

    # Mean transformed duration, restricted to rows that were actually clicked.
    clicked_rows = merged_df.filter(clicks == 1)
    duration_stats = clicked_rows.group_by(key).agg(
        pl.col("duration_transformed_box").mean().alias("category1_history_duration_mean")
    ).fill_null("")

    # Left join keeps categories without any click; most popular categories first.
    return (
        click_stats
        .join(duration_stats, on=key, how="left")
        .sort("category1_popularity", descending=True)
    )

In [24]:
# First-level category statistics.
category1_result = compute_category1_stats(merged_df)

# Save under the directory matching the current mode (offline / online).
category1_result.write_ipc(
    f"{offline_path}/doc_category1_stats_offline.ipc"
    if offline
    else f"{online_path}/doc_category1_stats_online.ipc"
)

### 5. **计算基于 category2（二级分类）的历史点击次数、ctr和热度**

In [25]:
def compute_category2_stats(merged_df: pl.DataFrame) -> pl.DataFrame:
    """
    Aggregate click statistics per second-level category.

    Parameters:
    - merged_df: merged interaction data containing `category_level2`,
      `is_clicked` and `duration_transformed_box`

    Returns:
    - one row per second-level category with `category2_history_count` (total
      clicks), `category2_ctr` (clicks / rows), `category2_popularity` (share
      of all clicks) and `category2_history_duration_mean` (mean transformed
      duration over clicked rows only), sorted by popularity in descending order
    """
    key = "category_level2"
    clicks = pl.col("is_clicked")

    # Click total and CTR per category.
    click_stats = merged_df.group_by(key).agg([
        clicks.sum().alias("category2_history_count"),
        (clicks.sum() / pl.len()).alias("category2_ctr"),
    ])

    # Popularity: the category's clicks as a fraction of all clicks.
    # Missing categories are represented by "" afterwards.
    all_clicks = pl.col("category2_history_count").sum()
    click_stats = click_stats.with_columns(
        (pl.col("category2_history_count") / all_clicks).alias("category2_popularity")
    ).fill_null("")

    # Mean transformed duration, restricted to rows that were actually clicked.
    clicked_rows = merged_df.filter(clicks == 1)
    duration_stats = clicked_rows.group_by(key).agg(
        pl.col("duration_transformed_box").mean().alias("category2_history_duration_mean")
    ).fill_null("")

    # Left join keeps categories without any click; most popular categories first.
    return (
        click_stats
        .join(duration_stats, on=key, how="left")
        .sort("category2_popularity", descending=True)
    )

In [26]:
# Second-level category statistics.
category2_result = compute_category2_stats(merged_df)

# Save under the directory matching the current mode (offline / online).
category2_result.write_ipc(
    f"{offline_path}/doc_category2_stats_offline.ipc"
    if offline
    else f"{online_path}/doc_category2_stats_online.ipc"
)

In [27]:
# Inspect the second-level category statistics.
category2_result

category_level2,category2_history_count,category2_ctr,category2_popularity,category2_history_duration_mean
str,i64,f64,f64,f64
"""情感/婚姻与家庭""",2059011,0.155137,0.082589,0.378579
"""娱乐/内地明星""",1925265,0.135733,0.077224,0.343158
"""军事/军事新闻""",1015236,0.171002,0.040722,0.343749
"""健康/疾病防护治疗及西医用药""",969574,0.12753,0.038891,0.329706
"""星座运势/风水与算命""",910410,0.200625,0.036517,0.251325
…,…,…,…,…
"""汽车/其他""",0,0.0,0.0,
"""颜值才艺/男神""",0,0.0,0.0,
"""音乐/日本音乐""",0,0.0,0.0,
"""搞笑/小品""",0,0.0,0.0,


### 6. 计算文章的平均阅读时长

In [28]:
# Average reading duration per article
def compute_doc_duration_mean(merged_df: pl.DataFrame) -> pl.DataFrame:
    """
    Compute each article's mean reading duration over clicked rows.

    Parameters:
    - merged_df: merged interaction data containing `article_id`,
      `is_clicked` and `duration_transformed_box`

    Returns:
    - one row per article that has at least one clicked row, with
      `docid_history_duration_mean` holding the mean of `duration_transformed_box`
    """
    # Only rows that were clicked contribute to the mean.
    clicked_rows = merged_df.filter(pl.col("is_clicked") == 1)
    mean_duration = pl.col("duration_transformed_box").mean().alias("docid_history_duration_mean")
    return clicked_rows.group_by("article_id").agg(mean_duration)

In [29]:
# Per-article mean reading duration, computed on the frame after fill_null("").
doc_duration_mean = compute_doc_duration_mean(merged_df.fill_null(""))

# Save under the directory matching the current mode (offline / online).
doc_duration_mean.write_ipc(
    f"{offline_path}/doc_duration_mean_offline.ipc"
    if offline
    else f"{online_path}/doc_duration_mean_online.ipc"
)