In [18]:
import polars as pl
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from typing import List
import pandas as pd


### 1. 读取数据

In [19]:
# Data directories: the offline split (train/val interaction logs), the online
# split (test interaction log), and the raw user/article tables.
offline_path = "/data3/zxh/news_rec/offline_data"
online_path = "/data3/zxh/news_rec/online_data"
raw_path = "/data3/zxh/news_rec/raw_data"

In [20]:
# Load the user interaction logs (Arrow IPC files)
train_data_offline = pl.read_ipc(f"{offline_path}/train_data_offline.ipc")
val_data_offline =  pl.read_ipc(f"{offline_path}/val_data_offline.ipc")
test_data_online = pl.read_ipc(f"{online_path}/test_data_online.ipc")

# Load the user and article attribute tables
user_info = pl.read_ipc(f"{raw_path}/user_info.arrow")
doc_info = pl.read_ipc(f"{raw_path}/doc_info.arrow")

### 2. 合并数据

In [21]:
# Offline training computes the statistics from train_data only, whereas the
# online run needs train_data + val_data as its history.
offline = True
if not offline:
    # Online mode: the validation split is appended to the training history.
    # Note that `val_data` is not defined in this branch.
    train_data = pl.concat([train_data_offline, val_data_offline], how="vertical")
    test_data = test_data_online
else:
    train_data, val_data, test_data = train_data_offline, val_data_offline, test_data_online

# 1. Attach user and article attributes to every interaction row (left joins
#    keep all interactions), then
# 2. drop the columns that the statistics below do not use.
merged_df = (
    train_data
    .join(user_info, on="user_id", how="left")
    .join(doc_info, on="article_id", how="left")
    .drop(
        ["network_env", "refresh_count", "expose_pos", "device_name", "os", "province", "city", "age", "gender",
         "title", "publish_time", "image_count", "keywords", "expose_time", "duration", "refresh_count_transformed_box"]
    )
)

### 2. 计算用户的各类统计指标

#### 2.1 计算用户的CTR

In [22]:
# Min-Max scaling helper
def min_max_scale(df, columns, scaler=None):
    """Rescale the given columns of a Polars DataFrame with a Min-Max scaler.

    Parameters:
    - df: DataFrame holding the columns to rescale.
    - columns: names of the columns to rescale; each one is overwritten
      (same name) in the returned frame.
    - scaler: optional, already fitted object exposing ``transform``. When it
      is ``None`` a new ``MinMaxScaler`` is fitted on ``df[columns]``.

    Returns:
    - A new DataFrame whose listed columns hold the scaled values. A scaler
      fitted inside this function is NOT returned, so it cannot be reused on
      another frame by the caller.
    """
    if scaler is None:
        scaler = MinMaxScaler().fit(df.select(columns).to_numpy())

    scaled_matrix = scaler.transform(df.select(columns).to_numpy())

    # Column i of the scaled matrix corresponds to columns[i].
    replacements = []
    for position, name in enumerate(columns):
        replacements.append(pl.Series(scaled_matrix[:, position]).alias(f"{name}"))
    return df.with_columns(replacements)


def compute_user_ctr(merged_data: pl.DataFrame, columns_to_scale: List, z: float = 1.96) -> pl.DataFrame:
    """
    Build per-user click statistics: historical click count, exposure count,
    raw CTR, Wilson-smoothed CTR, and Box-Cox transformed counts which are
    afterwards Min-Max scaled.

    Parameters:
    - merged_data: interaction log with at least the columns ``user_id`` and
      ``is_clicked``. ``is_clicked`` is summed to obtain the click count, so it
      is assumed to be a 0/1 flag (TODO confirm against the data schema).
    - columns_to_scale: columns of the result to Min-Max scale, e.g.
      ["userid_expose_count_transformed_box", "userid_history_count_transformed_box"]
    - z: z-score used by the Wilson score interval (default 1.96)

    Returns:
    - One row per ``user_id`` with the columns ``userid_history_count``,
      ``userid_expose_count``, ``userid_ctr``, ``userid_wilson_ctr``,
      ``userid_expose_count_transformed_box`` and
      ``userid_history_count_transformed_box``.
    """

    # 1. Clicks and exposures per user, plus the raw CTR (clicks / exposures).
    user_ctr = (
        merged_data
        .group_by("user_id")
        .agg([
            pl.sum("is_clicked").alias("userid_history_count"),  # total clicks
            pl.count("is_clicked").alias("userid_expose_count")  # total exposures (non-null rows)
        ])
        .with_columns(
            (pl.col("userid_history_count") / pl.col("userid_expose_count")).alias("userid_ctr")
        )
    )

    # 2. Wilson-smoothed CTR (lower bound of the Wilson score interval):
    #    (p + z^2/(2n) - z * sqrt(p(1-p)/n + z^2/(4n^2))) / (1 + z^2/n)
    #    The exposure count is an integer column; squaring and multiplying it
    #    in integer arithmetic can overflow for very active users, so it is
    #    cast to Float64 before any arithmetic. For counts that do not
    #    overflow the result is unchanged.
    n = pl.col("userid_expose_count").cast(pl.Float64)
    p = pl.col("userid_ctr")
    z_sq = z ** 2
    wilson_ctr = (
        p + z_sq / (2 * n)
        - z * (p * (1 - p) / n + z_sq / (4 * (n * n))).sqrt()
    ) / (1 + z_sq / n)
    user_ctr = user_ctr.with_columns(wilson_ctr.alias("userid_wilson_ctr"))

    # 3. Box-Cox transform of both counts. 1 is added because Box-Cox requires
    #    strictly positive input; the fitted lambda (second return value of
    #    `boxcox`) is discarded.
    user_ctr = user_ctr.with_columns(
        (pl.Series(boxcox(user_ctr["userid_expose_count"].to_numpy() + 1)[0])).alias("userid_expose_count_transformed_box")
    )

    user_ctr = user_ctr.with_columns(
        (pl.Series(boxcox(user_ctr["userid_history_count"].to_numpy() + 1)[0])).alias("userid_history_count_transformed_box")
    )

    # 4. Min-Max scale the requested columns.
    user_ctr = min_max_scale(user_ctr, columns_to_scale)

    return user_ctr


In [23]:
user_ctr = compute_user_ctr(
    merged_df,
    ["userid_expose_count_transformed_box", "userid_history_count_transformed_box"],
)

# Persist next to the data split the statistics were computed for.
user_ctr.write_ipc(
    f"{offline_path}/user_ctr_offline.ipc" if offline else f"{online_path}/user_ctr_online.ipc"
)

#### 2.2 计算用户对每个大类（category_level1）的统计信息

In [24]:
def compute_user_category1_stats(merged_df: pl.DataFrame) -> pl.DataFrame:
    """
    Compute per-user statistics for every top-level category (category_level1):
    - bucketed click count (userid_category1_history_count), stored as a
      string label: "1" = exactly 1 click, "2" = 2-3 clicks, "3" = 4 or more
    - click-through rate (userid_category1_ctr)
    - mean reading duration over clicked articles
      (userid_category1_history_duration_mean)

    Only (user, category) pairs with at least one click appear in the result,
    because the click-count frame is the left side of the final joins.

    Parameters:
    - merged_df: interaction data; must contain the columns 'user_id',
      'category_level1', 'is_clicked' and 'duration_transformed_box'.

    Returns:
    - pl.DataFrame: one row per clicked (user_id, category_level1) pair with
      the three statistics above. An input without any clicked row yields an
      empty frame.
    """
    keys = ["user_id", "category_level1"]

    # 1. Number of clicked articles per (user, category).
    user_category1_count = (
        merged_df
        .filter(pl.col("is_clicked") == 1)  # clicked articles only
        .group_by(keys)
        .agg(pl.count("is_clicked").alias("userid_category1_history_count"))
    )

    # 2. Replace the raw count by its bucket label. Bucket edges are [1, 2, 4):
    #    a native when/then chain is used instead of a per-row Python callback,
    #    which also makes the step safe for an empty frame (no max() needed).
    clicks = pl.col("userid_category1_history_count")
    user_category1_count = user_category1_count.with_columns(
        pl.when(clicks >= 4).then(pl.lit("3"))
        .when(clicks >= 2).then(pl.lit("2"))
        .when(clicks >= 1).then(pl.lit("1"))
        .otherwise(pl.lit(""))  # counts below 1 or null fall into the "" class
        .alias("userid_category1_history_count")
    ).fill_null("")

    # 3. CTR per (user, category) over all exposures.
    user_category1_ctr = (
        merged_df
        .group_by(keys)
        .agg([
            pl.sum("is_clicked").alias("clicks"),
            pl.count("is_clicked").alias("exposes")
        ])
        .with_columns(
            (pl.col("clicks") / pl.col("exposes")).alias("userid_category1_ctr")
        )
        .select(["user_id", "category_level1", "userid_category1_ctr"])
    )

    # 4. Mean (transformed) reading duration over clicked articles.
    user_category1_duration = (
        merged_df
        .filter(pl.col("is_clicked") == 1)  # clicked articles only
        .group_by(keys)
        .agg(pl.mean("duration_transformed_box").alias("userid_category1_history_duration_mean"))
    )

    # 5. Combine all features, keyed on (user_id, category_level1).
    user_category1_stats = (
        user_category1_count
        .join(user_category1_ctr, on=keys, how="left")
        .join(user_category1_duration, on=keys, how="left")
    )

    return user_category1_stats

In [25]:
# Nulls are replaced with "" before aggregating; the summary below shows ""
# as the smallest category_level1 value.
user_category1_result = compute_user_category1_stats(merged_df.fill_null(""))
# Summary statistics of the result (rendered as the cell output).
user_category1_result.describe()

statistic,user_id,category_level1,userid_category1_history_count,userid_category1_ctr,userid_category1_history_duration_mean
str,f64,str,str,f64,f64
"""count""",5223291.0,"""5223291""","""5223291""",5223291.0,5223291.0
"""null_count""",0.0,"""0""","""0""",0.0,0.0
"""mean""",1905700000.0,,,0.341583,0.337289
"""std""",545660000.0,,,0.290521,0.114177
"""min""",17340.0,"""""","""1""",0.000763,0.029607
"""25%""",1476400000.0,,,0.125,0.266121
"""50%""",2208100000.0,,,0.25,0.34075
"""75%""",2396900000.0,,,0.5,0.411686
"""max""",2447100000.0,"""音乐""","""3""",1.0,0.998981


In [26]:
# Persist the result next to the data split it was computed for.
user_category1_result.write_ipc(
    f"{offline_path}/user_category1_stats_offline.ipc"
    if offline
    else f"{online_path}/user_category1_stats_online.ipc"
)

#### 2.3 计算用户对每个小类（category_level2）的统计信息

In [27]:
def compute_user_category2_stats(merged_df: pl.DataFrame) -> pl.DataFrame:
    """
    Compute per-user statistics for every sub-category (category_level2):
    - bucketed click count (userid_category2_history_count), stored as a
      string label: "1" = exactly 1 click, "2" = 2-3 clicks, "3" = 4 or more
    - click-through rate (userid_category2_ctr)
    - mean reading duration over clicked articles
      (userid_category2_history_duration_mean)

    Only (user, category) pairs with at least one click appear in the result,
    because the click-count frame is the left side of the final joins.

    Parameters:
    - merged_df: interaction data; must contain the columns 'user_id',
      'category_level2', 'is_clicked' and 'duration_transformed_box'.

    Returns:
    - pl.DataFrame: one row per clicked (user_id, category_level2) pair with
      the three statistics above. An input without any clicked row yields an
      empty frame.
    """
    keys = ["user_id", "category_level2"]

    # 1. Number of clicked articles per (user, sub-category).
    user_category2_count = (
        merged_df
        .filter(pl.col("is_clicked") == 1)  # clicked articles only
        .group_by(keys)
        .agg(pl.count("is_clicked").alias("userid_category2_history_count"))
    )

    # 2. Replace the raw count by its bucket label. Bucket edges are [1, 2, 4):
    #    a native when/then chain is used instead of a per-row Python callback,
    #    which also makes the step safe for an empty frame (no max() needed).
    clicks = pl.col("userid_category2_history_count")
    user_category2_count = user_category2_count.with_columns(
        pl.when(clicks >= 4).then(pl.lit("3"))
        .when(clicks >= 2).then(pl.lit("2"))
        .when(clicks >= 1).then(pl.lit("1"))
        .otherwise(pl.lit(""))  # counts below 1 or null fall into the "" class
        .alias("userid_category2_history_count")
    ).fill_null("")

    # 3. CTR per (user, sub-category) over all exposures.
    user_category2_ctr = (
        merged_df
        .group_by(keys)
        .agg([
            pl.sum("is_clicked").alias("clicks"),
            pl.count("is_clicked").alias("exposes")
        ])
        .with_columns(
            (pl.col("clicks") / pl.col("exposes")).alias("userid_category2_ctr")
        )
        .select(["user_id", "category_level2", "userid_category2_ctr"])
    )

    # 4. Mean (transformed) reading duration over clicked articles.
    user_category2_duration = (
        merged_df
        .filter(pl.col("is_clicked") == 1)  # clicked articles only
        .group_by(keys)
        .agg(pl.mean("duration_transformed_box").alias("userid_category2_history_duration_mean"))
    )

    # 5. Combine all features, keyed on (user_id, category_level2).
    user_category2_stats = (
        user_category2_count
        .join(user_category2_ctr, on=keys, how="left")
        .join(user_category2_duration, on=keys, how="left")
    )

    return user_category2_stats

In [28]:
# Nulls are replaced with "" before aggregating; the summary below shows ""
# as the smallest category_level2 value.
user_category2_result = compute_user_category2_stats(merged_df.fill_null(""))
# Summary statistics of the result (rendered as the cell output).
user_category2_result.describe()

statistic,user_id,category_level2,userid_category2_history_count,userid_category2_ctr,userid_category2_history_duration_mean
str,f64,str,str,f64,f64
"""count""",7684950.0,"""7684950""","""7684950""",7684950.0,7684950.0
"""null_count""",0.0,"""0""","""0""",0.0,0.0
"""mean""",1893700000.0,,,0.42394,0.335328
"""std""",554860000.0,,,0.323368,0.118182
"""min""",17340.0,"""""","""1""",0.001374,0.029607
"""25%""",1466500000.0,,,0.166667,0.260483
"""50%""",2207000000.0,,,0.333333,0.339603
"""75%""",2395000000.0,,,0.5,0.41314
"""max""",2447100000.0,"""音乐/港台音乐""","""3""",1.0,0.998981


In [29]:
# Persist the result next to the data split it was computed for.
user_category2_result.write_ipc(
    f"{offline_path}/user_category2_stats_offline.ipc"
    if offline
    else f"{online_path}/user_category2_stats_online.ipc"
)

#### 2.4 计算用户平均阅读时长

In [30]:
# Mean reading duration per user
def compute_user_duration_mean(merged_df: pl.DataFrame) -> pl.DataFrame:
    """
    Compute every user's mean reading duration over clicked articles.

    Parameters:
    - merged_df: interaction data with the columns 'user_id', 'is_clicked'
      and 'duration_transformed_box'.

    Returns:
    - pl.DataFrame with one row per user that has at least one clicked row,
      holding 'user_id' and 'userid_history_duration_mean'.
    """
    clicked_rows = merged_df.filter(pl.col("is_clicked") == 1)
    mean_duration = pl.mean("duration_transformed_box").alias("userid_history_duration_mean")
    return clicked_rows.group_by("user_id").agg(mean_duration)


In [31]:
user_duration_mean = compute_user_duration_mean(merged_df.fill_null(""))
# Persist the result next to the data split it was computed for.
user_duration_mean.write_ipc(
    f"{offline_path}/user_duration_mean_offline.ipc"
    if offline
    else f"{online_path}/user_duration_mean_online.ipc"
)