In [1]:
import polars as pl
import pandas as pd

### 1. 读取数据

In [2]:
# Directory from which the three .ipc datasets are read in the next cell.
temp_path = "/data3/zxh/news_rec/temp_results"

In [3]:
# Load the offline train/validation splits and the online test split, in that order.
train_data_offline, val_data_offline, test_data_online = (
    pl.read_ipc(ipc_file)
    for ipc_file in (
        f"{temp_path}/train_data_offline.ipc",
        f"{temp_path}/val_data_offline.ipc",
        f"{temp_path}/test_data_online.ipc",
    )
)

### 2. 处理duration的异常值

#### 2.1 处理duration为619315213的异常值

In [7]:
# The value 619315213 turns up in val_data_offline (it is the max in the summary).
val_data_offline.get_column("duration").describe()

statistic,value
str,f64
"""count""",15893872.0
"""null_count""",7739.0
"""mean""",60.670086
"""std""",155344.878725
"""min""",0.0
"""25%""",0.0
"""50%""",0.0
"""75%""",0.0
"""max""",619315213.0


In [8]:
# Replace the 619315213 outlier with the mean duration that train_data_offline
# records for clicked rows of the same article.
OUTLIER_ARTICLE_ID = 466400260
OUTLIER_DURATION = 619315213

comparable_rows = (
    (pl.col("article_id") == OUTLIER_ARTICLE_ID)
    & (pl.col("is_clicked") == 1)
    & (pl.col("duration") != OUTLIER_DURATION)  # leave the outlier itself out
)
mean_duration = (
    train_data_offline
    .filter(comparable_rows)
    .select(pl.col("duration"))
    .mean()
    .item()
)

# NOTE(review): the summaries recorded in this notebook show null_count going
# from 7739 to 7740 across this step, which suggests mean_duration was None
# here and the outlier became null rather than a number — confirm.
outlier_row = (
    (pl.col("article_id") == OUTLIER_ARTICLE_ID)
    & (pl.col("duration") == OUTLIER_DURATION)
)
val_data_offline = val_data_offline.with_columns(
    pl.when(outlier_row)
    .then(mean_duration)
    .otherwise(pl.col("duration"))
    .alias("duration")
)
val_data_offline["duration"].describe()

statistic,value
str,f64
"""count""",15893871.0
"""null_count""",7740.0
"""mean""",21.704428
"""std""",83.623885
"""min""",0.0
"""25%""",0.0
"""50%""",0.0
"""75%""",0.0
"""max""",3505.0


#### 2.2 处理duration为null的情况

In [9]:
# Per-article mean duration, later used to fill null durations.
# Only clicked training rows with a non-null duration take part.
clicked_with_duration = pl.col("duration").is_not_null() & (pl.col("is_clicked") == 1)

article_avg_duration = (
    train_data_offline
    .filter(clicked_with_duration)
    .group_by("article_id")
    .agg(avg_duration=pl.col("duration").mean().round())  # mean, rounded to a whole number
)
article_avg_duration

article_id,avg_duration
i64,f64
462916213,108.0
465425345,5.0
463191874,128.0
463827349,189.0
465881769,153.0
…,…
463701770,102.0
464723536,308.0
464882279,84.0
463823675,146.0


In [10]:
def _attach_article_avg(frame):
    """Left-join the training-set per-article average duration onto ``frame``."""
    return frame.join(article_avg_duration, on="article_id", how="left")


# Map the averages computed from the training set onto every split.
train_data_offline = _attach_article_avg(train_data_offline)
val_data_offline = _attach_article_avg(val_data_offline)
test_data_online = _attach_article_avg(test_data_online)

In [11]:
def _fill_duration_from_article_avg(frame):
    """Replace null ``duration`` values with the row's ``avg_duration``."""
    patched_duration = (
        pl.when(pl.col("duration").is_null())
        .then(pl.col("avg_duration"))
        .otherwise(pl.col("duration"))
    )
    return frame.with_columns(patched_duration.alias("duration"))


# Null durations take the average duration of their article_id.
train_data_offline = _fill_duration_from_article_avg(train_data_offline)
val_data_offline = _fill_duration_from_article_avg(val_data_offline)
test_data_online = _fill_duration_from_article_avg(test_data_online)

print(train_data_offline.describe())
print(val_data_offline.describe())
print(test_data_online.describe())

shape: (9, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ user_id   ┆ article_i ┆ expose_ti ┆ … ┆ is_clicke ┆ duration  ┆ date      ┆ avg_dura │
│ ---       ┆ ---       ┆ d         ┆ me        ┆   ┆ d         ┆ ---       ┆ ---       ┆ tion     │
│ str       ┆ f64       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ f64       ┆ str       ┆ ---      │
│           ┆           ┆ f64       ┆ f64       ┆   ┆ f64       ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 1.6105985 ┆ 1.6105985 ┆ 1.6105985 ┆ … ┆ 1.6105985 ┆ 1.6105899 ┆ 161059858 ┆ 1.596976 │
│           ┆ 8e8       ┆ 8e8       ┆ 8e8       ┆   ┆ 8e8       ┆ 6e8       ┆           ┆ 43e8     │
│ null_coun ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 862.0     ┆ 0         ┆ 1.362215 │
│ t         ┆           ┆           ┆           ┆   ┆           ┆           

In [12]:
# Global mean duration over clicked training rows; it covers the rows whose
# article has no per-article average to fall back on.
mean_duration = (
    train_data_offline
    .filter(pl.col("is_clicked") == 1)
    .get_column("duration")
    .mean()
)


def _fill_remaining_duration(frame, fallback):
    """Fill the still-null ``duration`` values with ``fallback`` and drop the helper columns."""
    filled_duration = (
        pl.when(pl.col("duration").is_null())
        .then(fallback)
        .otherwise(pl.col("duration"))
    )
    return (
        frame
        .with_columns(filled_duration.alias("duration"))
        .drop(["date", "avg_duration"])
    )


# Fill the remaining nulls in every split with the same training-set mean.
train_data_offline = _fill_remaining_duration(train_data_offline, mean_duration)
val_data_offline = _fill_remaining_duration(val_data_offline, mean_duration)
test_data_online = _fill_remaining_duration(test_data_online, mean_duration)

print(train_data_offline.describe())
print(val_data_offline.describe())
print(test_data_online.describe())

shape: (9, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ user_id   ┆ article_i ┆ expose_ti ┆ … ┆ refresh_c ┆ expose_po ┆ is_clicke ┆ duration │
│ ---       ┆ ---       ┆ d         ┆ me        ┆   ┆ ount      ┆ s         ┆ d         ┆ ---      │
│ str       ┆ f64       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ f64      │
│           ┆           ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 1.6105985 ┆ 1.6105985 ┆ 1.6105985 ┆ … ┆ 1.6105985 ┆ 1.6105985 ┆ 1.6105985 ┆ 1.610598 │
│           ┆ 8e8       ┆ 8e8       ┆ 8e8       ┆   ┆ 8e8       ┆ 8e8       ┆ 8e8       ┆ 58e8     │
│ null_coun ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ t         ┆           ┆           ┆           ┆   ┆           ┆           ┆

#### 2.3 聚合数据

In [13]:
# After the fill, the same exposure may be recorded several times (repeated
# reads), so those rows are aggregated into one.
_EXPOSURE_CLICK_KEYS = [
    "user_id", "article_id", "expose_time", "network_env",
    "refresh_count", "expose_pos", "is_clicked",
]


def _sum_clicked_duration(frame):
    """Group by the exposure keys plus ``is_clicked`` and total the effective duration."""
    # Only clicked rows contribute their duration; every other row counts as 0.
    effective_duration = (
        pl.when(pl.col("is_clicked") == 1)
        .then(pl.col("duration"))
        .otherwise(0)
    )
    return frame.group_by(_EXPOSURE_CLICK_KEYS).agg(
        effective_duration.sum().alias("duration")
    )


train_data_offline = _sum_clicked_duration(train_data_offline)
val_data_offline = _sum_clicked_duration(val_data_offline)
test_data_online = _sum_clicked_duration(test_data_online)

In [14]:
# A single exposure can be logged both as clicked and as not clicked (a click
# row was found to come with an identical non-click row), so the two are merged
# by summing is_clicked and duration.
_EXPOSURE_KEYS = [
    "user_id", "article_id", "expose_time",
    "network_env", "refresh_count", "expose_pos",
]


def _merge_click_states(frame):
    """Collapse rows sharing the exposure keys, summing ``is_clicked`` and ``duration``."""
    return frame.group_by(_EXPOSURE_KEYS).agg(
        pl.col("is_clicked").sum(),
        pl.col("duration").sum(),
    )


train_data_offline = _merge_click_states(train_data_offline)
val_data_offline = _merge_click_states(val_data_offline)
test_data_online = _merge_click_states(test_data_online)

### 3. 将长尾分布压缩为正态分布

#### 3.1 处理duration

In [15]:
import scipy.stats as stats
import numpy as np
import pandas as pd

In [16]:
# Only rows with is_clicked == 1 go through the duration transform.
clicked_only = pl.col("is_clicked") == 1

train_data_filtered = train_data_offline.filter(clicked_only)
val_data_filtered = val_data_offline.filter(clicked_only)
test_data_filtered = test_data_online.filter(clicked_only)

In [17]:
# Fit Box-Cox on the clicked training durations and keep the fitted lambda.
# The +1 shift avoids the log(0) problem.
train_duration_np = train_data_filtered["duration"].to_numpy()
train_boxcox, boxcox_lambda = stats.boxcox(train_duration_np + 1)

train_data_filtered = train_data_filtered.with_columns(
    pl.Series("duration_transformed_box", train_boxcox)
)


def _boxcox_duration_column(frame, lmbda):
    """Return ``duration + 1`` of ``frame`` Box-Cox transformed with a fixed ``lmbda``."""
    shifted = frame["duration"].to_numpy() + 1
    return pl.Series("duration_transformed_box", stats.boxcox(shifted, lmbda=lmbda))


# Validation and test reuse the lambda fitted on the training rows.
val_data_filtered = val_data_filtered.with_columns(
    _boxcox_duration_column(val_data_filtered, boxcox_lambda)
)
test_data_filtered = test_data_filtered.with_columns(
    _boxcox_duration_column(test_data_filtered, boxcox_lambda)
)

In [18]:
# Compare the raw and Box-Cox-transformed duration for each split.
for clicked_frame in (train_data_filtered, val_data_filtered, test_data_filtered):
    print(clicked_frame.select("duration", "duration_transformed_box").describe())

shape: (9, 3)
┌────────────┬─────────────┬──────────────────────────┐
│ statistic  ┆ duration    ┆ duration_transformed_box │
│ ---        ┆ ---         ┆ ---                      │
│ str        ┆ f64         ┆ f64                      │
╞════════════╪═════════════╪══════════════════════════╡
│ count      ┆ 2.2747279e7 ┆ 2.2747279e7              │
│ null_count ┆ 0.0         ┆ 0.0                      │
│ mean       ┆ 151.870764  ┆ 8.500939                 │
│ std        ┆ 162.829197  ┆ 3.364397                 │
│ min        ┆ 1.0         ┆ 0.75407                  │
│ 25%        ┆ 45.0        ┆ 6.273527                 │
│ 50%        ┆ 109.0       ┆ 8.701629                 │
│ 75%        ┆ 204.0       ┆ 10.773692                │
│ max        ┆ 3565.0      ┆ 25.469292                │
└────────────┴─────────────┴──────────────────────────┘
shape: (9, 3)
┌────────────┬────────────┬──────────────────────────┐
│ statistic  ┆ duration   ┆ duration_transformed_box │
│ ---        ┆ ---    

In [19]:
from scipy.stats import kurtosis, skew

# 定义一个函数计算偏度和峰度
def calc_skew_kurtosis(series, name):
    """Print the skewness and kurtosis of ``series`` on one line, labelled ``name``.

    Both statistics come from ``scipy.stats`` with ``nan_policy='omit'``, so NaN
    entries are ignored. The values are printed with four decimals; nothing is
    returned.
    """
    skew_value = skew(series, nan_policy='omit')
    kurt_value = kurtosis(series, nan_policy='omit')

    skew_text = f"{name} 的偏度 (Skewness): {skew_value:.4f}"
    kurt_text = f"{name} 的峰度 (Kurtosis): {kurt_value:.4f}\n"
    print(skew_text, kurt_text)

# Skewness / kurtosis of duration: first the raw values of each split, then
# the Box-Cox-transformed ones.
for duration_values, report_label in (
    (train_data_filtered["duration"], "训练集原始数据"),
    (val_data_filtered["duration"], "验证集原始数据"),
    (test_data_filtered["duration"], "测试集原始数据"),
    (train_data_filtered["duration_transformed_box"], "训练集 Box-Cox 变换"),
    (val_data_filtered["duration_transformed_box"], "验证集 Box-Cox 变换"),
    (test_data_filtered["duration_transformed_box"], "测试集 Box-Cox 变换"),
):
    calc_skew_kurtosis(duration_values.to_numpy(), report_label)

训练集原始数据 的偏度 (Skewness): 3.2896 训练集原始数据 的峰度 (Kurtosis): 22.7731

验证集原始数据 的偏度 (Skewness): 3.1356 验证集原始数据 的峰度 (Kurtosis): 19.7851

测试集原始数据 的偏度 (Skewness): 3.2267 测试集原始数据 的峰度 (Kurtosis): 21.7571

训练集 Box-Cox 变换 的偏度 (Skewness): -0.0276 训练集 Box-Cox 变换 的峰度 (Kurtosis): -0.1048

验证集 Box-Cox 变换 的偏度 (Skewness): -0.0165 验证集 Box-Cox 变换 的峰度 (Kurtosis): -0.1460

测试集 Box-Cox 变换 的偏度 (Skewness): -0.0191 测试集 Box-Cox 变换 的峰度 (Kurtosis): -0.1430



In [20]:
# Join the transformed duration of the clicked rows back onto the full frames;
# rows without a match (no click) get 0.
_CLICK_JOIN_KEYS = [
    "user_id", "article_id", "expose_time", "network_env",
    "refresh_count", "expose_pos", "is_clicked",
]


def _attach_transformed_duration(full_frame, clicked_frame):
    """Left-join ``duration_transformed_box`` from ``clicked_frame`` onto ``full_frame``.

    Args:
        full_frame: frame holding every exposure row.
        clicked_frame: frame carrying the join keys and ``duration_transformed_box``.

    Returns:
        ``full_frame`` with a ``duration_transformed_box`` column; rows that
        found no partner in ``clicked_frame`` hold 0.
    """
    return (
        full_frame
        .join(
            clicked_frame.select(_CLICK_JOIN_KEYS + ["duration_transformed_box"]),
            on=_CLICK_JOIN_KEYS,
            how="left",
        )
        # Fill only the joined column. A frame-wide fill_null(0) would also
        # overwrite nulls in any other numeric column with 0 without notice.
        .with_columns(pl.col("duration_transformed_box").fill_null(0))
    )


train_data_offline = _attach_transformed_duration(train_data_offline, train_data_filtered)
val_data_offline = _attach_transformed_duration(val_data_offline, val_data_filtered)
test_data_online = _attach_transformed_duration(test_data_online, test_data_filtered)

#### 3.2 处理refresh_count

In [21]:
# Fit Box-Cox on the training refresh_count and keep the fitted lambda.
# The +1 shift avoids the log(0) problem.
# NOTE: train_boxcox and boxcox_lambda are rebound here, so the values fitted
# for duration in section 3.1 are no longer reachable under these names.
train_refresh_count_np = train_data_offline["refresh_count"].to_numpy()
train_boxcox, boxcox_lambda = stats.boxcox(train_refresh_count_np + 1)

train_data_offline = train_data_offline.with_columns(
    pl.Series("refresh_count_transformed_box", train_boxcox)
)


def _boxcox_refresh_count_column(frame, lmbda):
    """Return ``refresh_count + 1`` of ``frame`` Box-Cox transformed with a fixed ``lmbda``."""
    shifted = frame["refresh_count"].to_numpy() + 1
    return pl.Series("refresh_count_transformed_box", stats.boxcox(shifted, lmbda=lmbda))


# Validation and test reuse the lambda fitted on the training frame.
val_data_offline = val_data_offline.with_columns(
    _boxcox_refresh_count_column(val_data_offline, boxcox_lambda)
)
test_data_online = test_data_online.with_columns(
    _boxcox_refresh_count_column(test_data_online, boxcox_lambda)
)

In [22]:
# Compare the raw and Box-Cox-transformed refresh_count for each split.
for split_frame in (train_data_offline, val_data_offline, test_data_online):
    print(split_frame.select("refresh_count", "refresh_count_transformed_box").describe())

shape: (9, 3)
┌────────────┬───────────────┬───────────────────────────────┐
│ statistic  ┆ refresh_count ┆ refresh_count_transformed_box │
│ ---        ┆ ---           ┆ ---                           │
│ str        ┆ f64           ┆ f64                           │
╞════════════╪═══════════════╪═══════════════════════════════╡
│ count      ┆ 1.59283939e8  ┆ 1.59283939e8                  │
│ null_count ┆ 0.0           ┆ 0.0                           │
│ mean       ┆ 8.951377      ┆ 1.559789                      │
│ std        ┆ 15.037508     ┆ 0.872285                      │
│ min        ┆ 0.0           ┆ 0.0                           │
│ 25%        ┆ 2.0           ┆ 1.04409                       │
│ 50%        ┆ 4.0           ┆ 1.494234                      │
│ 75%        ┆ 11.0          ┆ 2.217441                      │
│ max        ┆ 650.0         ┆ 4.859617                      │
└────────────┴───────────────┴───────────────────────────────┘
shape: (9, 3)
┌────────────┬─────────────

In [23]:
from scipy.stats import kurtosis, skew

# calc_skew_kurtosis is already defined in section 3.1 with an identical body,
# so it is reused here rather than redefined (a second definition would only
# shadow the first one).

# Skewness / kurtosis of refresh_count before the transform.
calc_skew_kurtosis(train_data_offline["refresh_count"].to_numpy(), "训练集原始数据")
calc_skew_kurtosis(val_data_offline["refresh_count"].to_numpy(), "验证集原始数据")
calc_skew_kurtosis(test_data_online["refresh_count"].to_numpy(), "测试集原始数据")

# Skewness / kurtosis of refresh_count after the Box-Cox transform.
calc_skew_kurtosis(train_data_offline["refresh_count_transformed_box"].to_numpy(), "训练集 Box-Cox 变换")
calc_skew_kurtosis(val_data_offline["refresh_count_transformed_box"].to_numpy(), "验证集 Box-Cox 变换")
calc_skew_kurtosis(test_data_online["refresh_count_transformed_box"].to_numpy(), "测试集 Box-Cox 变换")

训练集原始数据 的偏度 (Skewness): 8.0006 训练集原始数据 的峰度 (Kurtosis): 142.7083

验证集原始数据 的偏度 (Skewness): 7.2799 验证集原始数据 的峰度 (Kurtosis): 110.2773

测试集原始数据 的偏度 (Skewness): 8.1811 测试集原始数据 的峰度 (Kurtosis): 141.2875

训练集 Box-Cox 变换 的偏度 (Skewness): 0.0161 训练集 Box-Cox 变换 的峰度 (Kurtosis): -0.5207

验证集 Box-Cox 变换 的偏度 (Skewness): 0.0291 验证集 Box-Cox 变换 的峰度 (Kurtosis): -0.4522

测试集 Box-Cox 变换 的偏度 (Skewness): 0.0267 测试集 Box-Cox 变换 的峰度 (Kurtosis): -0.4584



#### 3.3 对refresh_count, duration, expose_time 进行Min-Max 标准化

In [24]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# 定义 Min-Max 标准化函数
def min_max_scale(df, columns, scaler=None):
    """Min-max scale ``columns`` of ``df``, overwriting them in the returned frame.

    Args:
        df: polars DataFrame containing every name listed in ``columns``.
        columns: names of the columns to scale; the scaled values replace the
            originals under the same names.
        scaler: an already-fitted ``MinMaxScaler`` to reuse. When ``None``, a
            new scaler is created and fitted on ``df`` itself.

    Returns:
        ``(scaled_df, scaler)``: the frame with the scaled columns and the
        scaler that produced them, so it can be handed to later calls.
    """
    # Convert the selected columns to NumPy once and share the matrix between
    # fit and transform, instead of materialising it separately for each.
    values = df.select(columns).to_numpy()

    if scaler is None:
        scaler = MinMaxScaler().fit(values)

    scaled_values = scaler.transform(values)

    return df.with_columns([
        pl.Series(scaled_values[:, i]).alias(col)
        for i, col in enumerate(columns)
    ]), scaler

# Columns to rescale.
# NOTE(review): the section heading also names expose_time, but only these two
# columns are scaled here — confirm that is intended.
columns_to_scale = ["refresh_count_transformed_box", "duration_transformed_box"]

# Fit the scaler on the training frame ...
train_data_offline, scaler = min_max_scale(train_data_offline, columns_to_scale)

# ... and reuse that same scaler for the validation and test frames.
val_data_offline, _ = min_max_scale(val_data_offline, columns_to_scale, scaler)
test_data_online, _ = min_max_scale(test_data_online, columns_to_scale, scaler)

# Inspect the scaled columns of each split.
for scaled_frame in (train_data_offline, val_data_offline, test_data_online):
    print(scaled_frame.select(columns_to_scale).describe())

shape: (9, 3)
┌────────────┬───────────────────────────────┬──────────────────────────┐
│ statistic  ┆ refresh_count_transformed_box ┆ duration_transformed_box │
│ ---        ┆ ---                           ┆ ---                      │
│ str        ┆ f64                           ┆ f64                      │
╞════════════╪═══════════════════════════════╪══════════════════════════╡
│ count      ┆ 1.59283939e8                  ┆ 1.59283939e8             │
│ null_count ┆ 0.0                           ┆ 0.0                      │
│ mean       ┆ 0.32097                       ┆ 0.047666                 │
│ std        ┆ 0.179497                      ┆ 0.127002                 │
│ min        ┆ 0.0                           ┆ 0.0                      │
│ 25%        ┆ 0.21485                       ┆ 0.0                      │
│ 50%        ┆ 0.30748                       ┆ 0.0                      │
│ 75%        ┆ 0.4563                        ┆ 0.0                      │
│ max        ┆ 1.0      

### 4. 保存数据

#### 4.1 线下数据

In [25]:
# Write the processed offline train/validation frames to disk.
offline_path = "/data1/zxh/news_rec/offline_data"

for offline_frame, offline_file in (
    (train_data_offline, f"{offline_path}/train_data_offline.ipc"),
    (val_data_offline, f"{offline_path}/val_data_offline.ipc"),
):
    offline_frame.write_ipc(offline_file)

#### 4.2 线上数据

In [26]:
# The online training set is the offline train and validation frames stacked
# row-wise; it is written next to the processed online test frame.
online_path = "/data1/zxh/news_rec/online_data"

offline_frames = (train_data_offline, val_data_offline)
train_data_online = pl.concat(offline_frames, how="vertical")

train_data_online.write_ipc(f"{online_path}/train_data_online.ipc")
test_data_online.write_ipc(f"{online_path}/test_data_online.ipc")