### 将所有属性整合起来变成一条条的训练数据

### 1. 导入包

In [1]:
import pickle
import polars as pl
from tqdm import tqdm
# Fill values obtained from the training set. Every missing statistical
# feature must be filled from this dictionary; it is applied to both the
# training and the validation frames further down.
FILL_VALUES = {
    'docid_history_count': 0.3197370642850515,
    'docid_expose_count': 0.35353441318778134,
    'docid_ctr': 0.006392260064240436,
    'docid_history_duration_mean': 0.28339285217401183,
    'category1_ctr': 0.13570003591442992,
    'category1_popularity': 0.036799390379834,
    'category1_history_duration_mean': 0.32981757714259613,
    'category2_ctr': 0.13293362050565613,
    'category2_popularity': 0.007456100573611463,
    'category2_history_duration_mean': 0.32639158456815426,
    'userid_category1_ctr': 0.25,
    'userid_category1_history_duration_mean': 0.34075001249075343,
    'userid_category2_ctr': 0.3333333333333333,
    'userid_category2_history_duration_mean': 0.3396034216144695,
    'userid_history_duration_mean': 0.3472170022795658,
    'userid_history_count': 0.33148076049294156,
    'userid_expose_count': 0.3488569631166438,
    'userid_ctr': 0.036223160969787456
 }

In [2]:
# Configure the data directories.
# `offline` picks the dataset variant: it selects both the per-mode directory
# and the `_{mode}` suffix of the statistic files read by later cells.
offline = True
mode = "offline" if offline else "online"
data_path = f"/data3/zxh/news_rec/{mode}_data"
# Directory holding the inputs that do not depend on the mode.
public_path = "/data3/zxh/news_rec/public_data"

### 2. 处理 item 数据

#### 2.1 keywords数据

In [3]:
# Load the article keywords table (columns: article_id, keywords).
doc_keywords = pl.read_ipc(f"{public_path}/doc_keywords_feature.ipc")
# Summary statistics, rendered as the cell output.
doc_keywords.describe()

statistic,article_id,keywords
str,f64,str
"""count""",622407.0,"""622407"""
"""null_count""",0.0,"""0"""
"""mean""",464630000.0,
"""std""",1565300.0,
"""min""",334493096.0,"""#^^0000ff:0.032714,0000ff:0.03…"
"""25%""",463450810.0,
"""50%""",464632474.0,
"""75%""",465821561.0,
"""max""",467278131.0,"""龙泉:1.000000"""


#### 2.2 处理 item 的 sparse 特征

In [4]:
# Load the article sparse features (publish time, image count, two category levels).
doc_sparse_feature = pl.read_ipc(f"{public_path}/doc_sparse_feature.ipc")
# Summary statistics, rendered as the cell output.
doc_sparse_feature.describe()

statistic,article_id,publish_time,image_count,category_level1,category_level2
str,f64,f64,str,str,str
"""count""",633388.0,633146.0,"""633388""","""633388""","""633388"""
"""null_count""",0.0,242.0,"""0""","""0""","""0"""
"""mean""",464620000.0,1625000000000.0,,,
"""std""",1576900.0,448080000.0,,,
"""min""",325279629.0,1563400000000.0,"""""","""""",""""""
"""25%""",463448490.0,1624700000000.0,,,
"""50%""",464618782.0,1625000000000.0,,,
"""75%""",465814182.0,1625300000000.0,,,
"""max""",467278131.0,1625700000000.0,"""4""","""颜值才艺""","""颜值才艺/男神"""


#### 2.3 处理 item 的 category1 信息

In [5]:
# Load the level-1 category statistics for the selected mode; the raw
# history count column is dropped and not used as a feature.
doc_category1_stats = pl.read_ipc(f"{data_path}/doc_category1_stats_{mode}.ipc").drop(["category1_history_count"])
# Summary statistics, rendered as the cell output.
doc_category1_stats.describe()

statistic,category_level1,category1_ctr,category1_popularity,category1_history_duration_mean
str,str,f64,f64,f64
"""count""","""39""",39.0,39.0,38.0
"""null_count""","""0""",0.0,0.0,1.0
"""mean""",,0.130567,0.025641,0.323439
"""std""",,0.031055,0.029587,0.03478
"""min""","""""",0.0,0.0,0.194861
"""25%""",,0.117285,0.002096,0.31536
"""50%""",,0.129982,0.016115,0.325716
"""75%""",,0.145029,0.03891,0.342276
"""max""","""颜值才艺""",0.203812,0.108771,0.393342


#### 2.4 处理 item 的 category2 信息

In [6]:
# Load the level-2 category statistics for the selected mode; the raw
# history count column is dropped and not used as a feature.
doc_category2_stats = pl.read_ipc(f"{data_path}/doc_category2_stats_{mode}.ipc").drop(["category2_history_count"])
# Display the table as the cell output.
doc_category2_stats

category_level2,category2_ctr,category2_popularity,category2_history_duration_mean
str,f64,f64,f64
"""情感/婚姻与家庭""",0.15524,0.082245,0.378103
"""娱乐/内地明星""",0.135735,0.076843,0.342274
"""军事/军事新闻""",0.170171,0.040931,0.343934
"""健康/疾病防护治疗及西医用药""",0.128846,0.039994,0.329711
"""星座运势/风水与算命""",0.201543,0.036544,0.251788
…,…,…,…
"""综艺/欧美综艺""",0.0,0.0,
"""音乐/韩国音乐""",0.0,0.0,
"""颜值才艺/民间大神""",0.0,0.0,
"""正能量/慈善公益""",0.0,0.0,


#### 2.5 处理doc_ctr

In [7]:
# Load the per-article CTR statistics and expose them under the short
# feature names used by FILL_VALUES: the Wilson CTR becomes `docid_ctr` and
# the `*_transformed_box` counts become `docid_expose_count` /
# `docid_history_count`. The long-named source columns are then dropped.
doc_ctr = pl.read_ipc(f"{data_path}/doc_ctr_{mode}.ipc")
doc_ctr = doc_ctr.with_columns(
    pl.col("docid_wilson_ctr").alias("docid_ctr"),
    pl.col("docid_expose_count_transformed_box").alias("docid_expose_count"),
    pl.col("docid_history_count_transformed_box").alias("docid_history_count")
).drop(["docid_wilson_ctr", "docid_expose_count_transformed_box", "docid_history_count_transformed_box"])
# Summary statistics, rendered as the cell output.
doc_ctr.describe()

statistic,article_id,docid_history_count,docid_expose_count,docid_ctr
str,f64,f64,f64,f64
"""count""",510422.0,510422.0,510422.0,510422.0
"""null_count""",0.0,0.0,0.0,0.0
"""mean""",464140000.0,0.321515,0.368326,0.042859
"""std""",1353400.0,0.350572,0.272477,0.065692
"""min""",325279629.0,0.0,0.0,-0.000457
"""25%""",463212725.0,0.0,0.103044,0.0
"""50%""",464168668.0,0.319737,0.353534,0.006393
"""75%""",465093240.0,0.631201,0.591203,0.065915
"""max""",466163727.0,1.0,1.0,0.639767


#### 2.6 处理doc_duration_mean

In [8]:
# Load the per-article historical mean duration for the selected mode.
doc_duration_mean = pl.read_ipc(f"{data_path}/doc_duration_mean_{mode}.ipc")
# Display the table as the cell output.
doc_duration_mean

article_id,docid_history_duration_mean
i64,f64
465886092,0.240756
465139463,0.234646
463371324,0.248754
463798514,0.314775
463028025,0.219942
…,…
466054392,0.333897
464730092,0.304147
464346290,0.342493
462280997,0.358856


#### 2.7 合并为 doc_columns

In [9]:
# Build one row per article by successively full-joining the per-article
# tables. A full join keeps ids that appear on either side; the right-hand
# key arrives as `article_id_right`, so the two key columns are coalesced
# back into a single `article_id` after every join.
doc_columns = (
    doc_keywords.join(doc_sparse_feature, on="article_id", how="full")
    # Merge the two key columns right after the join.
    .with_columns(
        pl.coalesce(["article_id", "article_id_right"]).alias("article_id")
    )
    .drop("article_id_right")  # drop the temporary right-hand key column
)

doc_columns = (
    doc_columns.join(doc_ctr, on="article_id", how="full")
    # Merge the two key columns right after the join.
    .with_columns(
        pl.coalesce(["article_id", "article_id_right"]).alias("article_id"),
        # Articles without sparse features get the empty-string category so
        # that the category joins below still find a matching key.
        pl.col("category_level1").fill_null(""),
        pl.col("category_level2").fill_null("")
    )
    .drop("article_id_right")  # drop the temporary right-hand key column
)

doc_columns = (
    doc_columns.join(doc_duration_mean, on="article_id", how="full")
    # Merge the two key columns right after the join.
    .with_columns(
        pl.coalesce(["article_id", "article_id_right"]).alias("article_id"),
    )
    .drop("article_id_right")  # drop the temporary right-hand key column
)

# Attach the category-level statistics; left joins keep every article row.
doc_columns = (
    doc_columns
    .join(doc_category1_stats, on="category_level1", how="left")
    .join(doc_category2_stats, on="category_level2", how="left")
)


# Summary statistics, rendered as the cell output.
doc_columns.describe()

statistic,article_id,keywords,publish_time,image_count,category_level1,category_level2,docid_history_count,docid_expose_count,docid_ctr,docid_history_duration_mean,category1_ctr,category1_popularity,category1_history_duration_mean,category2_ctr,category2_popularity,category2_history_duration_mean
str,f64,str,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",633391.0,"""622407""",633146.0,"""633388""","""633391""","""633391""",510422.0,510422.0,510422.0,263897.0,633391.0,633391.0,633371.0,633385.0,633385.0,633317.0
"""null_count""",0.0,"""10984""",245.0,"""3""","""0""","""0""",122969.0,122969.0,122969.0,369494.0,0.0,0.0,20.0,6.0,6.0,74.0
"""mean""",464620000.0,,1625000000000.0,,,,0.321515,0.368326,0.042859,0.276945,0.139,0.044158,0.324469,0.135475,0.015962,0.319449
"""std""",1576900.0,,448080000.0,,,,0.350572,0.272477,0.065692,0.092258,0.021447,0.032997,0.035638,0.031786,0.020485,0.03857
"""min""",325279629.0,"""#^^0000ff:0.032714,0000ff:0.03…",1563400000000.0,"""""","""""","""""",0.0,0.0,-0.000457,0.029607,0.0,0.0,0.194861,0.0,0.0,0.064567
"""25%""",463448492.0,,1624700000000.0,,,,0.0,0.103044,0.0,0.216367,0.12763,0.018259,0.31536,0.118006,0.003806,0.302457
"""50%""",464618782.0,,1625000000000.0,,,,0.319737,0.353534,0.006393,0.283393,0.1357,0.036799,0.329818,0.132934,0.007456,0.326392
"""75%""",465814182.0,,1625300000000.0,,,,0.631201,0.591203,0.065915,0.338884,0.145029,0.073908,0.342276,0.14895,0.023202,0.341365
"""max""",467278131.0,"""龙泉:1.000000""",1625700000000.0,"""4""","""颜值才艺""","""颜值才艺/男神""",1.0,1.0,0.639767,0.959917,0.203812,0.108771,0.393342,0.5,0.082245,0.43561


In [10]:
# Display the merged per-article feature table.
doc_columns

article_id,keywords,publish_time,image_count,category_level1,category_level2,docid_history_count,docid_expose_count,docid_ctr,docid_history_duration_mean,category1_ctr,category1_popularity,category1_history_duration_mean,category2_ctr,category2_popularity,category2_history_duration_mean
i64,str,i64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
349635709,"""上班族:0.052498,买车:0.050440,二手车:0…",1572519971000,"""4""","""汽车""","""汽车/用车""",0.0,0.0,0.0,,0.117285,0.018259,0.307542,0.114759,0.005196,0.326605
361653323,"""医生:0.133734,吸烟:0.149266,板蓝根:0.…",1624522285000,"""1""","""健康""","""健康/疾病防护治疗及西医用药""",0.0,0.528662,-1.2302e-17,,0.1361,0.075243,0.3257,0.128846,0.039994,0.329711
426732705,"""155n:0.033340,polo:0.029521,中控…",1610808303000,"""4""","""汽车""","""汽车/买车""",0.0,0.0,0.0,,0.117285,0.018259,0.307542,0.126386,0.007456,0.28925
430221183,"""etc:0.038040,代表:0.028015,内饰:0.…",1612581556000,"""2""","""汽车""","""汽车/买车""",0.0,0.0,0.0,,0.117285,0.018259,0.307542,0.126386,0.007456,0.28925
441756326,"""丰田凯美瑞:0.089051,充电器:0.058525,品牌…",1618825835000,"""4""","""汽车""","""汽车/买车""",0.0,0.0,0.0,,0.117285,0.018259,0.307542,0.126386,0.007456,0.28925
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
467278115,"""cj:0.048329,三巨头:0.027688,争冠:0.…",1625672111000,"""4""","""体育""","""体育/NBA""",,,,,0.171307,0.086241,0.329818,0.200853,0.02414,0.328701
467278124,"""t恤:0.031757,主理人:0.025905,优雅:0.…",1625672116000,"""4""","""时尚""","""时尚/明星时尚""",,,,,0.14042,0.028536,0.194861,0.096845,0.006064,0.222068
463642111,,,,"""""","""""",0.591687,0.504615,0.064033,0.200082,0.107493,0.000024,0.316754,0.107472,0.000024,0.316754
465493907,,,,"""""","""""",0.0,0.256481,-3.1392e-17,,0.107493,0.000024,0.316754,0.107472,0.000024,0.316754


### 3. 处理 user 数据

#### 3.1 处理 user_sparse

In [11]:
# Load the user sparse features from the shared (mode-independent) directory.
user_sparse_feature = pl.read_ipc(f"{public_path}/user_sparse_feature.ipc")

#### 3.2 处理 user_duration_mean

In [12]:
# Load the per-user historical mean duration for the selected mode.
user_duration_mean = pl.read_ipc(f"{data_path}/user_duration_mean_{mode}.ipc")
# Display the table as the cell output.
user_duration_mean

user_id,userid_history_duration_mean
i64,f64
2428862566,0.329776
1628826228,0.261558
2425081826,0.340964
2327646976,0.220498
2420880976,0.300819
…,…
2309825734,0.394413
2395993612,0.453946
1515669388,0.246478
2393402292,0.377204


#### 3.3 处理 user_category1_stats

In [13]:
# Load the per-(user, level-1 category) statistics for the selected mode.
user_category1_stats = pl.read_ipc(f"{data_path}/user_category1_stats_{mode}.ipc")
# Display the table as the cell output.
user_category1_stats

user_id,category_level1,userid_category1_history_count,userid_category1_ctr,userid_category1_history_duration_mean
i64,str,str,f64,f64
2392908568,"""文化艺术""","""2""",0.333333,0.431503
1677034496,"""案件""","""1""",1.0,0.434322
2408807906,"""娱乐""","""1""",0.5,0.46306
2229132736,"""情感""","""2""",0.010526,0.254821
1283875834,"""科学""","""1""",1.0,0.352306
…,…,…,…,…
1395900392,"""农村""","""1""",0.5,0.264268
2137389826,"""国内""","""1""",0.333333,0.34384
1346437114,"""健康""","""2""",0.4,0.408096
2284854470,"""财经""","""1""",0.333333,0.366142


#### 3.4 处理 user_category2_stats

In [14]:
# Load the per-(user, level-2 category) statistics for the selected mode.
user_category2_stats = pl.read_ipc(f"{data_path}/user_category2_stats_{mode}.ipc")
# Display the table as the cell output.
user_category2_stats

user_id,category_level2,userid_category2_history_count,userid_category2_ctr,userid_category2_history_duration_mean
i64,str,str,f64,f64
2416668718,"""国际/国际事件""","""2""",0.666667,0.251545
2413148662,"""娱乐/内地明星""","""2""",0.25,0.417218
2370463616,"""育儿/儿童健康与安全""","""3""",0.3125,0.485823
2432203626,"""综艺/内地综艺""","""1""",0.333333,0.087878
2209119430,"""美食/美酒""","""1""",0.1,0.303089
…,…,…,…,…
509048864,"""健康/中医药""","""3""",0.153846,0.270139
1466667146,"""军事/武器""","""3""",0.444444,0.360275
2289702632,"""时尚/女性时尚""","""1""",0.2,0.19389
1411826622,"""情感/婚姻与家庭""","""1""",0.5,0.448627


#### 3.5 处理 user_ctr

In [15]:
# Load the per-user CTR statistics and expose them under the short feature
# names used by FILL_VALUES: the Wilson CTR becomes `userid_ctr` and the
# `*_transformed_box` counts become `userid_expose_count` /
# `userid_history_count`. The long-named source columns are then dropped.
user_ctr = pl.read_ipc(f"{data_path}/user_ctr_{mode}.ipc")
user_ctr = user_ctr.with_columns(
    pl.col("userid_wilson_ctr").alias("userid_ctr"),
    pl.col("userid_expose_count_transformed_box").alias("userid_expose_count"),
    pl.col("userid_history_count_transformed_box").alias("userid_history_count")
).drop(["userid_wilson_ctr", "userid_expose_count_transformed_box", "userid_history_count_transformed_box"])
# Summary statistics, rendered as the cell output.
user_ctr.describe()

statistic,user_id,userid_history_count,userid_expose_count,userid_ctr
str,f64,f64,f64,f64
"""count""",1354232.0,1354232.0,1354232.0,1354232.0
"""null_count""",0.0,0.0,0.0,0.0
"""mean""",1977200000.0,0.341694,0.352287,0.074834
"""std""",482130000.0,0.309706,0.259966,0.097211
"""min""",17340.0,0.0,0.0,-3.1392e-17
"""25%""",1561600000.0,0.0,0.079144,0.0
"""50%""",2215200000.0,0.331481,0.348857,0.036223
"""75%""",2409600000.0,0.631879,0.576362,0.117301
"""max""",2447100000.0,1.0,1.0,0.860241


In [16]:
# Display the per-user CTR table.
user_ctr

user_id,userid_history_count,userid_expose_count,userid_ctr
i64,f64,f64,f64
2440268878,0.0,0.229427,0.0
2205581550,0.44868,0.421419,0.066786
1617945812,0.853049,0.813712,0.081448
1333304062,0.485887,0.647512,0.012778
1687507300,0.0,0.0,0.0
…,…,…,…
1636920014,0.44868,0.327487,0.138118
2446151344,0.223077,0.57892,0.002009
1617225520,0.816019,0.763416,0.091939
1610473070,0.921774,0.812951,0.270241


#### 3.6 合并为 训练数据

In [17]:
# Load the training exposure log and expose the `*_transformed_box` columns
# under the short names `duration` and `refresh_count`; the long-named
# source columns are then dropped.
train_data = pl.read_ipc(f"{data_path}/train_data_{mode}.ipc")
train_data = train_data.with_columns(
    pl.col("duration_transformed_box").alias("duration"),
    pl.col("refresh_count_transformed_box").alias("refresh_count"),
).drop(["duration_transformed_box", "refresh_count_transformed_box"])
# Display the table as the cell output.
train_data

user_id,article_id,expose_time,network_env,refresh_count,expose_pos,is_clicked,duration
i64,i64,i64,i64,f64,i64,i64,f64
2317493554,462921731,1624632635106,2,0.469308,111,0,0.0
2440811304,464761325,1625148494143,2,0.0,16,0,0.0
1489396294,463086468,1624730702499,2,0.426303,12,0,0.0
2378924896,463542557,1624779192681,2,0.0,17,1,0.516152
1607562824,466018342,1625438899443,2,0.30748,47,0,0.0
…,…,…,…,…,…,…,…
2207664466,465969570,1625440853917,2,0.565787,48,0,0.0
2389035156,462689382,1624594863130,5,0.339484,12,0,0.0
2439009808,464021624,1624891814619,2,0.589187,71,0,0.0
2217371040,462652228,1624695820851,5,0.366121,12,1,0.487154


In [18]:
# Attach user-level, article-level and user-x-category features to every
# training exposure. All joins are left joins, so every exposure row is kept
# and unmatched features come through as null.
train_data = (
    train_data
    .join(user_sparse_feature, on="user_id", how="left")
    .join(user_duration_mean, on="user_id", how="left")
    .join(user_ctr, on="user_id", how="left")
    .join(doc_columns, on="article_id", how="left")
    .join(user_category1_stats, on=["user_id", "category_level1"], how="left")
    .join(user_category2_stats, on=["user_id", "category_level2"], how="left")
)

# Fill missing values: statistical features take their FILL_VALUES entry
# (only for columns actually present), publish_time takes the median.
train_data = train_data.with_columns([
    pl.col(col).fill_null(value) for col, value in FILL_VALUES.items() if col in train_data.columns
] + [
    pl.col("publish_time").fill_null(pl.col("publish_time").median())  # publish_time is not in FILL_VALUES; use this frame's own median
])

# Derived columns:
#   expose_hourdiff - absolute expose/publish gap in whole hours (ms // 3_600_000)
#   expose_hour     - hour of day of the exposure timestamp
#   is_converted    - 1 when duration >= the article's historical mean duration
# NOTE(review): the Datetime cast carries no time zone, so expose_hour is
# presumably a UTC hour rather than local time - confirm this is intended.
train_data = train_data.with_columns(
    (abs(train_data["expose_time"] - train_data["publish_time"]) // 3_600_000).cast(pl.Int32).alias("expose_hourdiff"),
    (train_data["expose_time"].cast(pl.Datetime("ms")).dt.hour()).alias("expose_hour"),
    (train_data["duration"] >= train_data["docid_history_duration_mean"]).cast(pl.Int8).alias("is_converted")
).drop(["publish_time"])
# Display the assembled training frame.
train_data

user_id,article_id,expose_time,network_env,refresh_count,expose_pos,is_clicked,duration,device_name,os,province,city,age,gender,userid_history_duration_mean,userid_history_count,userid_expose_count,userid_ctr,keywords,image_count,category_level1,category_level2,docid_history_count,docid_expose_count,docid_ctr,docid_history_duration_mean,category1_ctr,category1_popularity,category1_history_duration_mean,category2_ctr,category2_popularity,category2_history_duration_mean,userid_category1_history_count,userid_category1_ctr,userid_category1_history_duration_mean,userid_category2_history_count,userid_category2_ctr,userid_category2_history_duration_mean,expose_hourdiff,expose_hour,is_converted
i64,i64,i64,i64,f64,i64,i64,f64,str,str,str,str,str,str,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,i32,i8,i8
2317493554,462921731,1624632635106,2,0.469308,111,0,0.0,"""ARE-AL10""","""Android""","""广东""","""中山""","""A_0_24:0.157357,A_25_29:0.2262…","""female:0.482044,male:0.517956""",0.297346,0.820895,0.850557,0.031359,"""a53:0.034701,a55:0.032022,cpu:…","""4""","""科技""","""科技/数码产品""",0.95027,0.899628,0.047699,0.346577,0.140858,0.022642,0.316223,0.072288,0.001068,0.290893,"""2""",0.05,0.315946,,0.333333,0.339603,29,14,0
2440811304,464761325,1625148494143,2,0.0,16,0,0.0,"""PBFM00""","""Android""","""河北""","""邯郸""","""A_0_24:0.117757,A_25_29:0.0378…","""female:1.0,male:0.0""",0.297753,0.594452,0.535231,0.093131,"""五官:0.018150,保养:0.027471,修图:0.0…","""3""","""娱乐""","""娱乐/内地明星""",0.986718,0.936446,0.150558,0.232901,0.1357,0.101556,0.342276,0.135735,0.076843,0.342274,"""2""",0.105263,0.336492,,0.333333,0.339603,34,14,0
1489396294,463086468,1624730702499,2,0.426303,12,0,0.0,"""PBAM00""","""Android""","""山西""","""大同""","""A_0_24:0.269228,A_25_29:0.0745…","""female:0.0,male:1.0""",0.366736,0.651339,0.628817,0.066012,"""一国两制:0.033800,中央政府:0.040037,传媒…","""1""","""国内""","""国内/港澳台""",0.995593,0.972368,0.154125,0.27921,0.12763,0.03891,0.311547,0.148351,0.013164,0.318563,"""1""",0.066667,0.325422,,0.333333,0.339603,37,18,0
2378924896,463542557,1624779192681,2,0.0,17,1,0.516152,"""HLK-AL00""","""Android""","""辽宁""","""大连""","""A_0_24:0.041074,A_25_29:0.0335…","""female:0.448847,male:0.551153""",0.363969,0.816019,0.637311,0.370589,"""健康:0.035787,养狗:0.050002,天天:0.0…","""4""","""宠物""","""宠物/宠物狗""",0.985059,0.95428,0.066942,0.383461,0.110343,0.00153,0.324651,0.105987,0.001138,0.333547,"""1""",0.2,0.516152,"""1""",0.25,0.516152,19,7,1
1607562824,466018342,1625438899443,2,0.30748,47,0,0.0,"""PBCM10""","""Android""","""河北""","""石家庄""","""A_0_24:0.776097,A_25_29:0.0909…","""female:0.707423,male:0.292577""",0.278953,0.793707,0.713613,0.123409,"""ktv:0.044719,交房:0.041619,儿童房:0…","""4""","""生活""","""生活/家居""",0.888204,0.752088,0.139554,0.294483,0.120128,0.010926,0.324877,0.126324,0.005211,0.315534,"""2""",0.074074,0.295731,"""2""",0.083333,0.295731,17,22,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2207664466,465969570,1625440853917,2,0.565787,48,0,0.0,"""PACM00""","""Android""","""江西""","""宜春""","""A_0_24:0.09648,A_25_29:0.18005…","""female:0.049749,male:0.950251""",0.268773,0.801979,0.831539,0.032069,"""led:0.032229,suv:0.031098,亚洲龙:…","""4""","""汽车""","""汽车/汽车资讯""",0.800729,0.681905,0.091145,0.299059,0.117285,0.018259,0.307542,0.109067,0.005608,0.314201,"""3""",0.070336,0.271314,"""2""",0.034483,0.324509,19,23,0
2389035156,462689382,1624594863130,5,0.339484,12,0,0.0,"""V2023A""","""Android""","""辽宁""","""沈阳""","""A_0_24:0.030006,A_25_29:0.4185…","""female:0.0,male:1.0""",0.379414,0.90848,0.818934,0.191759,"""中山服:0.017268,伟人:0.026915,南宁:0.…","""4""","""娱乐""","""娱乐/内地明星""",0.994006,0.972543,0.108115,0.418052,0.1357,0.101556,0.342276,0.135735,0.076843,0.342274,"""3""",0.166667,0.315576,"""3""",0.148936,0.286464,24,4,0
2439009808,464021624,1624891814619,2,0.589187,71,0,0.0,"""OPPOA83""","""Android""","""广东""","""河源""","""A_0_24:0.091429,A_25_29:0.0567…","""female:0.448526,male:0.551474""",0.283166,0.746262,0.785496,0.030597,"""为人处世:0.044559,主权:0.059941,化干戈为…","""3""","""星座运势""","""星座运势/星座与占卜""",0.319737,0.2871,0.030053,0.105847,0.203812,0.040131,0.248182,0.230204,0.003587,0.211447,"""3""",0.095238,0.272323,,0.333333,0.339603,12,14,0
2217371040,462652228,1624695820851,5,0.366121,12,1,0.487154,"""PBFM00""","""Android""","""河南""","""南阳""","""A_0_24:0.331022,A_25_29:0.2138…","""female:0.185992,male:0.814008""",0.433817,0.869543,0.698619,0.421443,"""保密局:0.023960,兰州:0.022771,农民:0.…","""3""","""历史""","""历史/中国史""",0.976103,0.913046,0.12025,0.463448,0.128401,0.036799,0.393342,0.124567,0.030694,0.399198,"""3""",0.530303,0.486017,"""3""",0.533333,0.495366,53,8,1


#### 3.7 合并为 验证数据

In [19]:
# Load the validation exposure log and expose the `*_transformed_box`
# columns under the short names `duration` and `refresh_count`; the
# long-named source columns are then dropped.
val_data = pl.read_ipc(f"{data_path}/val_data_{mode}.ipc")
val_data = val_data.with_columns(
    pl.col("duration_transformed_box").alias("duration"),
    pl.col("refresh_count_transformed_box").alias("refresh_count"),
).drop(["duration_transformed_box", "refresh_count_transformed_box"])
# Display the table as the cell output.
val_data

user_id,article_id,expose_time,network_env,refresh_count,expose_pos,is_clicked,duration
i64,i64,i64,i64,f64,i64,i64,f64
2383845382,465985940,1625460168905,2,0.138112,21,0,0.0
2215407536,466378681,1625528402502,5,0.339484,27,1,0.260483
1260176960,466045087,1625446747728,5,0.0,11,0,0.0
1615719778,466155787,1625467188140,2,0.138112,17,1,0.529495
1427811750,465777995,1625458586984,5,0.388887,82,0,0.0
…,…,…,…,…,…,…,…
515410694,465561543,1625447398704,2,0.502604,19,0,0.0
2446184828,466313181,1625467927386,2,0.692836,134,0,0.0
2214642936,465783406,1625509881385,5,0.138112,20,0,0.0
2424557914,466255306,1625492184976,2,0.502604,157,0,0.0


In [20]:
# Attach user-level, article-level and user-x-category features to every
# validation exposure, mirroring the training cell. All joins are left
# joins, so every exposure row is kept and unmatched features are null.
val_data = (
    val_data
    .join(user_sparse_feature, on="user_id", how="left")
    .join(user_duration_mean, on="user_id", how="left")
    .join(user_ctr, on="user_id", how="left")
    .join(doc_columns, on="article_id", how="left")
    .join(user_category1_stats, on=["user_id", "category_level1"], how="left")
    .join(user_category2_stats, on=["user_id", "category_level2"], how="left")
)

# Fill missing values: statistical features take their FILL_VALUES entry
# (only for columns actually present), publish_time takes the median.
# NOTE(review): this median is computed over val_data itself, whereas the
# other fills come from FILL_VALUES, which are stated to be training-set
# values - confirm whether the training median should be reused here.
val_data = val_data.with_columns([
    pl.col(col).fill_null(value) for col, value in FILL_VALUES.items() if col in val_data.columns
] + [
    pl.col("publish_time").fill_null(pl.col("publish_time").median())  # publish_time is not in FILL_VALUES; use this frame's own median
])

# Derived columns:
#   expose_hourdiff - absolute expose/publish gap in whole hours (ms // 3_600_000)
#   expose_hour     - hour of day of the exposure timestamp
#   is_converted    - 1 when duration >= the article's historical mean duration
# NOTE(review): the Datetime cast carries no time zone, so expose_hour is
# presumably a UTC hour rather than local time - confirm this is intended.
val_data = val_data.with_columns(
    (abs(val_data["expose_time"] - val_data["publish_time"]) // 3_600_000).cast(pl.Int32).alias("expose_hourdiff"),
    (val_data["expose_time"].cast(pl.Datetime("ms")).dt.hour()).alias("expose_hour"),
    (val_data["duration"] >= val_data["docid_history_duration_mean"]).cast(pl.Int8).alias("is_converted")
).drop(["publish_time"])
# Display the assembled validation frame.
val_data

user_id,article_id,expose_time,network_env,refresh_count,expose_pos,is_clicked,duration,device_name,os,province,city,age,gender,userid_history_duration_mean,userid_history_count,userid_expose_count,userid_ctr,keywords,image_count,category_level1,category_level2,docid_history_count,docid_expose_count,docid_ctr,docid_history_duration_mean,category1_ctr,category1_popularity,category1_history_duration_mean,category2_ctr,category2_popularity,category2_history_duration_mean,userid_category1_history_count,userid_category1_ctr,userid_category1_history_duration_mean,userid_category2_history_count,userid_category2_ctr,userid_category2_history_duration_mean,expose_hourdiff,expose_hour,is_converted
i64,i64,i64,i64,f64,i64,i64,f64,str,str,str,str,str,str,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,i32,i8,i8
2383845382,465985940,1625460168905,2,0.138112,21,0,0.0,"""PCHM10""","""Android""","""重庆""","""重庆""","""A_0_24:0.002771,A_25_29:0.0928…","""female:0.0,male:1.0""",0.227837,0.898302,0.85615,0.094584,"""中国^^航天员:0.052351,乘组:0.017802,全…","""3""","""科学""","""科学/天文与航天""",0.982858,0.967861,0.030923,0.259218,0.11512,0.029508,0.347674,0.110944,0.014636,0.345441,"""2""",0.1,0.17992,,0.333333,0.339603,24,4,0
2215407536,466378681,1625528402502,5,0.339484,27,1,0.260483,"""OPPOR11""","""Android""","""广东""","""江门""","""A_0_24:0.277652,A_25_29:0.4459…","""female:0.37388,male:0.62612""",0.3532,0.868,0.777326,0.163133,"""供人:0.057777,再见面:0.059943,出远门:0…","""4""","""情感""","""情感/恋爱""",0.319737,0.353534,0.006392,0.283393,0.157579,0.108771,0.363147,0.166305,0.025819,0.315326,"""3""",0.30198,0.404764,"""3""",0.375,0.360559,14,23,0
1260176960,466045087,1625446747728,5,0.0,11,0,0.0,"""PDHM00""","""Android""","""山东""","""烟台""","""A_0_24:0.582324,A_25_29:0.2575…","""female:0.0,male:1.0""",0.334894,0.83733,0.792809,0.085206,"""14k^^社团:0.025448,何家驹:0.028568,…","""4""","""娱乐""","""娱乐/港台明星""",0.878188,0.750653,0.118985,0.34902,0.1357,0.101556,0.342276,0.137973,0.023426,0.344025,"""3""",0.095,0.348529,"""3""",0.093023,0.366876,17,0,0
1615719778,466155787,1625467188140,2,0.138112,17,1,0.529495,"""OPPOR11t""","""Android""","""广东""","""广州""","""A_0_24:0.793802,A_25_29:0.0974…","""female:0.305407,male:0.694593""",0.438484,0.89421,0.725989,0.469599,"""中学生:0.043649,中考:0.032858,传播:0.…","""4""","""教育""","""教育/小学""",0.879769,0.728644,0.162053,0.334396,0.136538,0.073908,0.335691,0.124498,0.00633,0.326392,"""3""",0.5,0.517112,"""1""",1.0,0.487647,21,6,1
1427811750,465777995,1625458586984,5,0.388887,82,0,0.0,"""PADM00""","""Android""","""广东""","""清远""","""A_0_24:0.346819,A_25_29:0.4642…","""female:0.448829,male:0.551171""",0.264521,0.728176,0.757346,0.035519,"""上市:0.016848,云峰:0.038264,云峰^^基金…","""3""","""财经""","""财经/财经人物""",0.685976,0.613667,0.056711,0.267602,0.129982,0.024365,0.335007,0.140624,0.004679,0.354351,,0.25,0.34075,,0.333333,0.339603,44,4,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
515410694,465561543,1625447398704,2,0.502604,19,0,0.0,"""vivoX7""","""Android""","""山东""","""菏泽""","""A_0_24:0.746378,A_25_29:0.1259…","""female:0.232203,male:0.767797""",0.294714,0.810751,0.822878,0.040298,"""bug:0.019445,中生代:0.031304,主旋律:…","""4""","""电视剧""","""电视剧/内地剧""",0.968496,0.888075,0.14212,0.327064,0.128803,0.012889,0.34675,0.129062,0.012683,0.347359,,0.25,0.34075,,0.333333,0.339603,60,1,0
2446184828,466313181,1625467927386,2,0.692836,134,0,0.0,"""OPPOA77""","""Android""","""湖南""","""怀化""","""A_0_24:0.383825,A_25_29:0.1571…","""female:0.594218,male:0.405782""",0.314527,0.861923,0.852267,0.054768,"""t恤:0.026438,上衣:0.023783,亲子装:0.…","""4""","""娱乐""","""娱乐/港台明星""",0.319737,0.353534,0.006392,0.283393,0.1357,0.101556,0.342276,0.137973,0.023426,0.344025,"""3""",0.081744,0.32667,"""3""",0.09596,0.371868,0,6,0
2214642936,465783406,1625509881385,5,0.138112,20,0,0.0,"""OPPOR11s""","""Android""","""广东""","""广州""","""A_0_24:0.353158,A_25_29:0.2370…","""female:0.446355,male:0.553645""",0.365194,0.694381,0.599319,0.133388,"""ps:0.036133,vera:0.046340,vera…","""4""","""时尚""","""时尚/明星时尚""",0.993026,0.960108,0.154406,0.256949,0.14042,0.028536,0.194861,0.096845,0.006064,0.222068,,0.25,0.34075,,0.333333,0.339603,58,18,0
2424557914,466255306,1625492184976,2,0.502604,157,0,0.0,"""PBBM30""","""Android""","""湖南""","""常德""","""A_0_24:0.19411,A_25_29:0.19916…","""female:1.0,male:0.0""",0.241192,0.803522,0.742156,0.100754,"""买彩票:0.034427,人脉:0.020407,人际关系:…","""1""","""星座运势""","""星座运势/风水与算命""",0.319737,0.353534,0.006392,0.283393,0.203812,0.040131,0.248182,0.201543,0.036544,0.251788,"""3""",0.245614,0.209516,"""3""",0.245614,0.209516,13,13,0


##### 序列推荐专用

In [29]:
# 序列推荐模型专用
# Input for the sequence-recommendation model: stack the training and
# validation exposures, keep only the columns the history builder reads,
# and order the rows per user by exposure time and then position.
merge_data = pl.concat([train_data, val_data], how="vertical")
merge_data = merge_data.select(
    [
        "user_id",
        "article_id",
        "expose_time",
        "expose_pos",
        "category_level1",
        "category_level2",
        "is_clicked",
    ]
)
merge_data = merge_data.sort(["user_id", "expose_time", "expose_pos"])
def build_hist_features_str(
    group_df: pl.DataFrame, max_hist_len: int = 50
) -> pl.DataFrame:
    """Attach click-history columns to one group of exposure rows.

    Rows are processed in the order they appear in ``group_df``. For each
    row, the history consists of the most recent (at most ``max_hist_len``)
    *earlier* rows whose ``is_clicked`` equals 1; the current row is never
    part of its own history.

    Args:
        group_df: Rows to process. Must contain the columns ``article_id``,
            ``category_level1``, ``category_level2``, ``expose_time``
            (millisecond timestamp) and ``is_clicked``.
        max_hist_len: Maximum number of past clicks kept per row. Values
            less than or equal to zero yield empty histories.

    Returns:
        ``group_df`` with four additional string columns:
        ``hist_article_id``, ``hist_category_level1``,
        ``hist_category_level2`` and ``hist_hourdiff``. Each holds a
        comma-joined list (oldest click first); it is the empty string when
        the row has no earlier click. ``hist_hourdiff`` is the gap between
        the current row and each past click, truncated to whole hours.
    """
    # Convert to Python lists once; per-element Series access is slow.
    article_ids = group_df["article_id"].to_list()
    cat1s = group_df["category_level1"].to_list()
    cat2s = group_df["category_level2"].to_list()
    expose_times = group_df["expose_time"].to_list()  # millisecond timestamps
    clicks = group_df["is_clicked"].to_list()

    # Output columns, one entry per input row.
    hist_article_ids = []
    hist_cat1s = []
    hist_cat2s = []
    hist_hourdiffs = []

    # Row indices of the clicks seen so far. An index is appended only after
    # its row has been processed, so every entry already precedes the current
    # row and no per-row filtering of this list is needed.
    clicked_indices = []

    for i, (now_time, clicked) in enumerate(zip(expose_times, clicks)):
        # Most recent clicks only. The explicit guard is needed because a
        # `[-0:]` slice would return the whole list.
        recent = clicked_indices[-max_hist_len:] if max_hist_len > 0 else []

        hist_article_ids.append(",".join(str(article_ids[idx]) for idx in recent))
        hist_cat1s.append(",".join(str(cat1s[idx]) for idx in recent))
        hist_cat2s.append(",".join(str(cat2s[idx]) for idx in recent))
        hist_hourdiffs.append(
            ",".join(
                str(int((now_time - expose_times[idx]) / 1000 / 3600))
                for idx in recent
            )
        )

        # A clicked row becomes visible to the rows that follow it.
        if clicked == 1:
            clicked_indices.append(i)

    return group_df.with_columns([
        pl.Series("hist_article_id", hist_article_ids),
        pl.Series("hist_category_level1", hist_cat1s),
        pl.Series("hist_category_level2", hist_cat2s),
        pl.Series("hist_hourdiff", hist_hourdiffs),
    ])

In [31]:
import math

def split_by_user_id_range(
    merge_data: pl.DataFrame,
    num_parts: int = 300,
    output_dir: str = "/data3/zxh/news_rec/rank_csv_data/hist_train_data",
):
    """Shard ``merge_data`` by user_id range and write one CSV per shard.

    The user_id span ``[min, max]`` is cut into ``num_parts`` equal-width,
    half-open ranges, so all rows of a user land in the same shard (shards
    are equal in id width, not in row count). Each shard is grouped by
    ``user_id``, passed through ``build_hist_features_str`` to add the
    click-history columns, and written as a tab-separated CSV named
    ``train_csv_XXXXX-of-YYYYY.csv``.

    Args:
        merge_data: Frame with a ``user_id`` column plus the columns
            ``build_hist_features_str`` reads. Rows of each user should
            already be in chronological order; that order is preserved.
        num_parts: Number of user_id ranges / output files.
        output_dir: Directory the CSV shards are written to.

    Returns:
        None. An empty ``merge_data`` writes nothing.
    """
    if merge_data.height == 0:
        # No user_id range can be derived from an empty frame.
        return

    # The row offsets computed below are positions in user_id order, so they
    # only address the right rows if the frame itself is ordered by user_id.
    # A stable sort keeps each user's existing row order intact.
    if not merge_data["user_id"].is_sorted():
        merge_data = merge_data.sort("user_id", maintain_order=True)

    user_ids = merge_data["user_id"].to_numpy()
    min_uid, max_uid = user_ids[0], user_ids[-1]

    # Width of each user_id range.
    step = math.ceil((max_uid - min_uid + 1) / num_parts)

    # Range edges (left-closed, right-open); the last edge lies past max_uid.
    boundaries = [min_uid + i * step for i in range(num_parts + 1)]

    # First row position of every edge; the final entry equals the row count.
    row_indices = user_ids.searchsorted(boundaries, side="left")

    for i in range(num_parts):
        start_idx = row_indices[i]
        end_idx = row_indices[i + 1]
        part_df = merge_data.slice(start_idx, end_idx - start_idx)

        part_df = (
            part_df
            .group_by("user_id", maintain_order=True)
            .map_groups(build_hist_features_str)
        )

        part_df.write_csv(f"{output_dir}/train_csv_{i+1:05}-of-{num_parts:05}.csv", separator="\t")
        print(f"Saved train_csv_{i+1:05}-of-{num_parts:05}.csv in [{start_idx}, {end_idx}), rows = {len(part_df)}")
# Write the click-history shards for merge_data using the default num_parts (300).
split_by_user_id_range(merge_data)

Saved train_csv_00001-of-00300.csv in [0, 95768), rows = 95768
Saved train_csv_00002-of-00300.csv in [95768, 210518), rows = 114750
Saved train_csv_00003-of-00300.csv in [210518, 323341), rows = 112823
Saved train_csv_00004-of-00300.csv in [323341, 439196), rows = 115855
Saved train_csv_00005-of-00300.csv in [439196, 526424), rows = 87228
Saved train_csv_00006-of-00300.csv in [526424, 609661), rows = 83237
Saved train_csv_00007-of-00300.csv in [609661, 686307), rows = 76646
Saved train_csv_00008-of-00300.csv in [686307, 781146), rows = 94839
Saved train_csv_00009-of-00300.csv in [781146, 868586), rows = 87440
Saved train_csv_00010-of-00300.csv in [868586, 958337), rows = 89751
Saved train_csv_00011-of-00300.csv in [958337, 1074797), rows = 116460
Saved train_csv_00012-of-00300.csv in [1074797, 1186580), rows = 111783
Saved train_csv_00013-of-00300.csv in [1186580, 1292558), rows = 105978
Saved train_csv_00014-of-00300.csv in [1292558, 1416625), rows = 124067
Saved train_csv_00015-of-00

#### 3.8 合并为 测试数据

In [21]:
# Load the online test set, expose the `*_transformed_box` values under the
# plain names `duration` and `refresh_count`, then drop the source columns.
test_data = (
    pl.read_ipc("/data3/zxh/news_rec/online_data/test_data_online.ipc")
    .with_columns(
        duration=pl.col("duration_transformed_box"),
        refresh_count=pl.col("refresh_count_transformed_box"),
    )
    .drop(["duration_transformed_box", "refresh_count_transformed_box"])
)
test_data

user_id,article_id,expose_time,network_env,refresh_count,expose_pos,is_clicked,duration
i64,i64,i64,i64,f64,i64,i64,f64
2431381002,466497559,1625560435365,2,0.366121,47,0,0.0
2390152616,466838383,1625584316582,2,0.829975,144,0,0.0
2443013308,466596360,1625552213014,2,0.138112,25,0,0.0
1293444900,466550480,1625547205088,5,0.21485,27,0,0.0
2446511758,465769818,1625546678713,2,0.26756,22,1,0.57374
…,…,…,…,…,…,…,…
2445496462,466025645,1625530203987,2,0.21485,34,0,0.0
1349872856,466714829,1625567748252,2,0.64565,321,0,0.0
2445791558,466446910,1625540321832,5,0.138112,21,0,0.0
2436636882,465965637,1625561846430,5,0.652561,48,0,0.0


In [22]:
# Attach user-, document- and (user, category)-level features. Left joins keep
# every test row even when a lookup table has no matching key.
test_data = (
    test_data
    .join(user_sparse_feature, on="user_id", how="left")
    .join(user_duration_mean, on="user_id", how="left")
    .join(user_ctr, on="user_id", how="left")
    .join(doc_columns, on="article_id", how="left")
    .join(user_category1_stats, on=["user_id", "category_level1"], how="left")
    .join(user_category2_stats, on=["user_id", "category_level2"], how="left")
)

# Fill missing statistics with the constants from FILL_VALUES (only for the
# columns actually present). publish_time has no entry there, so it is filled
# with the median of this frame instead.
test_data = test_data.with_columns(
    *[
        pl.col(name).fill_null(default)
        for name, default in FILL_VALUES.items()
        if name in test_data.columns
    ],
    pl.col("publish_time").fill_null(pl.col("publish_time").median()),
)

# Derive the exposure-time features, zero out the position-bias feature and
# build the conversion label; publish_time is only needed for the hour gap.
test_data = (
    test_data
    .with_columns(
        # Whole hours between exposure and publication (3_600_000 ms = 1 h).
        expose_hourdiff=(
            (pl.col("expose_time") - pl.col("publish_time")).abs() // 3_600_000
        ).cast(pl.Int32),
        # Bias feature is set to 0 for every row.
        expose_pos=pl.lit(0),
        # Hour of day of the millisecond exposure timestamp.
        expose_hour=pl.col("expose_time").cast(pl.Datetime("ms")).dt.hour(),
        # 1 when the row's duration reaches the document's historical mean.
        is_converted=(
            pl.col("duration") >= pl.col("docid_history_duration_mean")
        ).cast(pl.Int8),
    )
    .drop(["publish_time"])
)
test_data.write_csv("/data3/zxh/news_rec/rank_csv_data/test_data/test_data.csv",separator="\t")

In [23]:
# Count how many test rows share each (user_id, expose_time) pair.
grouped_data = test_data.group_by("user_id", "expose_time").agg(count=pl.len())

# Keep only the pairs that occur at least 5 times.
filtered_data = grouped_data.filter(pl.col("count").ge(5))

# Show the result.
filtered_data

user_id,expose_time,count
i64,i64,u32
1916651224,1625561093086,6
2427306838,1625542992036,5
1309071476,1625535903589,6
2446574736,1625548218019,5
1480494278,1625579197668,5
…,…,…
2387357936,1625558454144,5
2224969484,1625558495200,5
399069104,1625563677018,6
2342168890,1625555719967,5


In [40]:
# Restrict test_data to the (user_id, expose_time) pairs kept in filtered_data,
# order the rows by user and exposure time, then shape the frame like the
# model input (per the original note): expose_time is removed, null strings
# become "", and duration is reset to 0.0.
matched_data = (
    test_data
    .join(filtered_data, on=["user_id", "expose_time"], how="inner")
    .drop("count")
    .sort(["user_id", "expose_time"])
    .drop("expose_time")
    .fill_null("")
    .with_columns(duration=pl.lit(0.0))
)

# NOTE(review): this is the same path the unfiltered test frame was written to
# earlier in the notebook, so that file is overwritten here.
matched_data.write_csv("/data3/zxh/news_rec/rank_csv_data/test_data/test_data.csv",separator="\t")

In [41]:
# Display the filtered test frame for inspection.
matched_data

user_id,article_id,network_env,refresh_count,expose_pos,is_clicked,duration,device_name,os,province,city,age,gender,userid_history_duration_mean,userid_history_count,userid_expose_count,userid_ctr,keywords,image_count,category_level1,category_level2,docid_history_count,docid_expose_count,docid_ctr,docid_history_duration_mean,category1_ctr,category1_popularity,category1_history_duration_mean,category2_ctr,category2_popularity,category2_history_duration_mean,userid_category1_history_count,userid_category1_ctr,userid_category1_history_duration_mean,userid_category2_history_count,userid_category2_ctr,userid_category2_history_duration_mean,expose_hourdiff,expose_hour,is_converted
i64,i64,i64,f64,i32,i64,f64,str,str,str,str,str,str,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,i32,i8,i8
17340,466399808,5,0.26756,0,0,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.290691,0.873913,0.739067,0.284605,"""ev:0.028968,mini:0.034496,mini…","""4""","""汽车""","""汽车/汽车资讯""",0.319737,0.353534,0.006392,0.283393,0.117285,0.018259,0.307542,0.109067,0.005608,0.314201,"""3""",0.414634,0.28521,"""3""",0.391304,0.295621,0,11,0
17340,466126343,5,0.26756,0,1,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.290691,0.873913,0.739067,0.284605,"""世界人口:0.027234,中国^^居民^^膳食^^指南:0…","""4""","""健康""","""健康/养生与保健""",0.319737,0.353534,0.006392,0.283393,0.1361,0.075243,0.3257,0.145456,0.026401,0.317586,"""3""",0.580645,0.280167,"""2""",0.428571,0.215489,0,11,1
17340,466368404,5,0.26756,0,1,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.290691,0.873913,0.739067,0.284605,"""关注点:0.030264,凤凰男:0.041433,出国^^…","""4""","""娱乐""","""娱乐/港台明星""",0.319737,0.353534,0.006392,0.283393,0.1357,0.101556,0.342276,0.137973,0.023426,0.344025,"""3""",0.321429,0.241324,"""2""",0.25,0.210026,0,11,0
17340,466090289,5,0.26756,0,0,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.290691,0.873913,0.739067,0.284605,"""卧室:0.071985,喜欢:0.054731,嘉兴:0.0…","""4""","""生活""","""生活/小窍门""",0.948924,0.8323,0.187393,0.279875,0.120128,0.010926,0.324877,0.103625,0.002201,0.350269,"""3""",0.190476,0.409168,"""""",0.333333,0.339603,0,11,0
17340,466699030,5,0.26756,0,0,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.290691,0.873913,0.739067,0.284605,"""佛说:0.027100,修行:0.022454,南阎浮提^^…","""3""","""文化艺术""","""文化艺术/哲学宗教与民俗神话""",0.319737,0.353534,0.006392,0.283393,0.132088,0.04894,0.365176,0.127007,0.017736,0.346974,"""3""",0.38,0.296914,"""3""",0.342105,0.289733,0,11,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2447273764,466826642,2,0.138112,0,0,0.0,"""""","""""","""""","""""","""""","""""",0.347217,0.331481,0.348857,0.036223,"""今日俄罗斯:0.026272,今日俄罗斯^^通讯社:0.01…","""1""","""国际""","""国际/国际事件""",0.319737,0.353534,0.006392,0.283393,0.163092,0.04981,0.341409,0.135408,0.015545,0.334333,"""""",0.25,0.34075,"""""",0.333333,0.339603,0,15,0
2447273764,466823351,2,0.138112,0,0,0.0,"""""","""""","""""","""""","""""","""""",0.347217,0.331481,0.348857,0.036223,"""ig:0.057184,健康:0.059997,家里:0.0…","""2""","""娱乐""","""娱乐/港台明星""",0.319737,0.353534,0.006392,0.283393,0.1357,0.101556,0.342276,0.137973,0.023426,0.344025,"""""",0.25,0.34075,"""""",0.333333,0.339603,0,15,0
2447273764,466551898,2,0.138112,0,0,0.0,"""""","""""","""""","""""","""""","""""",0.347217,0.331481,0.348857,0.036223,"""中共中央:0.030800,中共中央对外联络部:0.0328…","""1""","""国内""","""国内/高层动态""",0.319737,0.353534,0.006392,0.283393,0.12763,0.03891,0.311547,0.077917,0.002653,0.314663,"""""",0.25,0.34075,"""""",0.333333,0.339603,0,15,0
2447273764,466254142,2,0.138112,0,0,0.0,"""""","""""","""""","""""","""""","""""",0.347217,0.331481,0.348857,0.036223,"""90后:0.029803,papi:0.033749,东方不…","""4""","""情感""","""情感/婚姻与家庭""",0.319737,0.353534,0.006392,0.283393,0.157579,0.108771,0.363147,0.15524,0.082245,0.378103,"""""",0.25,0.34075,"""""",0.333333,0.339603,0,15,0


### 4. 数据保存

#### 4.1 保存为csv数据

In [22]:
# Rows written to each CSV shard.
rows_per_chunk = 1_000_000

# Number of rows to export.
total_rows = val_data.height

# Ceiling division: a trailing partial shard still needs its own file.
num_chunks = -(-total_rows // rows_per_chunk)

# Root output directory.
output_path = "/data3/zxh/news_rec/rank_csv_data"

# Write val_data shard by shard as tab-separated CSV.
for i in range(num_chunks):
    start = rows_per_chunk * i
    end = min(start + rows_per_chunk, total_rows)
    file_path = f"{output_path}/val_data/val_csv_{i+1:05d}-of-{num_chunks:05d}.csv"

    # Null strings are written as empty fields.
    chunk = val_data.slice(start, end - start).fill_null("")
    chunk.write_csv(file_path, separator="\t")

    print(f"Saved chunk {i+1} to {file_path} ({chunk.height} rows)")

Saved chunk 1 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00001-of-00016.csv (1000000 rows)
Saved chunk 2 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00002-of-00016.csv (1000000 rows)
Saved chunk 3 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00003-of-00016.csv (1000000 rows)
Saved chunk 4 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00004-of-00016.csv (1000000 rows)
Saved chunk 5 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00005-of-00016.csv (1000000 rows)
Saved chunk 6 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00006-of-00016.csv (1000000 rows)
Saved chunk 7 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00007-of-00016.csv (1000000 rows)
Saved chunk 8 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00008-of-00016.csv (1000000 rows)
Saved chunk 9 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00009-of-00016.csv (1000000 rows)
Saved chunk 10 to /data3/zxh/news_rec/rank_csv_data/val_data/val_csv_00010-of-0001

#### 4.2 转换为TFRecord

In [22]:
# def df_to_tfrecord(chunk: pl.DataFrame, file_path: str):
#     """将 Polars DataFrame 分块写入 TFRecord 文件"""
#     def create_example(row: dict):
#         features = {}
#         for col, dtype in chunk.schema.items():
#             value = row[col]
            
#             # 处理空值（根据特征类型设置默认值）
#             if value is None: # value为0是之前设置好的空缺值
#                 if dtype == pl.String:
#                     value = ""
#                 elif dtype in (pl.Int64, pl.Float64):
#                     value = 0
#                 else:
#                     value = dtype.default()
            
#             # 类型转换
#             if dtype == pl.String:
#                 features[col] = tf.train.Feature(
#                     bytes_list=tf.train.BytesList(value=[value.encode()]))
#             elif dtype == pl.Int64:
#                 features[col] = tf.train.Feature(
#                     int64_list=tf.train.Int64List(value=[value]))
#             elif dtype == pl.Float64:
#                 features[col] = tf.train.Feature(
#                     float_list=tf.train.FloatList(value=[value]))
#             else:
#                 raise ValueError(f"Unsupported dtype: {dtype}")

#         return tf.train.Example(features=tf.train.Features(feature=features))

#     # 流式写入（内存优化）
#     with tf.io.TFRecordWriter(file_path) as writer:
#         for row_dict in chunk.iter_rows(named=True):
#             example = create_example(row_dict)
#             writer.write(example.SerializeToString())

In [23]:
# from path import Path
# from concurrent.futures import ThreadPoolExecutor

# def split_to_tfrecords(
#     df: pl.DataFrame,
#     output_dir: str,
#     pattern : str,
#     chunk_size: int = 400_000
# ):
#     """分块保存为 TFRecord 文件"""
#     # 分块处理（内存安全）
#     total_rows = df.height
#     num_chunks = (total_rows // chunk_size) + (1 if total_rows % chunk_size != 0 else 0)
    
#     for i in tqdm(range(num_chunks)):
#         start = i * chunk_size
#         end = min((i + 1) * chunk_size, total_rows)
        
#         # 分块处理（使用指针避免内存复制）
#         chunk = df.slice(start, end - start)
        
#         # 写入文件
#         file_path = f"{output_dir}/{pattern}_tfrecord-{i:05d}-of-{num_chunks:05d}.tfrecord"
#         df_to_tfrecord(chunk, file_path)

In [24]:
# pattern = "train"
# output_dir = f"/data3/zxh/news_rec/TFRecords/{pattern}_data"
# split_to_tfrecords(df=train_data, output_dir=output_dir, pattern=pattern)