### 将所有属性整合起来变成一条条的训练数据

### 1. 导入包

In [1]:
import pickle
import polars as pl
from tqdm import tqdm

In [2]:
# Directory configuration.
# `offline` picks which split-specific directory the statistics and samples
# are read from; `public_path` holds the files shared by both modes.
offline = False
mode = "online"
if offline:
    mode = "offline"
data_path = "/data3/zxh/news_rec/" + mode + "_data"
public_path = "/data3/zxh/news_rec/public_data"

### 2. 处理 item 数据

#### 2.1 keywords数据

In [3]:
# Load the per-article keyword feature table from the shared directory and
# show summary statistics as a sanity check.
doc_keywords_file = f"{public_path}/doc_keywords_feature.ipc"
doc_keywords = pl.read_ipc(doc_keywords_file)
doc_keywords.describe()

statistic,article_id,keywords
str,f64,str
"""count""",622407.0,"""622407"""
"""null_count""",0.0,"""0"""
"""mean""",464630000.0,
"""std""",1565300.0,
"""min""",334493096.0,"""#^^0000ff:0.032714,0000ff:0.03…"
"""25%""",463450810.0,
"""50%""",464632474.0,
"""75%""",465821561.0,
"""max""",467278131.0,"""龙泉:1.000000"""


#### 2.2 处理 item 的 sparse 特征

In [4]:
# Load the per-article sparse (categorical) feature table from the shared
# directory and show summary statistics as a sanity check.
doc_sparse_file = f"{public_path}/doc_sparse_feature.ipc"
doc_sparse_feature = pl.read_ipc(doc_sparse_file)
doc_sparse_feature.describe()

statistic,article_id,publish_time,image_count,category_level1,category_level2
str,f64,f64,str,str,str
"""count""",633388.0,633146.0,"""633388""","""633388""","""633388"""
"""null_count""",0.0,242.0,"""0""","""0""","""0"""
"""mean""",464620000.0,1625000000000.0,,,
"""std""",1576900.0,448080000.0,,,
"""min""",325279629.0,1563400000000.0,"""""","""""",""""""
"""25%""",463448490.0,1624700000000.0,,,
"""50%""",464618782.0,1625000000000.0,,,
"""75%""",465814182.0,1625300000000.0,,,
"""max""",467278131.0,1625700000000.0,"""4""","""颜值才艺""","""颜值才艺/男神"""


#### 2.3 处理 item 的 category1 信息

In [5]:
# Load the level-1 category statistics for the selected mode; the raw
# history count column is not carried into the merged features.
category1_stats_file = f"{data_path}/doc_category1_stats_{mode}.ipc"
doc_category1_stats = pl.read_ipc(category1_stats_file)
doc_category1_stats = doc_category1_stats.drop(["category1_history_count"])
doc_category1_stats.describe()

statistic,category_level1,category1_ctr,category1_popularity,category1_history_duration_mean
str,str,f64,f64,f64
"""count""","""39""",39.0,39.0,39.0
"""null_count""","""0""",0.0,0.0,0.0
"""mean""",,0.130206,0.025641,0.320087
"""std""",,0.030696,0.029621,0.041547
"""min""","""""",0.005495,4.0111e-08,0.176037
"""25%""",,0.115168,0.002049,0.314507
"""50%""",,0.12907,0.015897,0.325458
"""75%""",,0.144295,0.037759,0.34304
"""max""","""颜值才艺""",0.203067,0.10927,0.39428


#### 2.4 处理 item 的 category2 信息

In [6]:
# Load the level-2 category statistics for the selected mode; the raw
# history count column is not carried into the merged features.
category2_stats_file = f"{data_path}/doc_category2_stats_{mode}.ipc"
doc_category2_stats = pl.read_ipc(category2_stats_file)
doc_category2_stats = doc_category2_stats.drop(["category2_history_count"])
doc_category2_stats

category_level2,category2_ctr,category2_popularity,category2_history_duration_mean
str,f64,f64,f64
"""情感/婚姻与家庭""",0.155137,0.082589,0.378579
"""娱乐/内地明星""",0.135733,0.077224,0.343158
"""军事/军事新闻""",0.171002,0.040722,0.343749
"""健康/疾病防护治疗及西医用药""",0.12753,0.038891,0.329706
"""星座运势/风水与算命""",0.200625,0.036517,0.251325
…,…,…,…
"""汽车/其他""",0.0,0.0,
"""颜值才艺/男神""",0.0,0.0,
"""音乐/日本音乐""",0.0,0.0,
"""搞笑/小品""",0.0,0.0,


#### 2.5 处理doc_ctr

In [7]:
# Load per-article CTR statistics and expose the selected variants under the
# short feature names used downstream. Each source column is copied to its
# new name and the source columns are then dropped.
doc_ctr_aliases = {
    "docid_wilson_ctr": "docid_ctr",
    "docid_expose_count_transformed_box": "docid_expose_count",
    "docid_history_count_transformed_box": "docid_history_count",
}
doc_ctr = pl.read_ipc(f"{data_path}/doc_ctr_{mode}.ipc")
doc_ctr = doc_ctr.with_columns(
    [pl.col(source).alias(target) for source, target in doc_ctr_aliases.items()]
).drop(list(doc_ctr_aliases))
doc_ctr.describe()

statistic,article_id,docid_history_count,docid_expose_count,docid_ctr
str,f64,f64,f64,f64
"""count""",553023.0,553023.0,553023.0,553023.0
"""null_count""",0.0,0.0,0.0,0.0
"""mean""",464300000.0,0.321856,0.368424,0.042885
"""std""",1422000.0,0.350268,0.271691,0.065649
"""min""",325279629.0,0.0,0.0,-0.000457
"""25%""",463295270.0,0.0,0.102186,0.0
"""50%""",464338151.0,0.317807,0.351112,0.006568
"""75%""",465339714.0,0.628504,0.590466,0.065915
"""max""",466561732.0,1.0,1.0,0.640239


#### 2.6 处理doc_duration_mean

In [8]:
# Load the per-article mean history duration for the selected mode.
doc_duration_file = f"{data_path}/doc_duration_mean_{mode}.ipc"
doc_duration_mean = pl.read_ipc(doc_duration_file)
doc_duration_mean

article_id,docid_history_duration_mean
i64,f64
465021750,0.43625
465652506,0.398314
465959628,0.268172
464457188,0.204795
463830761,0.300886
…,…
465852922,0.404614
465514869,0.18346
464219800,0.271533
463246528,0.224807


#### 2.7 合并为 doc_columns

In [9]:
def _full_join_on_article_id(left: pl.DataFrame, right: pl.DataFrame) -> pl.DataFrame:
    """Full-join two article tables and keep a single `article_id` column.

    A full join keeps the key of each side separately (`article_id` and
    `article_id_right`); whichever is non-null is coalesced back into
    `article_id` and the temporary right-hand key is dropped.
    """
    return (
        left.join(right, on="article_id", how="full")
        .with_columns(
            pl.coalesce(["article_id", "article_id_right"]).alias("article_id")
        )
        .drop("article_id_right")
    )


# Union of every article seen in any per-article table: an article missing
# from one table keeps nulls in that table's columns.
doc_columns = _full_join_on_article_id(doc_keywords, doc_sparse_feature)
doc_columns = _full_join_on_article_id(doc_columns, doc_ctr)
doc_columns = _full_join_on_article_id(doc_columns, doc_duration_mean)

doc_columns = (
    doc_columns
    # Fill missing categories only after ALL full joins, so that rows
    # introduced by any of them get the "" category and can match the ""
    # row of the category statistics (null join keys never match).
    .with_columns(
        pl.col("category_level1").fill_null(""),
        pl.col("category_level2").fill_null(""),
    )
    .join(doc_category1_stats, on="category_level1", how="left")
    .join(doc_category2_stats, on="category_level2", how="left")
)

doc_columns.describe()

statistic,article_id,keywords,publish_time,image_count,category_level1,category_level2,docid_history_count,docid_expose_count,docid_ctr,docid_history_duration_mean,category1_ctr,category1_popularity,category1_history_duration_mean,category2_ctr,category2_popularity,category2_history_duration_mean
str,f64,str,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",633391.0,"""622407""",633146.0,"""633388""","""633391""","""633391""",553023.0,553023.0,553023.0,286647.0,633391.0,633391.0,633391.0,633387.0,633387.0,633333.0
"""null_count""",0.0,"""10984""",245.0,"""3""","""0""","""0""",80368.0,80368.0,80368.0,346744.0,0.0,0.0,0.0,4.0,4.0,58.0
"""mean""",464620000.0,,1625000000000.0,,,,0.321856,0.368424,0.042885,0.276912,0.137945,0.044078,0.324543,0.134831,0.015968,0.319579
"""std""",1576900.0,,448080000.0,,,,0.350268,0.271691,0.065649,0.092061,0.022045,0.033057,0.035649,0.032362,0.020527,0.038677
"""min""",325279629.0,"""#^^0000ff:0.032714,0000ff:0.03…",1563400000000.0,"""""","""""","""""",0.0,0.0,-0.000457,0.029607,0.005495,4.0111e-08,0.176037,0.0,0.0,0.064567
"""25%""",463448492.0,,1624700000000.0,,,,0.0,0.102186,0.0,0.216348,0.124069,0.018194,0.314507,0.11821,0.003911,0.30392
"""50%""",464618782.0,,1625000000000.0,,,,0.317807,0.351112,0.006568,0.283309,0.132281,0.036681,0.329983,0.128979,0.007396,0.32554
"""75%""",465814182.0,,1625300000000.0,,,,0.628504,0.590466,0.065915,0.338787,0.144295,0.072972,0.34304,0.148184,0.023876,0.341117
"""max""",467278131.0,"""龙泉:1.000000""",1625700000000.0,"""4""","""颜值才艺""","""颜值才艺/男神""",1.0,1.0,0.640239,0.959917,0.203067,0.10927,0.39428,0.4,0.082589,0.43561


In [10]:
# Preview the merged per-article feature table.
doc_columns

article_id,keywords,publish_time,image_count,category_level1,category_level2,docid_history_count,docid_expose_count,docid_ctr,docid_history_duration_mean,category1_ctr,category1_popularity,category1_history_duration_mean,category2_ctr,category2_popularity,category2_history_duration_mean
i64,str,i64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
349635709,"""上班族:0.052498,买车:0.050440,二手车:0…",1572519971000,"""4""","""汽车""","""汽车/用车""",0.0,0.0,0.0,,0.113181,0.018194,0.306948,0.114335,0.005002,0.325907
361653323,"""医生:0.133734,吸烟:0.149266,板蓝根:0.…",1624522285000,"""1""","""健康""","""健康/疾病防护治疗及西医用药""",0.0,0.541653,0.0,,0.135202,0.07411,0.325458,0.12753,0.038891,0.329706
426732705,"""155n:0.033340,polo:0.029521,中控…",1610808303000,"""4""","""汽车""","""汽车/买车""",0.0,0.0,0.0,,0.113181,0.018194,0.306948,0.126563,0.007396,0.288905
430221183,"""etc:0.038040,代表:0.028015,内饰:0.…",1612581556000,"""2""","""汽车""","""汽车/买车""",0.0,0.0,0.0,,0.113181,0.018194,0.306948,0.126563,0.007396,0.288905
441756326,"""丰田凯美瑞:0.089051,充电器:0.058525,品牌…",1618825835000,"""4""","""汽车""","""汽车/买车""",0.0,0.0,0.0,,0.113181,0.018194,0.306948,0.126563,0.007396,0.288905
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
467278115,"""cj:0.048329,三巨头:0.027688,争冠:0.…",1625672111000,"""4""","""体育""","""体育/NBA""",,,,,0.172512,0.086567,0.329983,0.201662,0.024101,0.328555
467278124,"""t恤:0.031757,主理人:0.025905,优雅:0.…",1625672116000,"""4""","""时尚""","""时尚/明星时尚""",,,,,0.139704,0.028657,0.195537,0.09578,0.006248,0.223837
463642111,,,,"""""","""""",0.589004,0.50169,0.064033,0.200082,0.102677,0.000023,0.320475,0.102659,0.000023,0.320475
465493907,,,,"""""","""""",0.0,0.254568,-3.1392e-17,,0.102677,0.000023,0.320475,0.102659,0.000023,0.320475


### 3. 处理 user 数据

#### 3.1 处理 user_sparse

In [11]:
# Load the per-user sparse (categorical) feature table from the shared directory.
user_sparse_file = f"{public_path}/user_sparse_feature.ipc"
user_sparse_feature = pl.read_ipc(user_sparse_file)

#### 3.2 处理 user_duration_mean

In [12]:
# Load the per-user mean history duration for the selected mode.
user_duration_file = f"{data_path}/user_duration_mean_{mode}.ipc"
user_duration_mean = pl.read_ipc(user_duration_file)
user_duration_mean

user_id,userid_history_duration_mean
i64,f64
236837432,0.157165
2300599814,0.305542
2440130324,0.382317
2411960594,0.189772
2438249066,0.371854
…,…
89202638,0.260039
2446103892,0.427664
2364653974,0.269753
2392244202,0.332938


#### 3.3 处理 user_ctr

In [13]:
# Load per-user CTR statistics and expose the selected variants under the
# short feature names used downstream. Each source column is copied to its
# new name and the source columns are then dropped.
user_ctr_aliases = {
    "userid_wilson_ctr": "userid_ctr",
    "userid_expose_count_transformed_box": "userid_expose_count",
    "userid_history_count_transformed_box": "userid_history_count",
}
user_ctr = pl.read_ipc(f"{data_path}/user_ctr_{mode}.ipc")
user_ctr = user_ctr.with_columns(
    [pl.col(source).alias(target) for source, target in user_ctr_aliases.items()]
).drop(list(user_ctr_aliases))
user_ctr.describe()

statistic,user_id,userid_history_count,userid_expose_count,userid_ctr
str,f64,f64,f64,f64
"""count""",1424464.0,1424464.0,1424464.0,1424464.0
"""null_count""",0.0,0.0,0.0,0.0
"""mean""",1979700000.0,0.34189,0.354483,0.07321
"""std""",479900000.0,0.315219,0.264563,0.096586
"""min""",17340.0,0.0,0.0,-3.1392e-17
"""25%""",1564200000.0,0.0,0.081458,0.0
"""50%""",2215300000.0,0.339728,0.346099,0.033975
"""75%""",2410200000.0,0.630756,0.583428,0.114205
"""max""",2447200000.0,1.0,1.0,0.876763


In [14]:
# Preview the per-user CTR feature table.
user_ctr

user_id,userid_history_count,userid_expose_count,userid_ctr
i64,f64,f64,f64
1463134918,0.339728,0.136067,0.207655
2342925520,0.0,0.0,0.0
1497405766,0.409151,0.257565,0.158217
1639862502,0.0,0.0,0.0
2442848920,0.339728,0.176692,0.150036
…,…,…,…
2216221172,0.849348,0.814235,0.074514
2443589648,0.229235,0.081458,0.094529
1299566312,0.751566,0.579429,0.291845
2389767448,0.630756,0.429679,0.314271


#### 3.4 合并为 训练数据

In [15]:
# 训练集上得到的数据，所有缺失值的填充要按照这个字典来
FILL_VALUES = {
    'docid_history_count': 0.3197370642850515,
    'docid_expose_count': 0.35353441318778134,
    'docid_ctr': 0.006392260064240436,
    'docid_history_duration_mean': 0.28339285217401183,
    'category1_ctr': 0.13570003591442992,
    'category1_popularity': 0.036799390379834,
    'category1_history_duration_mean': 0.32981757714259613,
    'category2_ctr': 0.13293362050565613,
    'category2_popularity': 0.007456100573611463,
    'category2_history_duration_mean': 0.32639158456815426,
    'userid_category1_ctr': 0.25,
    'userid_category1_history_duration_mean': 0.34075001249075343,
    'userid_category2_ctr': 0.3333333333333333,
    'userid_category2_history_duration_mean': 0.3396034216144695,
    'userid_history_duration_mean': 0.3472170022795658,
    'userid_history_count': 0.33148076049294156,
    'userid_expose_count': 0.3488569631166438,
    'userid_ctr': 0.036223160969787456
 }

In [16]:
# Build the training samples: one row per exposure with user and article
# features attached.
train_data = pl.read_ipc(f"{data_path}/train_data_{mode}.ipc")

# Recall stage: the label and other post-exposure signals are dropped here.
# (To keep positives only, filter on `is_clicked == 1` before this drop;
# negatives would then come from random sampling.)
train_data = train_data.with_columns(
    pl.col("refresh_count_transformed_box").alias("refresh_count")
).drop(["duration_transformed_box", "refresh_count_transformed_box", "expose_pos", "duration", "is_clicked"])

# Left joins keep every exposure row even when a user or article has no
# entry in a feature table; the resulting nulls are filled below.
train_data = (
    train_data
    .join(user_sparse_feature, on="user_id", how="left")
    .join(user_duration_mean, on="user_id", how="left")
    .join(user_ctr, on="user_id", how="left")
    .join(doc_columns, on="article_id", how="left")
)

# Fill missing statistical features with the fixed training-set values.
# `publish_time` is not filled: it is dropped below and never used, so
# computing its median over the whole frame was wasted work.
train_data = train_data.with_columns([
    pl.col(col).fill_null(value) for col, value in FILL_VALUES.items() if col in train_data.columns
])

train_data = train_data.sort(["user_id", "expose_time", "article_id"])

# Derive the hour of day (0-23) of the exposure from the epoch-millisecond
# timestamp, then drop the raw time columns.
# NOTE(review): no time zone is applied in the cast, so this appears to be
# the hour of the raw epoch value; confirm whether local time is intended.
train_data = train_data.with_columns(
    pl.col("expose_time").cast(pl.Datetime("ms")).dt.hour().alias("expose_hour")
).drop(["expose_time", "publish_time"])
train_data

user_id,article_id,network_env,refresh_count,device_name,os,province,city,age,gender,userid_history_duration_mean,userid_history_count,userid_expose_count,userid_ctr,keywords,image_count,category_level1,category_level2,docid_history_count,docid_expose_count,docid_ctr,docid_history_duration_mean,category1_ctr,category1_popularity,category1_history_duration_mean,category2_ctr,category2_popularity,category2_history_duration_mean,expose_hour
i64,i64,i64,f64,str,str,str,str,str,str,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8
17340,462077126,2,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.292731,0.883892,0.749994,0.293541,"""acg:0.021879,konami:0.027251,s…","""4""","""动漫""","""动漫/日韩动漫""",0.946722,0.863891,0.094437,0.316013,0.109581,0.001435,0.301214,0.107453,0.000563,0.293774,12
17340,462243474,2,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.292731,0.883892,0.749994,0.293541,"""女朋友:0.049049,孙一宁:0.070881,孙悟空:…","""4""","""搞笑""","""搞笑/段子""",0.974402,0.913843,0.105119,0.304359,0.16268,0.023724,0.330949,0.16556,0.017759,0.336461,12
17340,462317087,2,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.292731,0.883892,0.749994,0.293541,"""app:0.024177,买房:0.025664,二手房:0…","""3""","""房产""","""房产/买房卖房""",0.970511,0.900335,0.118149,0.360532,0.12221,0.012471,0.338778,0.105947,0.004063,0.341279,12
17340,462821612,2,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.292731,0.883892,0.749994,0.293541,"""中老年:0.022587,人生:0.022596,体重:0.…","""4""","""搞笑""","""搞笑/囧事""",0.990512,0.950894,0.14834,0.283659,0.16268,0.023724,0.330949,0.154487,0.00543,0.312192,12
17340,462907578,2,0.0,"""iPhoneX""","""IOS""","""上海""","""上海""","""A_0_24:0.029774,A_25_29:0.1268…","""female:0.0,male:1.0""",0.292731,0.883892,0.749994,0.293541,"""tk:0.105316,wifi:0.035106,中附上:…","""3""","""科学""","""科学/天文与航天""",0.986152,0.936847,0.140306,0.36779,0.113797,0.030434,0.34835,0.107894,0.015224,0.344713,12
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2447175246,466191290,5,0.0,"""""","""""","""""","""""","""""","""""",0.347217,0.0,0.0,0.0,"""中华人民共和国:0.155598,公告:0.181968,国…","""1""","""科技""","""科技/互联网""",0.984525,0.99266,0.01037,0.158308,0.131786,0.022491,0.315408,0.093028,0.004623,0.31744,23
2447175252,465799811,5,0.138112,"""""","""""","""""","""""","""""","""""",0.347217,0.0,0.0,0.0,"""乡村:0.027052,低保户:0.025733,儿童:0.…","""4""","""农村""","""农村/农业资讯""",0.989417,0.952125,0.121711,0.33293,0.123118,0.015897,0.349712,0.125165,0.013018,0.342013,23
2447175308,466191290,5,0.0,"""""","""""","""""","""""","""""","""""",0.347217,0.0,0.081458,0.0,"""中华人民共和国:0.155598,公告:0.181968,国…","""1""","""科技""","""科技/互联网""",0.984525,0.99266,0.01037,0.158308,0.131786,0.022491,0.315408,0.093028,0.004623,0.31744,23
2447175308,466319737,5,0.0,"""""","""""","""""","""""","""""","""""",0.347217,0.0,0.081458,0.0,"""app:0.031235,ceo:0.029697,ceo^…","""3""","""汽车""","""汽车/汽车资讯""",0.941867,0.946665,0.008819,0.187705,0.113181,0.018194,0.306948,0.098967,0.005796,0.313608,23


#### 3.5 合并为 测试数据

In [17]:
# Build the test samples: one row per exposure with user and article
# features attached. Unlike the training frame, the label (`is_clicked`) and
# the raw time columns are kept.
# NOTE(review): this path is fixed to the online split and ignores `mode`.
test_data = pl.read_ipc("/data3/zxh/news_rec/online_data/test_data_online.ipc")

# Recall stage note: positives could be selected with `is_clicked == 1`
# (negatives would then come from random sampling); no filter is applied here.
unused_test_columns = ["duration_transformed_box", "refresh_count_transformed_box", "expose_pos", "duration"]
test_data = test_data.with_columns(
    pl.col("refresh_count_transformed_box").alias("refresh_count")
).drop(unused_test_columns)

# Left joins keep every exposure row; missing features become nulls.
for user_table in (user_sparse_feature, user_duration_mean, user_ctr):
    test_data = test_data.join(user_table, on="user_id", how="left")
test_data = test_data.join(doc_columns, on="article_id", how="left")

# Fill missing statistical features with the fixed training-set values, and
# fill missing publish_time with the median of this frame.
# NOTE(review): that median is computed on the test frame itself, not taken
# from FILL_VALUES; confirm this is intended.
stat_fills = [
    pl.col(col).fill_null(value)
    for col, value in FILL_VALUES.items()
    if col in test_data.columns
]
publish_time_fill = pl.col("publish_time").fill_null(pl.col("publish_time").median())
test_data = test_data.with_columns(stat_fills + [publish_time_fill])

# Derive the hour of day (0-23) of the exposure from the epoch-millisecond timestamp.
expose_hour = pl.col("expose_time").cast(pl.Datetime("ms")).dt.hour().alias("expose_hour")
test_data = test_data.with_columns(expose_hour)

# Persist the distinct (user, article, hour, label) tuples as a tab-separated file.
test_labels = test_data.select(["user_id", "article_id", "expose_hour", "is_clicked"]).unique()
test_labels.write_csv("/data3/zxh/news_rec/recall_csv_data/test_data/test_data.csv", separator="\t")

In [18]:
# Preview the assembled test frame.
test_data

user_id,article_id,expose_time,network_env,refresh_count,is_clicked,device_name,os,province,city,age,gender,userid_history_duration_mean,userid_history_count,userid_expose_count,userid_ctr,keywords,publish_time,image_count,category_level1,category_level2,docid_history_count,docid_expose_count,docid_ctr,docid_history_duration_mean,category1_ctr,category1_popularity,category1_history_duration_mean,category2_ctr,category2_popularity,category2_history_duration_mean,expose_hour
i64,i64,i64,i64,f64,i64,str,str,str,str,str,str,f64,f64,f64,f64,str,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8
2431381002,466497559,1625560435365,2,0.366121,0,"""M2007J22C""","""Android""","""江西""","""九江""","""A_0_24:0.084074,A_25_29:0.0495…","""female:0.276102,male:0.723898""",0.316812,0.8509,0.774979,0.127071,"""人生:0.035545,劈腿:0.036345,包间:0.0…",1.6255e12,"""3""","""情感""","""情感/婚姻与家庭""",0.683315,0.605048,0.059709,0.384525,0.157467,0.10927,0.36357,0.155137,0.082589,0.378579,8
2390152616,466838383,1625584316582,2,0.829975,0,"""V1965A""","""Android""","""辽宁""","""葫芦岛""","""A_0_24:0.066269,A_25_29:0.0717…","""female:0.273217,male:0.726783""",0.238623,0.906467,0.878375,0.076488,"""4s店:0.023849,事故:0.029743,二手车:0…",1.6256e12,"""4""","""汽车""","""汽车/用车""",0.319737,0.353534,0.006392,0.283393,0.113181,0.018194,0.306948,0.114335,0.005002,0.325907,15
2443013308,466596360,1625552213014,2,0.138112,0,"""LIO-AN00""","""Android""","""广东""","""东莞""","""A_0_24:0.09405,A_25_29:0.17353…","""female:0.629811,male:0.370189""",0.268177,0.912701,0.796113,0.280978,"""低胸:0.032807,低胸^^黑裙:0.026965,发型…",1.6255e12,"""4""","""时尚""","""时尚/女性时尚""",0.319737,0.353534,0.006392,0.283393,0.139704,0.028657,0.195537,0.11821,0.010193,0.232941,6
1293444900,466550480,1625547205088,5,0.21485,0,"""MHA-AL00""","""Android""","""海南""","""三亚""","""A_0_24:0.020534,A_25_29:0.1824…","""female:0.0,male:1.0""",0.237377,0.914906,0.811186,0.240239,"""naval:0.056106,naval^^news:0.0…",1.6255e12,"""4""","""军事""","""军事/武器""",0.734693,0.525696,0.192303,0.479603,0.166156,0.056497,0.348445,0.160845,0.011973,0.347263,4
2446511758,465769818,1625546678713,2,0.26756,1,"""""","""""","""""","""""","""A_0_24:0.282267,A_25_29:0.2958…","""female:0.561326,male:0.438674""",0.490715,0.806152,0.667961,0.23322,"""亲情:0.018269,储君:0.027646,刘应:0.0…",1.6253e12,"""4""","""历史""","""历史/中国史""",0.922125,0.847849,0.063148,0.382596,0.12907,0.036681,0.39428,0.125564,0.030836,0.400018,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2445496462,466025645,1625530203987,2,0.21485,0,"""Mi9Pro5G""","""Android""","""广东""","""广东""","""A_0_24:0.203706,A_25_29:0.2327…","""female:0.0,male:1.0""",0.225388,0.618423,0.586005,0.072089,"""人力^^资源:0.023107,人力^^资源委员会:0.02…",1.6254e12,"""3""","""财经""","""财经/财经人物""",0.95305,0.861849,0.124244,0.35658,0.128966,0.024375,0.334888,0.135725,0.004793,0.352719,0
1349872856,466714829,1625567748252,2,0.64565,0,"""PACM00""","""Android""","""山东""","""济宁""","""A_0_24:0.088099,A_25_29:0.5715…","""female:0.0,male:1.0""",0.355585,0.458394,0.706604,0.005433,"""乳汁:0.028392,二手烟:0.023179,产后:0.…",1.6255e12,"""4""","""育儿""","""育儿/孕产健康护理""",0.319737,0.353534,0.006392,0.283393,0.127939,0.018939,0.320226,0.127238,0.007923,0.328841,10
2445791558,466446910,1625540321832,5,0.138112,0,"""V1831A""","""Android""","""浙江""","""杭州""","""A_0_24:0.508554,A_25_29:0.2355…","""female:0.252032,male:0.747968""",0.347217,0.0,0.793926,0.0,"""低胸:0.041147,健美:0.028697,克洛伊:0.…",1.6255e12,"""4""","""娱乐""","""娱乐/欧美明星""",0.851319,0.741812,0.086773,0.268752,0.135728,0.102408,0.34304,0.086309,0.000634,0.328564,2
2436636882,465965637,1625561846430,5,0.652561,0,"""BLA-AL00""","""Android""","""辽宁""","""大连""","""A_0_24:0.263469,A_25_29:0.1192…","""female:0.721427,male:0.278573""",0.263959,0.875095,0.881994,0.041375,"""久坐:0.021539,乳清蛋白:0.026122,低脂:0…",1.6254e12,"""4""","""时尚""","""时尚/女性时尚""",0.935618,0.842028,0.101558,0.317,0.139704,0.028657,0.195537,0.11821,0.010193,0.232941,8


### 4. 数据保存

#### 4.1 保存为csv数据

In [None]:
# Write test_data to disk as tab-separated CSV shards of at most
# `rows_per_chunk` rows each.
rows_per_chunk = 1_000_000
total_rows = test_data.height

# Ceiling division: a final partial shard still counts as one file.
num_chunks = (total_rows + rows_per_chunk - 1) // rows_per_chunk

# Root output directory; shards go into its `test_data` subdirectory.
output_path = "/data3/zxh/news_rec/recall_csv_data"

for i, start in enumerate(range(0, total_rows, rows_per_chunk)):
    # The last shard may hold fewer than rows_per_chunk rows.
    length = min(rows_per_chunk, total_rows - start)

    # Remaining nulls are written as empty strings.
    chunk = test_data.slice(start, length).fill_null("")

    # File names are 1-based: test_csv_00001-of-0000N.csv
    file_path = f"{output_path}/test_data/test_csv_{i+1:05d}-of-{num_chunks:05d}.csv"
    chunk.write_csv(file_path, separator="\t")

    print(f"Saved chunk {i+1} to {file_path} ({chunk.height} rows)")

Saved chunk 1 to /data3/zxh/news_rec/recall_csv_data/val_data/val_csv_00000-of-00003.csv (1000000 rows)
Saved chunk 2 to /data3/zxh/news_rec/recall_csv_data/val_data/val_csv_00001-of-00003.csv (1000000 rows)
Saved chunk 3 to /data3/zxh/news_rec/recall_csv_data/val_data/val_csv_00002-of-00003.csv (183545 rows)


#### 4.2 保存热度字典

In [18]:
# 计算 value_counts
article_counts = train_data["article_id"].value_counts()

# 将 article_id 转换为 string 并存为 dict
article_count_dict = dict(zip(article_counts["article_id"].cast(pl.Utf8), article_counts["count"]))

# 保存为 pkl 文件
with open(f"/data3/zxh/news_rec/public_data/article_count_{mode}_dict.pkl", "wb") as f:
    pickle.dump(article_count_dict, f)

print("✅ 字典已成功保存为 article_count_dict.pkl")

✅ 字典已成功保存为 article_count_dict.pkl


#### 4.3 保存用户特征和物品特征

In [51]:
# 近期所有的曝光物料如下
train_item_features = train_data.select(["article_id","keywords","image_count","category_level1","category_level2",
                                        "docid_history_count","docid_expose_count","docid_ctr","docid_history_duration_mean",
                                        "category1_ctr","category1_popularity","category1_history_duration_mean",
                                        "category2_ctr","category2_popularity","category2_history_duration_mean"]).unique()

: 

In [36]:
# Save the training item features as a tab-separated file.
# NOTE(review): the file is written under the `test_data` directory —
# presumably so it sits next to the test files; confirm this is intended.
train_item_features.write_csv("/data3/zxh/news_rec/recall_csv_data/test_data/train_item_features.csv",separator="\t") 

In [49]:
# Distinct feature rows for every article exposed in the test data, saved as
# a tab-separated file.
test_item_columns = [
    "article_id", "keywords", "image_count", "category_level1", "category_level2",
    "docid_history_count", "docid_expose_count", "docid_ctr", "docid_history_duration_mean",
    "category1_ctr", "category1_popularity", "category1_history_duration_mean",
    "category2_ctr", "category2_popularity", "category2_history_duration_mean",
]
test_item_features = test_data.select(test_item_columns).unique()
test_item_features.write_csv(
    "/data3/zxh/news_rec/recall_csv_data/test_data/test_item_features.csv",
    separator="\t",
)

In [19]:
# Distinct user-side feature rows from the test data, saved as a
# tab-separated file. Context columns (network_env, refresh_count,
# expose_hour) are part of the row, so one user can appear several times.
test_user_columns = [
    "user_id", "network_env", "refresh_count", "device_name", "os", "province",
    "city", "age", "gender", "userid_history_duration_mean", "userid_history_count",
    "userid_expose_count", "userid_ctr", "expose_hour",
]
test_user_features = test_data.select(test_user_columns).unique()
test_user_features.write_csv(
    "/data3/zxh/news_rec/recall_csv_data/test_data/test_user_features.csv",
    separator="\t",
)

In [22]:
# Count the distinct (user_id, expose_hour) groups in test_user_features
# (a pl.DataFrame). `GroupBy.len()` replaces the deprecated `GroupBy.count()`;
# only the number of groups (the height of the result) is used.
group_count = test_user_features.group_by(["user_id", "expose_hour"]).len().height

print(f"一共有 {group_count} 个 (user_id, expose_hour) 分组")

一共有 910176 个 (user_id, expose_hour) 分组


  group_count = test_user_features.group_by(["user_id", "expose_hour"]).count().height
