### 处理用户离散特征（年龄和性别）

#### 1. 读取数据

In [1]:
import polars as pl
# Directory that holds the raw input files.
data_path = "/data3/zxh/news_rec/raw_data"
# Read the user table from its Arrow IPC file into a polars DataFrame.
user_info = pl.read_ipc(f"{data_path}/user_info.arrow")
# Bare last expression so the notebook renders a preview of the frame.
user_info

user_id,device_name,os,province,city,age,gender
i64,str,str,str,str,str,str
1000372820,"""TAS-AN00""","""Android""","""广东""","""广州""","""A_0_24:0.404616,A_25_29:0.0590…","""female:0.051339,male:0.948661"""
1000652892,"""PACM00""","""Android""","""河北""","""唐山""","""A_0_24:0.615458,A_25_29:0.0862…","""female:0.280295,male:0.719705"""
1000908852,"""MI6X""","""Android""","""上海""","""上海""","""A_0_24:0.123255,A_25_29:0.2082…","""female:0.000000,male:1.000000"""
1001168798,"""iPhone11""","""IOS""",,,"""A_0_24:0.436296,A_25_29:0.4893…","""female:0.870710,male:0.129290"""
1001305614,"""M2103K19C""","""Android""","""江苏""","""苏州""","""A_0_24:0.006632,A_25_29:0.0434…","""female:0.000000,male:1.000000"""
…,…,…,…,…,…,…
999798184,"""DIG-AL00""","""Android""","""四川""","""成都""","""A_0_24:0.375362,A_25_29:0.1892…","""female:0.950213,male:0.049787"""
999813672,"""CoolpadA8-930""","""Android""","""广东""","""广州""","""A_0_24:0.225201,A_25_29:0.1697…","""female:0.298108,male:0.701892"""
999859618,"""KNT-AL10""","""Android""","""河南""","""南阳""","""A_0_24:0.838753,A_25_29:0.0419…","""female:0.950213,male:0.049787"""
999873694,"""OPPOR9sPlus""","""Android""","""山东""","""威海""","""A_0_24:0.093117,A_25_29:0.2800…","""female:0.103819,male:0.896181"""


#### 2. 处理年龄

In [2]:
# Character count of every `age` string. Listing the distinct lengths
# shows whether any rows deviate from the usual layout.
age_lengths = user_info.get_column("age").str.len_chars()
age_lengths.unique()

age
u32
""
64.0
259.0


In [3]:
# Print every `age` string whose length is 259. These rows simply repeat
# each key:value pair several times, which does not affect the regex
# extraction performed in the following cell.
for age in user_info.filter(age_lengths == 259).get_column("age").to_list():
    print(age)

A_0_24:0.256968,A_0_24:0.256968,A_0_24:0.256968,A_0_24:0.256968,A_25_29:0.129631,A_25_29:0.129631,A_25_29:0.129631,A_25_29:0.129631,A_30_39:0.270219,A_30_39:0.270219,A_30_39:0.270219,A_30_39:0.270219,A_40+:0.343182,A_40+:0.343182,A_40+:0.343182,A_40+:0.343182
A_0_24:0.129381,A_0_24:0.129381,A_0_24:0.129381,A_0_24:0.129381,A_25_29:0.234456,A_25_29:0.234456,A_25_29:0.234456,A_25_29:0.234456,A_30_39:0.314669,A_30_39:0.314669,A_30_39:0.314669,A_30_39:0.314669,A_40+:0.321493,A_40+:0.321493,A_40+:0.321493,A_40+:0.321493
A_0_24:0.131260,A_0_24:0.131260,A_0_24:0.131260,A_0_24:0.131260,A_25_29:0.290373,A_25_29:0.290373,A_25_29:0.290373,A_25_29:0.290373,A_30_39:0.357634,A_30_39:0.357634,A_30_39:0.357634,A_30_39:0.357634,A_40+:0.220733,A_40+:0.220733,A_40+:0.220733,A_40+:0.220733
A_0_24:0.242107,A_0_24:0.242107,A_0_24:0.242107,A_0_24:0.242107,A_25_29:0.300750,A_25_29:0.300750,A_25_29:0.300750,A_25_29:0.300750,A_30_39:0.288349,A_30_39:0.288349,A_30_39:0.288349,A_30_39:0.288349,A_40+:0.168794,A_40+

In [4]:
# Regular expressions that capture each age bucket's probability from the
# `age` string. Keys are the names of the new columns.
age_patterns = {
    "age_0_24": r"A_0_24:([\d.]+)",
    "age_25_29": r"A_25_29:([\d.]+)",
    "age_30_39": r"A_30_39:([\d.]+)",
    # "+" is a regex metacharacter, so it has to be escaped.
    "age_40+": r"A_40\+:([\d.]+)",
}

# One expression per bucket: capture group 1 -> Float32, with 0.0 where the
# string is null or the bucket is absent.
exprs = [
    pl.col("age").str.extract(regex, group_index=1).cast(pl.Float32).fill_null(0.0).alias(bucket)
    for bucket, regex in age_patterns.items()
]

# Append the extracted bucket columns to the frame.
user_info = user_info.with_columns(exprs)

# Sanity check: the raw string next to the parsed values.
user_info.select(["age", *age_patterns]).head()

age,age_0_24,age_25_29,age_30_39,age_40+
str,f32,f32,f32,f32
"""A_0_24:0.404616,A_25_29:0.0590…",0.404616,0.059027,0.51655,0.019806
"""A_0_24:0.615458,A_25_29:0.0862…",0.615458,0.086233,0.141408,0.156901
"""A_0_24:0.123255,A_25_29:0.2082…",0.123255,0.208225,0.298089,0.370431
"""A_0_24:0.436296,A_25_29:0.4893…",0.436296,0.48937,0.06156,0.012773
"""A_0_24:0.006632,A_25_29:0.0434…",0.006632,0.043408,0.350842,0.599118


In [5]:
# Summary statistics for every column, including the extracted age-bucket columns.
user_info.describe()

statistic,user_id,device_name,os,province,city,age,gender,age_0_24,age_25_29,age_30_39,age_40+
str,f64,str,str,str,str,str,str,f64,f64,f64,f64
"""count""",1538384.0,"""1470640""","""1470576""","""1446124""","""1442312""","""1478493""","""1479812""",1538384.0,1538384.0,1538384.0,1538384.0
"""null_count""",0.0,"""67744""","""67808""","""92260""","""96072""","""59891""","""58572""",0.0,0.0,0.0,0.0
"""mean""",1983900000.0,,,,,,,0.287505,0.17609,0.213863,0.28361
"""std""",476760000.0,,,,,,,0.236477,0.126931,0.158668,0.238479
"""min""",17340.0,""".N360.""","""Android""","""Abruzzo""","""Aachen""","""A_0_24:0.000000,A_25_29:0.0000…","""female:0.000000,male:1.000000""",0.0,0.0,0.0,0.0
"""25%""",1567900000.0,,,,,,,0.088865,0.0862,0.10538,0.085419
"""50%""",2215400000.0,,,,,,,0.233304,0.156148,0.180267,0.223409
"""75%""",2411400000.0,,,,,,,0.443468,0.237178,0.285814,0.432363
"""max""",2447400000.0,"""xiaomi6""","""IOS""","""黑龙江""","""龙岩""","""A_0_24:1.000000,A_25_29:0.0000…","""male:1.000000""",1.0,0.994316,1.0,1.0


In [6]:
# Rebuild `age` as a normalized "key:value,..." string from the four parsed
# bucket columns, so every non-null row lists each bucket exactly once.
# NOTE(review): the values are rendered from Float32, so the text may differ
# from the original fixed six-decimal form (e.g. trailing zeros dropped) --
# TODO confirm very small probabilities are not written in scientific
# notation, which downstream parsing might not expect.
age_kv = pl.concat_str(
    [
        pl.format("A_0_24:{}", pl.col("age_0_24")),
        pl.format("A_25_29:{}", pl.col("age_25_29")),
        pl.format("A_30_39:{}", pl.col("age_30_39")),
        pl.format("A_40+:{}", pl.col("age_40+")),
    ],
    separator=",",
)

# Only rows that originally had an age string receive the rebuilt value;
# there is no `otherwise` branch, so the remaining rows stay null.
# The helper bucket columns are dropped afterwards.
user_info = user_info.with_columns(
    pl.when(pl.col("age").is_not_null()).then(age_kv).alias("age")
).drop(["age_0_24", "age_25_29", "age_30_39", "age_40+"])
user_info

user_id,device_name,os,province,city,age,gender
i64,str,str,str,str,str,str
1000372820,"""TAS-AN00""","""Android""","""广东""","""广州""","""A_0_24:0.404616,A_25_29:0.0590…","""female:0.051339,male:0.948661"""
1000652892,"""PACM00""","""Android""","""河北""","""唐山""","""A_0_24:0.615458,A_25_29:0.0862…","""female:0.280295,male:0.719705"""
1000908852,"""MI6X""","""Android""","""上海""","""上海""","""A_0_24:0.123255,A_25_29:0.2082…","""female:0.000000,male:1.000000"""
1001168798,"""iPhone11""","""IOS""",,,"""A_0_24:0.436296,A_25_29:0.4893…","""female:0.870710,male:0.129290"""
1001305614,"""M2103K19C""","""Android""","""江苏""","""苏州""","""A_0_24:0.006632,A_25_29:0.0434…","""female:0.000000,male:1.000000"""
…,…,…,…,…,…,…
999798184,"""DIG-AL00""","""Android""","""四川""","""成都""","""A_0_24:0.375362,A_25_29:0.1892…","""female:0.950213,male:0.049787"""
999813672,"""CoolpadA8-930""","""Android""","""广东""","""广州""","""A_0_24:0.225201,A_25_29:0.1697…","""female:0.298108,male:0.701892"""
999859618,"""KNT-AL10""","""Android""","""河南""","""南阳""","""A_0_24:0.838753,A_25_29:0.0419…","""female:0.950213,male:0.049787"""
999873694,"""OPPOR9sPlus""","""Android""","""山东""","""威海""","""A_0_24:0.093117,A_25_29:0.2800…","""female:0.103819,male:0.896181"""


#### 3. 处理性别

In [7]:
# Character count of every `gender` string. Listing the distinct lengths
# shows which layouts occur in the data.
gender_lengths = user_info.get_column("gender").str.len_chars()
gender_lengths.unique()

gender
u32
""
13.0
15.0
29.0
59.0


In [8]:
# Show one example for each unusual gender-string length. None of these
# shapes gets in the way of the regex extraction in the next cell.
#   13 -> the female field is missing
#   15 -> the male field is missing
#   59 -> each field appears more than once
for odd_length in (13, 15, 59):
    print(user_info.filter(gender_lengths == odd_length).get_column("gender").to_list()[0])

male:1.000000
female:1.000000
female:0.852616,female:0.852616,male:0.147384,male:0.147384


In [9]:
# Regular expressions that capture each gender probability from the
# `gender` string. Keys are the names of the new columns.
gender_patterns = {
    "gender_female": r"female:([\d.]+)",
    # The leading \b (word boundary) keeps this pattern from matching the
    # "male" that is embedded in "female".
    "gender_male": r"\bmale:([\d.]+)",
}

# One expression per field: capture group 1 -> Float32, with 0.0 where the
# string is null or the field is absent.
exprs = [
    pl.col("gender").str.extract(regex, group_index=1).cast(pl.Float32).fill_null(0.0).alias(field)
    for field, regex in gender_patterns.items()
]

# Append the extracted columns to the frame.
user_info = user_info.with_columns(exprs)

# Sanity check: the raw string next to the parsed values.
print(user_info.select(["gender", "gender_female", "gender_male"]))

shape: (1_538_384, 3)
┌───────────────────────────────┬───────────────┬─────────────┐
│ gender                        ┆ gender_female ┆ gender_male │
│ ---                           ┆ ---           ┆ ---         │
│ str                           ┆ f32           ┆ f32         │
╞═══════════════════════════════╪═══════════════╪═════════════╡
│ female:0.051339,male:0.948661 ┆ 0.051339      ┆ 0.948661    │
│ female:0.280295,male:0.719705 ┆ 0.280295      ┆ 0.719705    │
│ female:0.000000,male:1.000000 ┆ 0.0           ┆ 1.0         │
│ female:0.870710,male:0.129290 ┆ 0.87071       ┆ 0.12929     │
│ female:0.000000,male:1.000000 ┆ 0.0           ┆ 1.0         │
│ …                             ┆ …             ┆ …           │
│ female:0.950213,male:0.049787 ┆ 0.950213      ┆ 0.049787    │
│ female:0.298108,male:0.701892 ┆ 0.298108      ┆ 0.701892    │
│ female:0.950213,male:0.049787 ┆ 0.950213      ┆ 0.049787    │
│ female:0.103819,male:0.896181 ┆ 0.103819      ┆ 0.896181    │
│ female:0.302557,

In [10]:
# Convert the parsed values back into a normalized "key:value,..." string so
# every non-null row lists female and male exactly once.
# NOTE(review): values are rendered from Float32, so trailing zeros are
# dropped (0.000000 becomes 0.0) -- confirm downstream parsers accept this.
gender_kv = pl.concat_str(
    [
        pl.format("female:{}", pl.col("gender_female")),
        pl.format("male:{}", pl.col("gender_male")),
    ],
    separator=",",
)

# Only rows that originally had a gender string receive the rebuilt value;
# there is no `otherwise` branch, so the remaining rows stay null.
# The helper columns are dropped afterwards.
user_info = user_info.with_columns(
    pl.when(pl.col("gender").is_not_null()).then(gender_kv).alias("gender")
).drop(["gender_female", "gender_male"])
user_info

user_id,device_name,os,province,city,age,gender
i64,str,str,str,str,str,str
1000372820,"""TAS-AN00""","""Android""","""广东""","""广州""","""A_0_24:0.404616,A_25_29:0.0590…","""female:0.051339,male:0.948661"""
1000652892,"""PACM00""","""Android""","""河北""","""唐山""","""A_0_24:0.615458,A_25_29:0.0862…","""female:0.280295,male:0.719705"""
1000908852,"""MI6X""","""Android""","""上海""","""上海""","""A_0_24:0.123255,A_25_29:0.2082…","""female:0.0,male:1.0"""
1001168798,"""iPhone11""","""IOS""",,,"""A_0_24:0.436296,A_25_29:0.4893…","""female:0.87071,male:0.12929"""
1001305614,"""M2103K19C""","""Android""","""江苏""","""苏州""","""A_0_24:0.006632,A_25_29:0.0434…","""female:0.0,male:1.0"""
…,…,…,…,…,…,…
999798184,"""DIG-AL00""","""Android""","""四川""","""成都""","""A_0_24:0.375362,A_25_29:0.1892…","""female:0.950213,male:0.049787"""
999813672,"""CoolpadA8-930""","""Android""","""广东""","""广州""","""A_0_24:0.225201,A_25_29:0.1697…","""female:0.298108,male:0.701892"""
999859618,"""KNT-AL10""","""Android""","""河南""","""南阳""","""A_0_24:0.838753,A_25_29:0.0419…","""female:0.950213,male:0.049787"""
999873694,"""OPPOR9sPlus""","""Android""","""山东""","""威海""","""A_0_24:0.093117,A_25_29:0.2800…","""female:0.103819,male:0.896181"""


#### 4. 保存数据

In [11]:
# Directory that receives the processed output.
public_path = "/data3/zxh/news_rec/public_data"

# Replace missing values with "", which stands for the "unknown" category.
user_info = user_info.fill_null("")

# Persist the cleaned user features as an Arrow IPC file.
user_info.write_ipc(f"{public_path}/user_sparse_feature.ipc")