# 零售客群分析

# Part 1. 编码值替换

In [24]:
import pandas as pd

# Load the KYC tag table and the tag-value dictionary table from Excel.
customer_tags = pd.read_excel(
    io='/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/source_data/kyc特征/富高客群KYC标签数据.xlsx',
)
tag_dict = pd.read_excel(
    io='/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/source_data/kyc特征/标签字典值.xlsx',
)

In [25]:
# Build the lookup from encoded value (tag_value_id) to display name
# (tag_value_name). If an id is repeated, the later row wins.
tag_mapping = {
    code: name
    for code, name in zip(tag_dict['tag_value_id'], tag_dict['tag_value_name'])
}
# Collects every encoded value that has no entry in the lookup.
unmatched_values = set()

In [26]:
# Swap an encoded tag value for its display name.
def replace_tag_value(value):
    """Return the dictionary name for ``value``, or ``value`` itself if none exists.

    Missing values (as judged by ``pd.isnull``) are passed through untouched.
    A non-missing value that is absent from ``tag_mapping`` is recorded in
    ``unmatched_values`` and returned unchanged.
    """
    if pd.isnull(value):
        return value
    if value not in tag_mapping:
        # Remember the code so it can be reported afterwards.
        unmatched_values.add(value)
        return value
    return tag_mapping[value]

# Decode every tag column. The original note describes the first three columns
# as tradedate, customer_id, sub_company_code, so decoding starts at the
# fourth column and overwrites each column of customer_tags with its decoded form.
columns_to_replace = customer_tags.columns[3:]
for tag_column in columns_to_replace:
    decoded_column = customer_tags[tag_column].apply(replace_tag_value)
    customer_tags[tag_column] = decoded_column

In [27]:
# Report how many distinct encoded values had no dictionary entry, then list them.
unmatched_count = len(unmatched_values)
print(f"共有 {unmatched_count} 个编码值没有被替换:")
print(unmatched_values)

共有 0 个编码值没有被替换:
set()


In [28]:
# Write the decoded tag table to Excel; the row index is not written.
customer_tags.to_excel(
    excel_writer='/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/source_data/kyc特征/替换后的客群标签数据.xlsx',
    index=False,
)

# Part 2. 拼接客户资产汇总表、客户交易汇总表、客户基本信息表

### 注意：此处拼接的三张表应与 KYC 标签表取自同一时点

In [29]:
# Load the customer basic-info, transaction-summary and asset-summary tables from Excel.
customer_info = pd.read_excel(
    io='/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/source_data/数值特征/客户基本信息.xlsx',
)
customer_transactions = pd.read_excel(
    io='/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/source_data/数值特征/客户交易汇总.xlsx',
)
customer_assets = pd.read_excel(
    io='/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/source_data/数值特征/客户资产汇总.xlsx',
)

In [30]:
# Left-join the three numeric tables onto the tag table by customer_id, one
# after another. Left-side column names are kept as they are; a right-side
# column whose name already exists gets that table's suffix appended.
merged_data = customer_tags
for right_table, right_suffix in (
    (customer_info, '_customer_info'),
    (customer_transactions, '_customer_transactions'),
    (customer_assets, '_customer_assets'),
):
    merged_data = merged_data.merge(
        right_table,
        on='customer_id',
        how='left',
        suffixes=('', right_suffix),
    )

In [31]:
# Check whether the source tables shared any column names.
#
# The merges above pass suffixes=('', '_customer_info') etc., so a clashing
# right-hand column is renamed to "<name><suffix>". As a result
# merged_data.columns does not repeat a name, and columns.duplicated() alone
# cannot reveal a clash. A clash shows up instead as a suffixed column sitting
# next to the plain "<name>" column, which is what is searched for here.
merge_suffixes = ('_customer_info', '_customer_transactions', '_customer_assets')
all_columns = merged_data.columns
existing_columns = set(all_columns)

# Names that literally occur more than once (the original check, kept as-is).
literal_duplicates = set(all_columns[all_columns.duplicated()])

# Base names for which a suffixed twin exists in the merged frame.
clashing_names = {
    col[:-len(suffix)]
    for col in all_columns
    for suffix in merge_suffixes
    if isinstance(col, str)
    and col.endswith(suffix)
    and col[:-len(suffix)] in existing_columns
}

# Preserve the order in which the names appear in merged_data.
duplicate_columns = pd.Index(
    [col for col in all_columns.unique() if col in clashing_names or col in literal_duplicates]
)
# As in the original check, customer_no and by_month are ignored.
important_duplicates = [col for col in duplicate_columns if col not in ['customer_no', 'by_month']]

# Report the shared names, if any remain after the ignore list.
if important_duplicates:
    print(f"命名相同的字段: {important_duplicates}")
else:
    print("没有命名相同的字段（忽略 customer_no 和 by_month）。")

没有命名相同的字段（忽略 customer_no 和 by_month）。


# Part 3. 处理缺失值

In [32]:
# Count missing values per merged column, keep only the columns that have
# at least one, and order them from most to fewest missing.
null_counts = merged_data.isnull().sum()
missing_data = null_counts[null_counts > 0].sort_values(ascending=False)

In [33]:
# Record which source table each merged column comes from.
#
# The tables are walked in the same order they were merged. When a name is
# shared between tables, the unsuffixed column in merged_data holds the values
# of the table merged first, and the later table's copy is stored under
# "<name><suffix>". The mapping mirrors that: the first table wins the plain
# name and later tables are registered under their suffixed name. (Building the
# mapping with successive dict.update calls would instead let the last table
# overwrite the earlier ones and leave suffixed columns as 'unknown'.)
tables_in_merge_order = [
    ('customer_tags', customer_tags, ''),
    ('customer_info', customer_info, '_customer_info'),
    ('customer_transactions', customer_transactions, '_customer_transactions'),
    ('customer_assets', customer_assets, '_customer_assets'),
]

source_columns = {}
for table_name, table, suffix in tables_in_merge_order:
    for col in table.columns:
        if col == 'customer_id':
            # Join key: present in every table, not attributed to any one of them.
            continue
        if col in source_columns:
            # Name already taken by an earlier table, so merge() stored this
            # table's copy under the suffixed name.
            source_columns[f"{col}{suffix}"] = table_name
        else:
            source_columns[col] = table_name

# Source table for every column that has missing values ('unknown' if not found).
missing_data_source = {col: source_columns.get(col, 'unknown') for col in missing_data.index}


In [34]:
# Print each column that has gaps, the table it came from, and its missing-row count.
print("拼接后所有字段的缺失情况及其来源:")
for column_name, missing_count in missing_data.items():
    origin = missing_data_source[column_name]
    print(f"{column_name} (来自 {origin}): 缺失 {missing_count} 条")

拼接后所有字段的缺失情况及其来源:
qw_qytg_push (来自 customer_tags): 缺失 38627 条
risk_label_fund_name (来自 customer_info): 缺失 38627 条
if_qw_cfzb_flag (来自 customer_tags): 缺失 38627 条
buy_pri_1y (来自 customer_tags): 缺失 38627 条
redeem_pub_noncur_cust_7d (来自 customer_tags): 缺失 38627 条
profession_name (来自 customer_info): 缺失 38627 条
status_excp_date (来自 customer_info): 缺失 38627 条
std_date (来自 customer_info): 缺失 38627 条
customer_kind_flag (来自 customer_info): 缺失 38627 条
registfund (来自 customer_info): 缺失 38627 条
end_asset_crdt (来自 customer_info): 缺失 38627 条
begin_date_fund (来自 customer_info): 缺失 38627 条
zfly (来自 customer_tags): 缺失 38627 条
if_pension_flag (来自 customer_tags): 缺失 38627 条
buy_qy_cust_1y (来自 customer_tags): 缺失 38627 条
if_qw_active_flag (来自 customer_tags): 缺失 38627 条
rate_normal (来自 customer_info): 缺失 38627 条
risk_label_fund (来自 customer_info): 缺失 38627 条
if_grylcp_flag (来自 customer_tags): 缺失 38627 条
level_jh_name (来自 customer_info): 缺失 38627 条
buy_pub_noncur_count_1y (来自 customer_tags): 缺失 38627 条
begin_

In [35]:
# Drop the columns listed below (described in the original note as entirely missing).
fields_to_remove = [
    'qw_qytg_push',
    'risk_label_fund_name',
    'if_qw_cfzb_flag',
    'buy_pri_1y',
    'redeem_pub_noncur_cust_7d',
    'profession_name',
    'status_excp_date',
    'std_date',
    'customer_kind_flag',
    'registfund',
    'end_asset_crdt',
    'begin_date_fund',
    'zfly',
    'if_pension_flag',
    'buy_qy_cust_1y',
    'if_qw_active_flag',
    'rate_normal',
    'risk_label_fund',
    'if_grylcp_flag',
    'level_jh_name',
    'buy_pub_noncur_count_1y',
    'begin_date_future',
]
cleaned_data = merged_data.drop(labels=fields_to_remove, axis=1)

# Write the reduced table to Excel as the training dataset; the row index is not written.
cleaned_data.to_excel(
    excel_writer='/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/training_data/dataset.xlsx',
    index=False,
)

"""
总计10个kyc标签因完全缺失被剔除：
    'qw_qytg_push', 'if_qw_cfzb_flag', 'buy_pri_1y', 'redeem_pub_noncur_cust_7d', 
    'zfly', 'if_pension_flag', 'buy_qy_cust_1y', 'if_qw_active_flag', 'if_grylcp_flag', 'buy_pub_noncur_count_1y '

"""

In [36]:
import pandas as pd

# Load the feature table and the table of positive (labelled) customers.
feature_data = pd.read_excel('/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/training_data/dataset.xlsx')
label_data = pd.read_excel('/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/source_data/kyc特征/客户正例.xlsx')

# Index the positives by customer_id so membership can be tested against the index.
label_data = label_data.set_index('customer_id')

# label is 1 when the row's customer_id appears among the positives, else 0.
# A single vectorized isin() replaces the row-by-row iterrows()/.at loop.
feature_data['label'] = feature_data['customer_id'].isin(label_data.index).astype('int64')

# Number of feature rows that matched a positive customer.
matched_count = int(feature_data['label'].sum())

# Save the labelled features back to the dataset file; the row index is not written.
feature_data.to_excel('/Users/suding/Desktop/Retail_Customer_Group_Analysis/suding/training_data/dataset.xlsx', index=False)

# Check that the number of matched rows is 5518.
print("匹配上的个数:", matched_count)
print("是否匹配上的个数为5518:", matched_count == 5518)


匹配上的个数: 5518
是否匹配上的个数为5518: True
