In [1]:
from data import loader, exporter
from constant import *
import pandas as pd

In [3]:
def process_person_data(df):
    """Build per-company executive features from person-level records.

    Every input row is treated as one executive record, so all counts below
    are row counts rather than counts of distinct people.

    Parameters
    ----------
    df : pandas.DataFrame
        Person-level table. Must contain the columns ``CUST_NO``,
        ``DATA_DAT``, ``POSITIONCODE`` and ``PERNAME``.

    Returns
    -------
    pandas.DataFrame
        One row per ``CUST_NO`` with the columns:

        * ``PERSON_COUNT`` - number of records for the company.
        * ``POSITION_<code>`` - number of records per ``POSITIONCODE`` value
          (one column per distinct code found in ``df``).
        * ``POSITION_DIVERSITY`` - number of distinct ``POSITIONCODE`` values.
        * ``REPEATED_PERSON_COUNT`` - number of records whose ``PERNAME``
          occurs more than once within the same company.

        None of the count columns contains NaN.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.

    Notes
    -----
    ``df`` is modified in place: ``DATA_DAT`` is converted to datetime
    (values that do not match ``%Y%m%d`` become ``NaT``) and a helper column
    ``PERNAME_COUNT`` is added.
    """
    # 1. Parse the date column; values not in %Y%m%d format are coerced to NaT.
    df['DATA_DAT'] = pd.to_datetime(df['DATA_DAT'], format='%Y%m%d', errors='coerce')

    # 2. Total number of executive records per company.
    person_count = df.groupby('CUST_NO').size().reset_index(name='PERSON_COUNT')

    # 3. Number of executive records per position code, one column per code.
    position_count = df.groupby(['CUST_NO', 'POSITIONCODE']).size().unstack(fill_value=0).reset_index()
    position_count.columns = [f"POSITION_{col}" if col != 'CUST_NO' else 'CUST_NO' for col in position_count.columns]

    # 4. Position diversity: number of distinct position codes per company.
    position_diversity = df.groupby('CUST_NO')['POSITIONCODE'].nunique().reset_index(name='POSITION_DIVERSITY')

    # 5. Records whose person name appears more than once within the company.
    df['PERNAME_COUNT'] = df.groupby(['CUST_NO', 'PERNAME'])['PERNAME'].transform('count')
    repeated_person_count = df[df['PERNAME_COUNT'] > 1].groupby('CUST_NO').size().reset_index(name='REPEATED_PERSON_COUNT')

    # 6. Merge all features onto the per-company base table.
    df_person_features = pd.merge(person_count, position_count, on='CUST_NO', how='left')
    df_person_features = pd.merge(df_person_features, position_diversity, on='CUST_NO', how='left')
    df_person_features = pd.merge(df_person_features, repeated_person_count, on='CUST_NO', how='left')

    # The left merges leave NaN wherever a company has no row on the right:
    #   * companies whose POSITIONCODE values are all null are absent from
    #     position_count (groupby drops null keys);
    #   * companies without any repeated name are absent from
    #     repeated_person_count.
    # A missing count means zero. Fill by assignment instead of a chained
    # `df[col].fillna(..., inplace=True)`, which does not update the frame
    # when pandas Copy-on-Write is enabled.
    zero_fill = {col: 0 for col in position_count.columns if col != 'CUST_NO'}
    zero_fill['REPEATED_PERSON_COUNT'] = 0
    df_person_features = df_person_features.fillna(value=zero_fill)

    return df_person_features


# Load the 'XW_ENTINFO_PERSON' table through the project loader
# (presumably returns a single concatenated DataFrame - confirm in data.loader).
df_person = loader.to_concat_df('XW_ENTINFO_PERSON')
# Aggregate the person-level records into one feature row per CUST_NO.
# NOTE: process_person_data also modifies df_person in place.
df_person_features = process_person_data(df_person)

# Hand the feature table to the project exporter under the name 'person'
# (destination and file format are defined in data.exporter - not visible here).
exporter.export_df_to_preprocess('person',df_person_features)
# Show the first rows as the cell output for a quick sanity check.
df_person_features.head()

Unnamed: 0,CUST_NO,PERSON_COUNT,POSITION_042160c9f6948919cd615cb044e5ee1c,POSITION_0d95e39be30a1538562416756e4a0e42,POSITION_1d15708e2c42cc5ebabc515490243dc0,POSITION_2208c0826463c1e99ea8030c9c65df4f,POSITION_236f15ad3772b0c93f2c68e7529d6e97,POSITION_25b47d6287e5aadb9ebe518df1c6b838,POSITION_311c2882f60e184d2a27c43bd3e7018f,POSITION_454cbd7f594fee07cc74471428cbdae2,...,POSITION_a73bed4299afc56f392e43fbcad147e2,POSITION_d77fec90bcbe39182830edb1fbdae905,POSITION_dc5f68ad345c4392c34279bea34a4d8b,POSITION_ea86202bde6e85e80fa2b59ef7cd79f4,POSITION_eda8f36923b3d8671889ee17aed27723,POSITION_f45b29c3721c2a773e20ce7cfe59981f,POSITION_fcf8592153e7a164777ab8c1024ea762,POSITION_fd3610b277f1990d6052625d9eb2be56,POSITION_DIVERSITY,REPEATED_PERSON_COUNT
0,000034607497713173a75a0d9910cb52,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3,2.0
1,0000dfb26b1e1fb3d96c92eb3f00a3d4,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3,0.0
2,0001f54424332282c4ee1dd10ef43e67,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,2.0
3,0001f87942183f01cee6998d65092fe9,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3,0.0
4,00021b96bf933024181336f4862bd1cb,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3,2.0
