In [1]:
from data import loader, exporter
from constant import *
import pandas as pd

In [3]:
# Load the 'XW_ENTINFO_PERSON' source through the project loader.
# NOTE(review): presumably `to_concat_df` concatenates several partitions/files
# of that table into a single DataFrame — confirm against `data.loader`.
df_person = loader.to_concat_df('XW_ENTINFO_PERSON')

In [4]:
def process_person_data(df):
    """Aggregate person-level rows into one feature row per company.

    Every aggregate is indexed by ``CUST_NO`` before any ratio is computed, so
    a numerator is always divided by the denominator of the *same* company.
    (Dividing independently ``reset_index``-ed frames pairs rows by position,
    which is wrong whenever a filtered frame is missing some companies.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``CUST_NO``, ``DATA_DAT``, ``POSITIONCODE``,
        ``PERNAME`` and ``PERSONAMOUNT``. Each row is counted as one record of
        its company. Rows whose ``CUST_NO`` is missing are ignored (groupby
        drops NaN keys by default).

    Returns
    -------
    pandas.DataFrame
        One row per ``CUST_NO`` with the columns, in this order:
        ``CUST_NO``, ``PERSON_COUNT``, ``POSITION_DIVERSITY``,
        ``RATIO_PERSON_POSITION``, ``REPEATED_PERSON_COUNT``,
        ``PERSONAMOUNT_GT3_COUNT``, ``RATIO_GT3``,
        ``PERSONAMOUNT_GT5_COUNT``, ``RATIO_GT5``,
        ``UNIQUE_PERSON_COUNT``, ``RATIO_UNIQUE_PERSON``.
        Missing feature values are filled with 0.

    Notes
    -----
    Two writes to the caller's frame are kept for backward compatibility:
    ``DATA_DAT`` is converted to datetime and a ``PERNAME_COUNT`` column is
    added.
    """
    # 1. Parse DATA_DAT as %Y%m%d; values that do not parse become NaT.
    #    No feature below reads this column; the conversion is retained only
    #    because callers may rely on it.
    df['DATA_DAT'] = pd.to_datetime(df['DATA_DAT'], format='%Y%m%d', errors='coerce')

    by_company = df.groupby('CUST_NO')

    # 2. Number of rows per company.
    person_count = by_company.size()
    # 3. Number of distinct position codes per company.
    position_diversity = by_company['POSITIONCODE'].nunique()
    # 8. Number of distinct person names per company.
    unique_person_count = by_company['PERNAME'].nunique()

    # 4. Rows whose (CUST_NO, PERNAME) pair occurs more than once.
    df['PERNAME_COUNT'] = df.groupby(['CUST_NO', 'PERNAME'])['PERNAME'].transform('count')
    repeated_person_count = df[df['PERNAME_COUNT'] > 1].groupby('CUST_NO').size()

    # Assemble on the CUST_NO index: column assignment aligns by label, so
    # companies absent from a filtered aggregate get NaN (filled with 0 below).
    features = person_count.to_frame('PERSON_COUNT')
    features['POSITION_DIVERSITY'] = position_diversity
    # 7. Rows per distinct position code.
    # NOTE(review): a company whose POSITIONCODE values are all missing has
    # POSITION_DIVERSITY == 0, which makes this ratio inf (not touched by fillna).
    features['RATIO_PERSON_POSITION'] = features['PERSON_COUNT'] / features['POSITION_DIVERSITY']
    features['REPEATED_PERSON_COUNT'] = repeated_person_count

    # 5./6. Count of rows with PERSONAMOUNT above each threshold, and the share
    #       of the company's rows that this count represents.
    for threshold in (3, 5):
        count_col = f'PERSONAMOUNT_GT{threshold}_COUNT'
        features[count_col] = df[df['PERSONAMOUNT'] > threshold].groupby('CUST_NO').size()
        features[f'RATIO_GT{threshold}'] = features[count_col] / features['PERSON_COUNT']

    features['UNIQUE_PERSON_COUNT'] = unique_person_count
    # 8. Share of distinct names among the company's rows.
    features['RATIO_UNIQUE_PERSON'] = features['UNIQUE_PERSON_COUNT'] / features['PERSON_COUNT']

    # 9. Companies with no repeated names / no rows above a threshold have NaN
    #    in those columns; report them as 0. Filling happens before CUST_NO is
    #    moved back to a column so only feature columns are touched.
    df_person_features = features.fillna(0).reset_index()

    return df_person_features




In [5]:
# Build the per-company feature table from the loaded person records.
df_person_features = process_person_data(df_person)

# Hand the feature table to the project exporter under the name 'person'.
# NOTE(review): presumably this persists the frame to the preprocess output
# location — confirm against `data.exporter`.
exporter.export_df_to_preprocess('person',df_person_features)
# Preview the first rows as the cell output.
df_person_features.head()

Unnamed: 0,CUST_NO,PERSON_COUNT,POSITION_DIVERSITY,RATIO_PERSON_POSITION,REPEATED_PERSON_COUNT,PERSONAMOUNT_GT3_COUNT,RATIO_GT3,PERSONAMOUNT_GT5_COUNT,RATIO_GT5,UNIQUE_PERSON_COUNT,RATIO_UNIQUE_PERSON
0,000034607497713173a75a0d9910cb52,3,3,1.0,2.0,0.0,0.0,0.0,0.0,2,0.666667
1,0000dfb26b1e1fb3d96c92eb3f00a3d4,3,3,1.0,0.0,0.0,0.0,0.0,0.0,3,1.0
2,0001f54424332282c4ee1dd10ef43e67,3,3,1.0,2.0,0.0,0.0,0.0,0.0,2,0.666667
3,0001f87942183f01cee6998d65092fe9,3,3,1.0,0.0,0.0,0.0,0.0,0.0,3,1.0
4,00021b96bf933024181336f4862bd1cb,3,3,1.0,2.0,0.0,0.0,0.0,0.0,2,0.666667
