In [2]:
from data import loader, exporter
from constant import *
import pandas as pd

In [11]:
import pandas as pd

def process_alter_data(df):
    """Aggregate change records into one feature row per customer.

    The input is read from the columns ``CUST_NO``, ``ALTDATE``, ``DATA_DAT``
    and ``ALTITEM``. ``ALTDATE`` and ``DATA_DAT`` are parsed with the
    ``%Y%m%d`` format; values that do not parse become ``NaT``.

    The caller's DataFrame is NOT modified: all derived columns live on an
    internal working copy.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per change record.

    Returns
    -------
    pandas.DataFrame
        Exactly one row per distinct (non-null) ``CUST_NO``, sorted by
        ``CUST_NO``, with the columns:

        - ``CUST_NO``
        - ``ALTER_COUNT``: number of rows for the customer.
        - ``DAYS_SINCE_LAST_ALTER``: days between the customer's latest
          ``DATA_DAT`` and latest ``ALTDATE`` (NaN when either is missing).
        - ``ALTER_TYPE_DIVERSITY``: number of distinct non-null ``ALTITEM``
          values.
        - ``INVESTOR_CHANGE_COUNT``: rows whose ``ALTITEM`` contains
          '投资人变更'.
        - ``LEGAL_REP_CHANGE_COUNT``: rows whose ``ALTITEM`` contains
          '法定代表人变更'.
    """
    # Work on a narrow copy so the caller's frame keeps its original columns
    # and dtypes.
    work = df[['CUST_NO', 'ALTITEM']].copy()

    # 1. Parse the date fields (%Y%m%d); unparseable values become NaT.
    work['ALTDATE'] = pd.to_datetime(df['ALTDATE'], format='%Y%m%d', errors='coerce')
    work['DATA_DAT'] = pd.to_datetime(df['DATA_DAT'], format='%Y%m%d', errors='coerce')

    # 2. Flag rows whose change item mentions a keyword. The isinstance check
    #    treats missing and non-string items as "no match" instead of raising.
    def _mentions(keyword):
        hits = work['ALTITEM'].map(lambda item: isinstance(item, str) and keyword in item)
        return hits.astype('int64')

    work['INVESTOR_CHANGE_COUNT'] = _mentions('投资人变更')
    work['LEGAL_REP_CHANGE_COUNT'] = _mentions('法定代表人变更')

    # 3. Aggregate everything over a single groupby. Every aggregate shares
    #    the same CUST_NO index, so the result has exactly one row per
    #    customer.
    per_customer = work.groupby('CUST_NO')

    # Use one reference date per customer (the latest DATA_DAT). Joining on
    # every distinct (CUST_NO, DATA_DAT) pair would duplicate customers that
    # appear with more than one DATA_DAT.
    days_since_last = (
        per_customer['DATA_DAT'].max() - per_customer['ALTDATE'].max()
    ).dt.days

    features = pd.DataFrame({
        'ALTER_COUNT': per_customer.size(),
        'DAYS_SINCE_LAST_ALTER': days_since_last,
        'ALTER_TYPE_DIVERSITY': per_customer['ALTITEM'].nunique(),
        'INVESTOR_CHANGE_COUNT': per_customer['INVESTOR_CHANGE_COUNT'].sum(),
        'LEGAL_REP_CHANGE_COUNT': per_customer['LEGAL_REP_CHANGE_COUNT'].sum(),
    })

    return features.rename_axis('CUST_NO').reset_index()


# Load the 'XW_ENTINFO_ALTER' data through the project loader.
# NOTE(review): presumably this concatenates several source files into one
# DataFrame of change records — confirm against data/loader.
df_alter = loader.to_concat_df('XW_ENTINFO_ALTER')
# Aggregate the raw records into per-customer features.
df_alter_features = process_alter_data(df_alter)


# Hand the feature table to the project exporter under the name 'alter'.
# NOTE(review): the output location/format is defined by the exporter — verify there.
exporter.export_df_to_preprocess('alter', df_alter_features)
# Last expression of the cell: the notebook displays the first rows.
df_alter_features.head()

Unnamed: 0,CUST_NO,ALTER_COUNT,DAYS_SINCE_LAST_ALTER,ALTER_TYPE_DIVERSITY,INVESTOR_CHANGE_COUNT,LEGAL_REP_CHANGE_COUNT
0,000034607497713173a75a0d9910cb52,13,564,6,0,1
1,0000dfb26b1e1fb3d96c92eb3f00a3d4,2,1590,2,0,0
2,000345391aea2896517e68e4f73d5d23,9,1206,6,2,0
3,00045f4b7419408896d7c13a0eab97da,8,344,4,0,0
4,00046e5ae9162b4755252da75e750116,8,493,7,0,1
