In [1]:
from data import loader, exporter
from constant import *
import pandas as pd

In [3]:
def process_punishbreak_data(df: pd.DataFrame) -> pd.DataFrame:
    """Build per-customer features from dishonest-judgment (punishbreak) records.

    The input frame is not modified; all parsing and helper columns live on an
    internal copy.

    Args:
        df: One row per case. Columns read: ``CUST_NO``, ``PUBLISHDATECLEAN``
            and ``DATA_DAT`` (dates in ``%Y%m%d`` form), ``COURTNAME`` and
            ``PERFORMANCE``.

    Returns:
        A DataFrame with exactly one row per ``CUST_NO`` and the columns
        ``PUNISHBREAK_COUNT`` (number of cases),
        ``DAYS_SINCE_LAST_PUNISHBREAK`` (latest ``DATA_DAT`` minus latest
        ``PUBLISHDATECLEAN`` in days; NaN when either date is missing),
        ``PUNISHBREAK_COURT_DIVERSITY`` (distinct courts),
        ``UNPERFORMED_COUNT`` and ``UNPERFORMED_RATIO`` (count and share of
        cases whose ``PERFORMANCE`` text contains '未履行').
    """
    # Copy only the columns that are used so the caller's frame is untouched.
    work = df[['CUST_NO', 'PUBLISHDATECLEAN', 'DATA_DAT', 'COURTNAME', 'PERFORMANCE']].copy()

    # 1. Parse the date fields (%Y%m%d); unparseable values become NaT.
    for date_col in ('PUBLISHDATECLEAN', 'DATA_DAT'):
        work[date_col] = pd.to_datetime(work[date_col], format='%Y%m%d', errors='coerce')

    # 2. Total number of punishbreak cases per customer.
    punishbreak_count = work.groupby('CUST_NO').size().reset_index(name='PUNISHBREAK_COUNT')

    # 3. Days between the most recent publish date and the data date.
    #    Both dates are reduced to a single value per customer (the latest),
    #    so a customer that appears with several DATA_DAT values still yields
    #    one row instead of one row per distinct DATA_DAT.
    latest_dates = work.groupby('CUST_NO')[['PUBLISHDATECLEAN', 'DATA_DAT']].max().reset_index()
    latest_dates['DAYS_SINCE_LAST_PUNISHBREAK'] = (
        latest_dates['DATA_DAT'] - latest_dates['PUBLISHDATECLEAN']
    ).dt.days

    # 4. Number of distinct courts involved per customer.
    court_diversity = (
        work.groupby('CUST_NO')['COURTNAME']
        .nunique()
        .reset_index(name='PUNISHBREAK_COURT_DIVERSITY')
    )

    # 5. Count and ratio of unperformed cases. Only string values can match;
    #    nulls and non-string values count as "not unperformed" rather than
    #    raising TypeError on the substring test.
    work['UNPERFORMED'] = (
        work['PERFORMANCE']
        .map(lambda value: isinstance(value, str) and '未履行' in value)
        .astype(int)
    )
    unperformed_stats = work.groupby('CUST_NO')['UNPERFORMED'].agg(['sum', 'mean']).reset_index()
    unperformed_stats.columns = ['CUST_NO', 'UNPERFORMED_COUNT', 'UNPERFORMED_RATIO']

    # Combine all feature tables; every table has one row per CUST_NO.
    df_punishbreak_features = pd.merge(
        punishbreak_count,
        latest_dates[['CUST_NO', 'DAYS_SINCE_LAST_PUNISHBREAK']],
        on='CUST_NO',
        how='left',
    )
    df_punishbreak_features = pd.merge(df_punishbreak_features, court_diversity, on='CUST_NO', how='left')
    df_punishbreak_features = pd.merge(df_punishbreak_features, unperformed_stats, on='CUST_NO', how='left')

    return df_punishbreak_features


# Load the raw 'XW_ENTINFO_PUNISHBREAK' data through the project loader.
# NOTE(review): presumably returns one concatenated DataFrame — confirm in data.loader.
df_punishbreak = loader.to_concat_df('XW_ENTINFO_PUNISHBREAK')
# Aggregate the case-level rows into per-customer features.
df_punishbreak_features = process_punishbreak_data(df_punishbreak)

# Hand the feature table to the project exporter under the name 'punishbreak'.
# NOTE(review): output location/format is defined in data.exporter — confirm there.
exporter.export_df_to_preprocess('punishbreak', df_punishbreak_features)
# Trailing expression: shows the first rows as the notebook cell output.
df_punishbreak_features.head()


Unnamed: 0,CUST_NO,PUNISHBREAK_COUNT,DAYS_SINCE_LAST_PUNISHBREAK,PUNISHBREAK_COURT_DIVERSITY,UNPERFORMED_COUNT,UNPERFORMED_RATIO
0,013c2759feb664c54ead50f6b3e6e6c0,1,2221,1,1,1.0
1,061264b7227fa718e6e9fbe0ebb5b910,1,9,1,1,1.0
2,091388f072a588b6a20e553b5cdda79f,1,1059,1,1,1.0
3,0c936fbeaa2e4b8e5f7d8079dbd5ad0c,1,3,1,1,1.0
4,0fb14d8f7a39b9cf61b310b74d910b9d,3,44,3,3,1.0
