In [1]:
from data import loader, exporter
from constant import *
import pandas as pd

In [2]:
def process_punished_data(df):
    """Aggregate enforcement ("punished") records into one feature row per customer.

    The input is read-only: date parsing is done on an internal copy, so the
    caller's DataFrame is left untouched.

    Args:
        df: DataFrame with the columns ``CUST_NO``, ``REGDATECLEAN``,
            ``DATA_DAT``, ``EXECMONEY`` and ``COURTNAME``. The two date
            columns are parsed with the ``%Y%m%d`` format; values that do not
            parse become ``NaT``.

    Returns:
        DataFrame with exactly one row per ``CUST_NO`` and the columns
        ``CUST_NO``, ``PUNISHED_COUNT``, ``PUNISHED_AMOUNT_TOTAL``,
        ``PUNISHED_AMOUNT_AVG``, ``PUNISHED_AMOUNT_MAX``,
        ``DAYS_SINCE_LAST_PUNISHED`` and ``COURT_DIVERSITY``.
    """
    # Copy only the columns that are used so the caller's frame is never mutated.
    records = df[['CUST_NO', 'EXECMONEY', 'COURTNAME']].copy()

    # 1. Parse the date fields (%Y%m%d); unparseable values are coerced to NaT.
    records['REGDATECLEAN'] = pd.to_datetime(df['REGDATECLEAN'], format='%Y%m%d', errors='coerce')
    records['DATA_DAT'] = pd.to_datetime(df['DATA_DAT'], format='%Y%m%d', errors='coerce')

    by_customer = records.groupby('CUST_NO')

    # 2. Total number of enforcement cases per customer.
    punished_count = by_customer.size().reset_index(name='PUNISHED_COUNT')

    # 3. Total, average and maximum enforced amount per customer.
    punished_amount_stats = by_customer['EXECMONEY'].agg(['sum', 'mean', 'max']).reset_index()
    punished_amount_stats.columns = ['CUST_NO', 'PUNISHED_AMOUNT_TOTAL', 'PUNISHED_AMOUNT_AVG', 'PUNISHED_AMOUNT_MAX']

    # 4. Days between the most recent case date and the data date.
    #    Both dates are reduced to their per-customer maximum in a single
    #    groupby. Joining against every distinct (CUST_NO, DATA_DAT) pair would
    #    emit several rows for a customer that has more than one DATA_DAT.
    latest_dates = by_customer[['REGDATECLEAN', 'DATA_DAT']].max().reset_index()
    latest_dates['DAYS_SINCE_LAST_PUNISHED'] = (latest_dates['DATA_DAT'] - latest_dates['REGDATECLEAN']).dt.days

    # 5. Number of distinct courts involved per customer.
    court_diversity = by_customer['COURTNAME'].nunique().reset_index(name='COURT_DIVERSITY')

    # Combine all features; every right-hand frame is unique on CUST_NO.
    df_punished_features = pd.merge(punished_count, punished_amount_stats, on='CUST_NO', how='left')
    df_punished_features = pd.merge(df_punished_features, latest_dates[['CUST_NO', 'DAYS_SINCE_LAST_PUNISHED']], on='CUST_NO', how='left')
    df_punished_features = pd.merge(df_punished_features, court_diversity, on='CUST_NO', how='left')

    return df_punished_features


# Load the raw 'XW_ENTINFO_PUNISHED' records through the project loader
# (presumably concatenates several source files into one frame -- confirm in data.loader).
df_punished = loader.to_concat_df('XW_ENTINFO_PUNISHED')
# Aggregate the raw records into per-CUST_NO features.
df_punished_features = process_punished_data(df_punished)

# Hand the feature table to the project exporter under the name 'punished'.
exporter.export_df_to_preprocess('punished', df_punished_features)
# Trailing expression: as the last line of a notebook cell it displays the first rows.
df_punished_features.head()

Unnamed: 0,CUST_NO,PUNISHED_COUNT,PUNISHED_AMOUNT_TOTAL,PUNISHED_AMOUNT_AVG,PUNISHED_AMOUNT_MAX,DAYS_SINCE_LAST_PUNISHED,COURT_DIVERSITY
0,00118b125932d2800074338c0a4cb1fe,1,90.26,90.26,90.26,34,1
1,0017bb4f9a28639e91ac2661c009311d,1,32.02,32.02,32.02,46,1
2,005dc4db38ca49f7c6a1373a122939b5,1,99.91,99.91,99.91,74,1
3,01469eb0d9a8d23c356d4e4de62520de,1,67.8,67.8,67.8,16,1
4,02790bcb4edcc344072be0744dd3ac8b,1,59.41,59.41,59.41,44,1
