In [1]:
from data import loader, exporter
from constant import *
import pandas as pd

In [3]:
# Load the XW_ENTINFO_SHAREHOLDER table through the project loader and display it for inspection.
# NOTE(review): the SRC column in the rendered output holds 'train'/'test', which suggests
# the loader concatenates both splits -- confirm in data.loader.
df_shareholder = loader.to_concat_df('XW_ENTINFO_SHAREHOLDER')
df_shareholder

Unnamed: 0,DATA_DAT,CUST_NO,CONDATE,SUBCONAM,FUNDEDRATIO,INVTYPE,CONFORM,SH_CUST_NO,SRC
0,20020727,69d539b2ceefcfba0a61abd8d6ab06e5,19951027,9.79,7.48%,法人股东,,cf7cfaf15d5fc56881f0540918414297,train
1,20020727,56471159b66326dbc9f5e5cc40e22870,19980721,4.30,1.52%,企业法人,,b14bdf923c7f66a5cdeb8d5a0549fea0,train
2,20020727,c30738102c924a5ddf62504b24201726,19951224,5.01,10.00%,企业法人,货币,a73ef2de936ece1a1bba414c300ab702,train
3,20020727,f638f67583846f98d9a002385b4ee9d4,19980125,21.30,36.13%,企业法人,货币,471cc5134939ebd04beb29d58a7bc189,train
4,20020727,5493548a0f84d1c372b5d1da5f4db920,19980722,16.13,100.00%,企业法人,货币,98135d3ef836288393a1967979afd0c0,train
...,...,...,...,...,...,...,...,...,...
426,20020827,6f95d0d6293094f82abdf196a141eabe,19991224,4.25,0.44%,企业法人,,df9c269debe6aa82b6bc64e5ec5e959f,test
427,20020827,11328f3afe12babb69afe445d20625e7,20000827,47.16,100.00%,机关法人,货币,2fcba49cf1ddee27a97eb4cfabdb528d,test
428,20020827,c3509d4af023dfb729d6b6bdb4d5c44f,19990730,47.75,90.00%,企业法人,货币,1e2c07396861a6aeaf98b6b36d4af6be,test
429,20020827,6f95d0d6293094f82abdf196a141eabe,19991224,5.40,0.89%,合伙企业,,b2cd546c8f2ec5ab575c0a26344d5b53,test


In [8]:
def process_shareholder_data(df):
    """Aggregate shareholder records into one feature row per company.

    The input frame is left untouched: all derived values are computed on a
    private working frame, so the function can be called repeatedly on the
    same DataFrame (e.g. when a notebook cell is re-run) with identical
    results. The date columns are not parsed because no feature uses them.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per shareholder record. Required columns:
        ``CUST_NO`` (company key), ``SUBCONAM`` (numeric contribution
        amount) and ``FUNDEDRATIO`` (percentage strings such as ``'7.48%'``).

    Returns
    -------
    pandas.DataFrame
        One row per ``CUST_NO`` with the columns ``CUST_NO``,
        ``SHAREHOLDER_COUNT``, ``TOTAL_CAPITAL``, ``AVG_CAPITAL``,
        ``MAX_CAPITAL``, ``AVG_CONRATIO``, ``MAX_CONRATIO`` and
        ``TOP5_SHAREHOLDER_CONCENTRATION``. Ratios are fractions (0.5 == 50%).

    Raises
    ------
    KeyError
        If a required column is missing.
    AttributeError
        If ``FUNDEDRATIO`` is not a string-typed column.
    ValueError
        If a ``FUNDEDRATIO`` value is not a number once '%' is removed.
    """
    # Convert '7.48%' -> 0.0748 on a new frame instead of overwriting the
    # caller's column. regex=False makes '%' a literal on every pandas version.
    ratio_fraction = df['FUNDEDRATIO'].str.replace('%', '', regex=False).astype(float) / 100
    work = df[['CUST_NO', 'SUBCONAM']].assign(FUNDEDRATIO=ratio_fraction)
    by_company = work.groupby('CUST_NO')

    # Number of shareholder records per company.
    shareholder_count = by_company.size().reset_index(name='SHAREHOLDER_COUNT')

    # Total, average and largest contribution amount per company.
    capital_stats = by_company['SUBCONAM'].agg(['sum', 'mean', 'max']).reset_index()
    capital_stats.columns = ['CUST_NO', 'TOTAL_CAPITAL', 'AVG_CAPITAL', 'MAX_CAPITAL']

    # Average and largest funded ratio per company.
    ratio_stats = by_company['FUNDEDRATIO'].agg(['mean', 'max']).reset_index()
    ratio_stats.columns = ['CUST_NO', 'AVG_CONRATIO', 'MAX_CONRATIO']

    # Ownership concentration: summed ratio of the five largest shareholders.
    # method='first' breaks ties by row order so exactly five rows are kept;
    # rows with a missing ratio get a NaN rank and drop out of the filter.
    ratio_rank = by_company['FUNDEDRATIO'].rank(method='first', ascending=False)
    top5_shareholder_concentration = (
        work.loc[ratio_rank <= 5]
        .groupby('CUST_NO')['FUNDEDRATIO']
        .sum()
        .reset_index(name='TOP5_SHAREHOLDER_CONCENTRATION')
    )

    # Combine all feature groups; the count table carries every company key.
    df_shareholder_features = shareholder_count
    for feature_table in (capital_stats, ratio_stats, top5_shareholder_concentration):
        df_shareholder_features = pd.merge(
            df_shareholder_features, feature_table, on='CUST_NO', how='left'
        )

    return df_shareholder_features


# Load the shareholder table again and derive the per-company features from it.
df_shareholder = loader.to_concat_df('XW_ENTINFO_SHAREHOLDER')
df_shareholder_features = process_shareholder_data(df_shareholder)

# Hand the feature frame to the project exporter under the name 'shareholder'
# (presumably persisted for a later preprocessing step -- confirm in data.exporter).
exporter.export_df_to_preprocess('shareholder', df_shareholder_features)
# Last expression of the cell: preview the first rows of the feature frame.
df_shareholder_features.head()


Unnamed: 0,CUST_NO,SHAREHOLDER_COUNT,TOTAL_CAPITAL,AVG_CAPITAL,MAX_CAPITAL,AVG_CONRATIO,MAX_CONRATIO,TOP5_SHAREHOLDER_CONCENTRATION
0,0012d00143028f04380e0465eac333eb,4,33.85,8.4625,9.81,0.122,0.18,0.488
1,006d313aa433bbbdc4e6afa40be28de3,2,43.58,21.79,23.26,0.5,0.6,1.0
2,00c01fee8cc149b68cc858f431ce2ea8,2,25.59,12.795,12.88,0.5,0.51,1.0
3,00daf4224013680298528e230b3da236,2,29.62,14.81,14.91,0.5,0.51,1.0
4,013eb4a02f81c1852036eb6f9b9177a7,1,18.46,18.46,18.46,1.0,1.0,1.0
