In [1]:
from data import loader, exporter
from constant import *
import pandas as pd

In [3]:
def process_yrpinfo_data(df):
    """Build one row of website features per customer from annual-report rows.

    Args:
        df: DataFrame with the columns ``CUST_NO``, ``WEBTYPE``,
            ``WEBSITNAME``, ``ANCHEDATE`` and ``DATA_DAT``. The two date
            columns are parsed with the ``%Y%m%d`` format; values that do not
            parse become ``NaT``. The input frame is not modified.

    Returns:
        DataFrame with exactly one row per ``CUST_NO`` and the columns
        ``CUST_NO``, ``WEB_COUNT`` (number of rows for the customer), one
        integer ``WEBTYPE_<value>`` column per distinct ``WEBTYPE`` value,
        ``DAYS_SINCE_LAST_YRP`` (latest ``DATA_DAT`` minus latest
        ``ANCHEDATE`` in days, NaN when either date is missing) and
        ``WEB_NAME_DIVERSITY`` (distinct ``WEBSITNAME`` values).
    """
    # 1. Parse the date fields into a separate frame so the caller's
    #    DataFrame keeps its original raw values.
    dates = pd.DataFrame({
        'CUST_NO': df['CUST_NO'],
        'ANCHEDATE': pd.to_datetime(df['ANCHEDATE'], format='%Y%m%d', errors='coerce'),
        'DATA_DAT': pd.to_datetime(df['DATA_DAT'], format='%Y%m%d', errors='coerce'),
    })

    by_customer = df.groupby('CUST_NO')

    # 2. Total number of website rows per customer.
    web_count = by_customer.size().reset_index(name='WEB_COUNT')

    # 3. Number of rows per website type. Rows with a missing WEBTYPE are
    #    dropped by groupby, so a customer may be absent from this table.
    web_type_count = df.groupby(['CUST_NO', 'WEBTYPE']).size().unstack(fill_value=0)
    web_type_count.columns = [f"WEBTYPE_{col}" for col in web_type_count.columns]
    type_columns = list(web_type_count.columns)
    web_type_count = web_type_count.reset_index()

    # 4. Days between the latest annual-report date and the data date.
    #    Both dates are aggregated per customer, so a customer that appears
    #    with several DATA_DAT values still yields a single row (the latest
    #    DATA_DAT is used) instead of one row per distinct DATA_DAT.
    latest_dates = dates.groupby('CUST_NO')[['ANCHEDATE', 'DATA_DAT']].max().reset_index()
    latest_dates['DAYS_SINCE_LAST_YRP'] = (
        latest_dates['DATA_DAT'] - latest_dates['ANCHEDATE']
    ).dt.days

    # 5. Number of distinct website names per customer (diversity).
    web_name_diversity = by_customer['WEBSITNAME'].nunique().reset_index(name='WEB_NAME_DIVERSITY')

    # Combine all features, keyed by customer.
    df_yrpinfo_features = pd.merge(web_count, web_type_count, on='CUST_NO', how='left')
    df_yrpinfo_features = pd.merge(
        df_yrpinfo_features,
        latest_dates[['CUST_NO', 'DAYS_SINCE_LAST_YRP']],
        on='CUST_NO',
        how='left',
    )
    df_yrpinfo_features = pd.merge(df_yrpinfo_features, web_name_diversity, on='CUST_NO', how='left')

    # Customers without any typed row come out of the left merge as NaN;
    # a missing count means zero, and the columns should stay integral.
    if type_columns:
        df_yrpinfo_features[type_columns] = (
            df_yrpinfo_features[type_columns].fillna(0).astype(int)
        )

    return df_yrpinfo_features

# Usage example: fetch the 'XW_ENTINFO_YRPINFO' data through the project
# loader and derive the per-customer website features from it.
# NOTE(review): loader.to_concat_df presumably returns a single DataFrame
# for the named table — confirm against data.loader.
df_yrpinfo = loader.to_concat_df('XW_ENTINFO_YRPINFO')
df_yrpinfo_features = process_yrpinfo_data(df_yrpinfo)

# Hand the feature table to the project exporter under the name 'yrpinfo'.
exporter.export_df_to_preprocess('yrpinfo', df_yrpinfo_features)

# Last expression of the cell: previews the first rows of the feature table.
df_yrpinfo_features.head()

Unnamed: 0,CUST_NO,WEB_COUNT,WEBTYPE_网店,WEBTYPE_网站,DAYS_SINCE_LAST_YRP,WEB_NAME_DIVERSITY
0,0012d00143028f04380e0465eac333eb,42,34.0,8.0,73,26
1,0017bb4f9a28639e91ac2661c009311d,1,0.0,1.0,1162,1
2,002083e9afa4acd933d872ce598dc965,6,0.0,6.0,134,1
3,0021db9e3ad0aa51099ddc2a1e4d4297,4,0.0,4.0,1517,2
4,00251d46d4529b1fcdf84744fde185ce,8,0.0,8.0,788,1
