In [2]:
from data import loader, exporter
from constant import *
import pandas as pd

In [6]:
def process_taxdeclare_data(df):
    """Aggregate tax-declaration records into one feature row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw declaration records. Must contain the columns ``CUST_NO``,
        ``BTD_TOTALSALE`` and ``BTD_TAXPAYABLE``. Any other columns
        (e.g. ``BTD_DECLARDATE``, ``DATA_DAT``) are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``CUST_NO`` (sorted by ``CUST_NO``) with columns:
        ``CUST_NO``, ``TAXDECLARE_COUNT`` (number of records),
        ``TOTAL_SALES`` / ``AVG_SALES`` / ``MAX_SALES`` (sum / mean / max of
        ``BTD_TOTALSALE``) and ``TOTAL_TAX`` / ``AVG_TAX`` / ``MAX_TAX``
        (sum / mean / max of ``BTD_TAXPAYABLE``).

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.

    Notes
    -----
    ``df`` is never modified, so the function can be re-run safely on the
    same frame. The date columns are not needed for these aggregates and are
    therefore left untouched; parse them on a copy (``df.assign(...)``) when
    date-based features are added.
    """
    # TODO: this table deserves much more thorough feature mining.

    # A single groupby pass with named aggregation yields every statistic at
    # once, which avoids building three intermediate frames and merging them.
    # 'size' counts all rows of a group (including rows whose sale value is
    # NaN), i.e. the number of declarations per customer.
    df_taxdeclare_features = (
        df.groupby('CUST_NO')
        .agg(
            TAXDECLARE_COUNT=('BTD_TOTALSALE', 'size'),
            TOTAL_SALES=('BTD_TOTALSALE', 'sum'),
            AVG_SALES=('BTD_TOTALSALE', 'mean'),
            MAX_SALES=('BTD_TOTALSALE', 'max'),
            TOTAL_TAX=('BTD_TAXPAYABLE', 'sum'),
            AVG_TAX=('BTD_TAXPAYABLE', 'mean'),
            MAX_TAX=('BTD_TAXPAYABLE', 'max'),
        )
        .reset_index()
    )
    return df_taxdeclare_features

# Load the raw tax-declaration records through the project loader.
# NOTE(review): presumably to_concat_df concatenates all parts of the
# 'XW_ENTINFO_TAXDECLARE' table into a single DataFrame — confirm in data.loader.
df_taxdeclare = loader.to_concat_df('XW_ENTINFO_TAXDECLARE')
# Build the per-CUST_NO feature table from the raw records.
df_taxdeclare_features = process_taxdeclare_data(df_taxdeclare)

# Hand the features to the project exporter under the name 'taxdeclare'.
# NOTE(review): destination path and file format are decided inside
# data.exporter — verify there.
exporter.export_df_to_preprocess('taxdeclare', df_taxdeclare_features)
# Last expression of the cell: show the first rows as the cell output.
df_taxdeclare_features.head()

Unnamed: 0,CUST_NO,TAXDECLARE_COUNT,TOTAL_SALES,AVG_SALES,MAX_SALES,TOTAL_TAX,AVG_TAX,MAX_TAX
0,00021b96bf933024181336f4862bd1cb,15,1129.0,75.266667,394.87,104.78,6.985333,49.38
1,000322f5bbe1691a0e9c145a22606556,2,0.0,0.0,0.0,0.0,0.0,0.0
2,00045f4b7419408896d7c13a0eab97da,16,803.56,50.2225,199.12,156.06,9.75375,38.56
3,0007887cd841729063a0bb4248ef892b,8,280.46,35.0575,71.27,109.08,13.635,27.54
4,00079f3acaf3f8926b580d162ae1090f,27,1859.85,68.883333,301.38,387.88,14.365926,58.99
