In [2]:
from data import loader, exporter
from constant import *
import pandas as pd

In [3]:
df_alter = loader.to_concat_df('XW_ENTINFO_ALTER')

In [6]:
altitem_counts = df_alter['ALTITEM'].value_counts().reset_index()
altitem_counts.columns = ['ALTITEM', 'Count']
altitem_counts

Unnamed: 0,ALTITEM,Count
0,经营范围变更,39017
1,投资人变更(包括出资额、出资方式、出资日期、投资人名称等),32176
2,经营范围变更(含业务范围变更),29563
3,章程备案,26404
4,投资人(股权)变更,23484
...,...,...
451,外国公司境内法律文书送达人备案,1
452,外方实到资本,1
453,其他变更事项,1
454,出资方式,1


In [10]:
comprehensive_mappings = {
    # 合并经营范围相关
    "经营范围变更(含业务范围变更)": "经营范围变更",
    "经营(业务)范围变更": "经营范围变更",
    "业务范围变更": "经营范围变更",

    # 合并投资人和股权相关
    "股东变更": "投资人变更",
    "投资人(股权)变更": "投资人变更",
    "投资人(股权内部转让)备案": "投资人变更",
    "股东或股份发起人改变姓名或名称变更": "投资人变更",
    "股权转让": "投资人变更",
    "投资人(包括出资额、出资方式、出资日期、投资人名称等)":"投资人变更",
    "投资人变更(包括出资额、出资方式、出资日期、投资人名称等)":"投资人变更",
    "股东、投资人变更(股权转让)":"投资人变更",
    
    # 投资人信息变更
    "投资人信息变更(包括投资人、出资额、出资方式、出资日期、投资人名称等)":"投资人信息变更",
    "投资者名称(姓名)变更":"投资人信息变更",
    "股东或股份发起人改变姓名或名称":"投资人信息变更",
    
    # 合并名称变更相关
    "名称变更(字号名称、集团名称等)": "名称变更",
    "名称变更(字号名称、集团名称等)变更": "名称变更",
    
    # 合并法定代表人相关
    "负责人变更": "法定代表人变更",
    "负责人变更(法定代表人、负责人、首席代表、合伙事务执行人等变更)": "法定代表人变更",
    "法定代表人(负责人、独资投资人)": "法定代表人变更",
    "负责人变更(法定代表人、负责人、首席代表、个体户经营者、投资人、合伙事务执行人等变更)": "法定代表人变更",
    
    # 合并地址变更相关
    "住所变更": "地址变更",
    "地址变更(住所地址、经营场所、驻在地址等变更)": "地址变更",
    "住所(经营场所)变更": "地址变更",
    
    # 合并注册资本相关
    "注册资本变更(注册资金、资金数额等变更)": "注册资本变更",
    "注册资本(金)变更": "注册资本变更",
    "实收资本变更": "注册资本变更",
    
    # 合并监事、董事等高级管理人员变更
    "高级管理人员备案(董事、监事、经理等)": "高级管理人员备案",
    "董事长变更": "董事变更",
    "监事变更": "监事备案",
    
    # 合并经营期限相关
    "期限变更(经营期限、营业期限、驻在期限等变更)": "经营期限变更",
    "经营期限(营业期限)变更": "经营期限变更"
}

df_alter['ALTITEM'] = df_alter['ALTITEM'].replace(comprehensive_mappings)
altitem_counts = df_alter['ALTITEM'].value_counts().reset_index()
altitem_counts.columns = ['ALTITEM', 'Count']
altitem_counts

Unnamed: 0,ALTITEM,Count
0,投资人变更,76862
1,经营范围变更,69599
2,地址变更,42136
3,注册资本变更,36348
4,法定代表人变更,35289
...,...,...
424,有限责任公司股东认缴出资额备案,1
425,投资人变更（包括出资额、出资方式、出资日期、投资人名称等）,1
426,营业场所,1
427,财务负责人变更,1


In [12]:
def process_alter_data(df):
    # 1. 处理日期字段， %Y%m%d 格式
    df['ALTDATE'] = pd.to_datetime(df['ALTDATE'], format='%Y%m%d', errors='coerce')
    df['DATA_DAT'] = pd.to_datetime(df['DATA_DAT'], format='%Y%m%d', errors='coerce')

    # 2. 计算每个企业的变更总次数
    alter_count = df.groupby('CUST_NO').size().reset_index(name='ALTER_COUNT')

    # 定义时间窗口（以天为单位）
    time_windows = {
        '1M': 30,   # 1个月
        '3M': 90,   # 3个月
        '6M': 180,  # 半年
        '12M': 365, # 1年
        '3Y': 1095  # 3年
    }

    # 3. 计算各时间窗口内的变更次数
    for window, days in time_windows.items():
        df[f'ALTER_COUNT_{window}'] = df.apply(lambda row: 1 if (row['DATA_DAT'] - row['ALTDATE']).days <= days else 0, axis=1)

    # 汇总每个时间窗口的变更次数
    window_counts = df.groupby('CUST_NO')[[f'ALTER_COUNT_{w}' for w in time_windows]].sum().reset_index()

    # 计算比例特征
    window_counts['RATIO_6M_TO_12M'] = window_counts['ALTER_COUNT_6M'] / window_counts['ALTER_COUNT_12M']
    window_counts['RATIO_12M_TO_3Y'] = window_counts['ALTER_COUNT_12M'] / window_counts['ALTER_COUNT_3Y']
    window_counts['RATIO_1M_TO_6M'] = window_counts['ALTER_COUNT_1M'] / window_counts['ALTER_COUNT_6M']

    # 4. 提取关键字变更次数
    keywords = ['投资人变更', '法定代表人变更', '经营范围变更', '住所变更']
    for keyword in keywords:
        df[f'{keyword}_CHANGE'] = df['ALTITEM'].apply(lambda x: 1 if pd.notnull(x) and keyword in x else 0)
        
        # 统计各时间窗口的关键字变更次数
        for window, days in time_windows.items():
            df[f'{keyword}_CHANGE_{window}'] = df.apply(
                lambda row: 1 if row[f'{keyword}_CHANGE'] == 1 and (row['DATA_DAT'] - row['ALTDATE']).days <= days else 0, axis=1
            )

    # 汇总每个关键字在各时间窗口的变更次数
    keyword_counts = df.groupby('CUST_NO')[[f'{kw}_CHANGE_{w}' for kw in keywords for w in time_windows]].sum().reset_index()

    # 5. 计算最新变更时间
    latest_alter_date = df.groupby('CUST_NO')['ALTDATE'].max().reset_index()

    # 将最新变更日期与 DATA_DAT 关联
    df_latest_with_date = pd.merge(latest_alter_date, df[['CUST_NO', 'DATA_DAT']].drop_duplicates(), on='CUST_NO', how='left')
    df_latest_with_date['DAYS_SINCE_LAST_ALTER'] = (df_latest_with_date['DATA_DAT'] - df_latest_with_date['ALTDATE']).dt.days

    # 6. 计算变更类型多样性
    alter_type_diversity = df.groupby('CUST_NO')['ALTITEM'].nunique().reset_index(name='ALTER_TYPE_DIVERSITY')

    # 合并所有特征
    df_alter_features = pd.merge(alter_count, df_latest_with_date[['CUST_NO', 'DAYS_SINCE_LAST_ALTER']], on='CUST_NO', how='left')
    df_alter_features = pd.merge(df_alter_features, alter_type_diversity, on='CUST_NO', how='left')
    df_alter_features = pd.merge(df_alter_features, window_counts, on='CUST_NO', how='left')
    df_alter_features = pd.merge(df_alter_features, keyword_counts, on='CUST_NO', how='left')

    return df_alter_features


In [14]:
df_alter_features = process_alter_data(df_alter)

exporter.export_df_to_preprocess('alter', df_alter_features)
df_alter_features.head()

Unnamed: 0,CUST_NO,ALTER_COUNT,DAYS_SINCE_LAST_ALTER,ALTER_TYPE_DIVERSITY,ALTER_COUNT_1M,ALTER_COUNT_3M,ALTER_COUNT_6M,ALTER_COUNT_12M,ALTER_COUNT_3Y,RATIO_6M_TO_12M,...,经营范围变更_CHANGE_1M,经营范围变更_CHANGE_3M,经营范围变更_CHANGE_6M,经营范围变更_CHANGE_12M,经营范围变更_CHANGE_3Y,住所变更_CHANGE_1M,住所变更_CHANGE_3M,住所变更_CHANGE_6M,住所变更_CHANGE_12M,住所变更_CHANGE_3Y
0,000034607497713173a75a0d9910cb52,13,564,6,0,0,0,0,4,,...,0,0,0,0,0,0,0,0,0,0
1,0000dfb26b1e1fb3d96c92eb3f00a3d4,2,1590,2,0,0,0,0,0,,...,0,0,0,0,0,0,0,0,0,0
2,000345391aea2896517e68e4f73d5d23,9,1206,6,0,0,0,0,0,,...,0,0,0,0,0,0,0,0,0,0
3,00045f4b7419408896d7c13a0eab97da,8,344,4,0,0,0,1,1,0.0,...,0,0,0,0,0,0,0,0,0,0
4,00046e5ae9162b4755252da75e750116,8,493,7,0,0,0,0,2,,...,0,0,0,0,0,0,0,0,0,0
