#### III 特征筛选

In [1]:
import os.path
from constant import *
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import shap
from pandas import DataFrame
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, fbeta_score, precision_score, recall_score, roc_curve
from data import loader, exporter
import matplotlib.pyplot as plt

In [4]:
# 特征选择方法：使用不同随机种子进行训练，保留重要特征
def run_feature_importance_analysis(X, y, seeds, num_runs=30):
    """多次运行LGBM模型，统计每个特征的重要性分布"""
    
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = X.columns
    
    for run in range(num_runs):
        # 随机选择一个种子
        seed = np.random.choice(seeds)
        print(f"Run {run + 1}/{num_runs} with seed: {seed}")
        
        # 打乱特征顺序
        shuffled_features = np.random.permutation(X.columns)
        X_shuffled = X[shuffled_features]
        
        # 分割数据集
        X_train, X_valid, y_train, y_valid = train_test_split(X_shuffled, y, test_size=0.2, random_state=seed)
        
        # 定义LightGBM模型
        model = lgb.LGBMClassifier(random_state=seed, n_estimators=1000, learning_rate=0.1)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='auc',
            callbacks=[lgb.log_evaluation(period=100),
                       lgb.early_stopping(stopping_rounds=100)]  
        )
        
        # 获取特征重要性并保存
        fold_importance = model.feature_importances_
        feature_importances[f'run_{run + 1}'] = fold_importance

    # 计算每个特征的重要性分布
    feature_importances['mean_importance'] = feature_importances.iloc[:, 1:].mean(axis=1)
    feature_importances['std_importance'] = feature_importances.iloc[:, 1:].std(axis=1)
    
    return feature_importances



In [59]:
 # 读取数据
df_target = to_concat_df('TARGET')
df_flat = pd.read_csv(f'{dir_process}/df_flat.csv')
df_flat = df_flat.merge(df_target, left_on=['CUST_NO', 'is_train'], right_on=['CUST_NO', 'is_train'], how='inner')


X = df_flat[df_flat['is_train'] == 1]
X.drop(columns=['DATA_DAT', 'CARD_NO', 'CUST_NO','is_train'], inplace=True)
y = X.pop("FLAG")  # 目标标签列

# 随机种子列表
seeds = [42, 101, 202, 303, 404, 505, 606, 707, 808, 909,
         111, 222, 333, 444, 555, 666, 777, 888, 999, 1234,
         4321, 5678, 8765, 1357, 2468, 3698, 1470, 2580, 3690, 9876]

# 执行特征选择
feature_importances = run_feature_importance_analysis(X, y, seeds)

feature_importances


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Run 1/30 with seed: 2468
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.832241	valid_0's binary_logloss: 0.161898
Early stopping, best iteration is:
[39]	valid_0's auc: 0.832371	valid_0's binary_logloss: 0.159082
Run 2/30 with seed: 1357
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.845953	valid_0's binary_logloss: 0.164752
Early stopping, best iteration is:
[25]	valid_0's auc: 0.848437	valid_0's binary_logloss: 0.162876
Run 3/30 with seed: 2580
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.849809	valid_0's binary_logloss: 0.15723
Early stopping, best iteration is:
[28]	valid_0's auc: 0.854047	valid_0's binary_logloss: 0.155393
Run 4/30 with seed: 606
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.847282	valid_0's binary_logloss: 0.16174
Early stopping, best iteration is:
[27]	valid_0's auc: 0.852258	valid_0's binary_logloss: 0.159483

Unnamed: 0,feature,run_1,run_2,run_3,run_4,run_5,run_6,run_7,run_8,run_9,...,run_23,run_24,run_25,run_26,run_27,run_28,run_29,run_30,mean_importance,std_importance
0,NTRL_CUST_AGE,7,2,3,4,1,2,4,1,0,...,5,3,0,0,1,0,9,6,3.100000,3.409301
1,NTRL_CUST_SEX_CD,1,1,0,0,2,5,1,3,2,...,2,3,0,10,3,1,3,0,2.400000,3.536477
2,DAY_FA_BAL,3,2,9,5,0,2,2,0,0,...,6,2,4,3,0,4,9,7,2.933333,2.768072
3,MAVER_FA_BAL,2,3,4,0,1,5,1,1,2,...,3,0,2,0,1,4,5,3,2.200000,1.886796
4,SAVER_FA_BAL,0,1,1,2,1,9,0,4,0,...,1,1,5,7,10,0,0,5,2.700000,2.979374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
588,MAVER_FA_minus_SAVER_FA,4,2,1,0,3,0,0,1,0,...,1,0,1,1,0,0,0,1,1.266667,1.504069
589,MAVER_FA_div_YAVER_FA,2,1,6,1,0,0,1,2,0,...,0,0,4,3,1,1,6,1,1.166667,1.714319
590,MAVER_FA_minus_YAVER_FA,0,1,0,0,0,0,0,2,0,...,1,2,4,0,4,0,0,1,0.933333,1.730767
591,SAVER_FA_div_YAVER_FA,0,0,1,5,0,1,0,0,0,...,4,0,1,0,0,0,0,0,0.900000,1.989137


In [1]:
selected_features = feature_importances[(feature_importances['mean_importance'] > 3 and feature_importances['mean_importance']>3 )]['feature'].tolist()
selected_features

NameError: name 'feature_importances' is not defined

['NTRL_CUST_AGE',
 'FUND_IND',
 'MS_IND',
 'DAY_DPSA_div_DAY_TD',
 'DAY_DPSA_minus_SAVER_AUM',
 'MAVER_DPSA_minus_MAVER_TD',
 'MAVER_DPSA_minus_SAVER_TOT_IVST',
 'SAVER_DPSA_minus_MAVER_FA',
 'YAVER_DPSA_div_YAVER_TOT_DP',
 'YAVER_DPSA_minus_YAVER_AUM',
 'YAVER_DPSA_minus_DAY_FA',
 'MAVER_TD_minus_SAVER_TOT_DP',
 'SAVER_TD_div_YAVER_TOT_IVST',
 'YAVER_TD_div_MAVER_FA',
 'DAY_TOT_DP_div_DAY_AUM',
 'DAY_TOT_DP_div_MAVER_AUM',
 'YAVER_TOT_DP_div_MAVER_AUM',
 'SAVER_TOT_IVST_div_MAVER_FA',
 'DAY_AUM_div_SAVER_FA',
 'MAVER_AUM_div_SAVER_FA',
 'MAVER_AUM_div_YAVER_FA',
 'SAVER_AUM_div_YAVER_FA']