In [1]:
import json
import math
import time
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import gridspec
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold,KFold,train_test_split
warnings.filterwarnings('ignore')

In [2]:
# Load the training table used by the cells below. The path is relative to
# the working directory the notebook kernel was started in.
train = pd.read_csv('./data/new_data/lgbtrain.csv')

In [3]:
# Columns that must not be fed to the model: the target ('label') plus
# identifier / raw columns.
# NOTE(review): 'launch_seq', 'playtime_seq', 'duration_prefer' and
# 'interact_prefer' look like raw columns whose expanded '<name>_<k>'
# versions are kept as features -- confirm against the data preparation.
dropfea = [
    'label',
    'user_id',
    'end_date',
    'launch_seq',
    'playtime_seq',
    'duration_prefer',
    'interact_prefer',
]
# Every remaining column of `train`, in its original column order.
features = [col for col in train.columns if col not in dropfea]

In [4]:
def get_feature_importances(data, shuffle, seed=None):
    """Fit one LightGBM regressor on ``data`` and return its split importances.

    The rows are split 75% / 25% into a training and a validation part
    (fixed ``random_state=42``); the validation part drives early stopping.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``label`` column, which is used as the regression
        target. Every column not listed in ``excluded`` below is used as a
        model input.
    shuffle : bool
        If true, the target values are randomly permuted across rows before
        fitting, so the returned importances come from a model trained on
        labels that were decoupled from the features ("null importance" run).
    seed : int or None, optional
        Random state of the label permutation. ``None`` (the default) draws a
        fresh, non-reproducible permutation on every call.

    Returns
    -------
    pandas.DataFrame
        One row per feature, with columns ``feature`` and
        ``importance_split``.
    """
    excluded = {'label', 'user_id', 'end_date', 'launch_seq', 'playtime_seq',
                'duration_prefer', 'interact_prefer'}
    # Build the feature list from the frame that was actually passed in, so
    # the function does not silently depend on a notebook-level `train`.
    features = [col for col in data.columns if col not in excluded]
    y = data['label'].copy()
    x = data[features]

    if shuffle:
        # Here you could as well use a binomial distribution.
        # The permuted values are re-attached to the original index so that
        # the shuffle is positional and cannot be undone by any index
        # alignment downstream.
        permuted = y.sample(frac=1.0, random_state=seed).to_numpy()
        y = pd.Series(permuted, index=data.index, name='label')

    xtrain, xval, ytrain, yval = train_test_split(x, y, random_state=42, test_size=0.25)

    # NOTE(review): per the LightGBM parameter docs, `subsample`
    # (bagging_fraction) only takes effect when `subsample_freq` is non-zero;
    # it is left at its default here, so row subsampling is presumably off.
    # NOTE(review): device='gpu' needs a GPU-enabled LightGBM build.
    model = lgb.LGBMRegressor(
        num_leaves=32,
        max_depth=7,
        learning_rate=0.08,
        n_estimators=10000,
        importance_type='split',
        subsample=0.8,
        feature_fraction=0.8,
        reg_alpha=0.5,
        reg_lambda=0.5,
        random_state=2021,
        objective='regression',
        metric='mse',
        device='gpu')

    # Fit the model.
    # NOTE(review): early stopping is also active for shuffled runs, which
    # will likely stop after far fewer trees than the real run; raw split
    # counts of the two are then not on the same scale. Consider a fixed
    # number of rounds for both -- TODO confirm.
    # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments were
    # removed in LightGBM 4.0 (replaced by callbacks); this call targets the
    # older API.
    model = model.fit(
        xtrain,
        ytrain,
        eval_metric="mse",
        eval_set=[(xval, yval)],
        verbose=200,
        early_stopping_rounds=100
    )

    # Collect the importances. Only the split count is reported because the
    # estimator was created with importance_type='split'.
    imp_df = pd.DataFrame({
        "feature": list(features),
        "importance_split": model.feature_importances_,
    })
    return imp_df


In [5]:
# Reference run: importances of the model trained on the real, unshuffled labels.
actual_imp_df = get_feature_importances(train, shuffle=False)
actual_imp_df

[200]	valid_0's l2: 1.6344
[400]	valid_0's l2: 1.63139
[600]	valid_0's l2: 1.63111


Unnamed: 0,feature,importance_split
0,father_id_score,998
1,cast_id_score,721
2,tag_score,660
3,device_type,131
4,device_ram,648
...,...,...
97,interact_prefer_6,86
98,interact_prefer_7,34
99,interact_prefer_8,72
100,interact_prefer_9,56


In [6]:
# Build the null-importance distribution: refit the model `nb_runs` times on
# shuffled labels and stack the per-run importance tables.
nb_runs = 50
start = time.time()
dsp = ''
null_runs = []
for i in range(nb_runs):
    # Get current run importances. A distinct seed is forwarded per run so a
    # given permutation can be reproduced (this only has an effect insofar as
    # get_feature_importances uses its `seed` argument).
    imp_df = get_feature_importances(data=train, shuffle=True, seed=i)
    imp_df['run'] = i + 1
    # Collect the runs and concatenate once after the loop; concatenating
    # inside the loop re-copies all previous rows on every iteration.
    null_runs.append(imp_df)
    # Erase previous message (one backspace per character printed last time)
    print('\b' * len(dsp), end='', flush=True)
    # Display current run and time used
    spent = (time.time() - start) / 60
    dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (i + 1, nb_runs, spent)
    print(dsp, end='', flush=True)

# pd.concat raises on an empty list, so fall back to an empty frame when
# nb_runs is 0.
null_imp_df = pd.concat(null_runs, axis=0) if null_runs else pd.DataFrame()


Done with   50 of   50 (Spent   2.2 min)

In [7]:
# Score every feature by comparing its actual split importance with the
# split importances it received in the shuffled-label runs:
#     split_score = log(1e-10 + actual / (1 + 75th percentile of null))
# The "+ 1" keeps the denominator away from zero, and the 1e-10 keeps the log
# finite when the actual importance is 0.
# No gain-based score is computed: the importance frames used here only
# carry an 'importance_split' column.
feature_scores = []
for feat_name in actual_imp_df['feature'].unique():
    in_null = null_imp_df['feature'] == feat_name
    in_actual = actual_imp_df['feature'] == feat_name

    null_splits = null_imp_df.loc[in_null, 'importance_split'].values
    actual_split = actual_imp_df.loc[in_actual, 'importance_split'].mean()

    null_p75 = np.percentile(null_splits, 75)
    split_score = np.log(1e-10 + actual_split / (1 + null_p75))  # avoid divide by zero
    feature_scores.append((feat_name, split_score))

scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score'])
scores_df

Unnamed: 0,feature,split_score
0,father_id_score,5.296315
1,cast_id_score,4.788880
2,tag_score,4.743040
3,device_type,4.875197
4,device_ram,4.527981
...,...,...
97,interact_prefer_6,3.761200
98,interact_prefer_7,3.526361
99,interact_prefer_8,4.276666
100,interact_prefer_9,4.025352


In [8]:
# Final feature set for modelling: features whose split_score is strictly
# positive. Only the split-based score is used here; scores_df holds no
# gain-based score.
feats = scores_df.loc[scores_df['split_score'] > 0, 'feature'].tolist()
print("筛选后剩余的特征：{}".format(feats))

筛选后剩余的特征：['father_id_score', 'cast_id_score', 'tag_score', 'device_type', 'device_ram', 'device_rom', 'sex', 'age', 'education', 'occupation_status', 'territory_score', 'launch_seq_0', 'launch_seq_1', 'launch_seq_2', 'launch_seq_3', 'launch_seq_4', 'launch_seq_5', 'launch_seq_6', 'launch_seq_7', 'launch_seq_8', 'launch_seq_9', 'launch_seq_10', 'launch_seq_11', 'launch_seq_12', 'launch_seq_13', 'launch_seq_14', 'launch_seq_15', 'launch_seq_16', 'launch_seq_17', 'launch_seq_18', 'launch_seq_19', 'launch_seq_20', 'launch_seq_21', 'launch_seq_22', 'launch_seq_23', 'launch_seq_24', 'launch_seq_25', 'launch_seq_26', 'launch_seq_27', 'launch_seq_28', 'launch_seq_29', 'launch_seq_30', 'launch_seq_31', 'playtime_seq_0', 'playtime_seq_1', 'playtime_seq_2', 'playtime_seq_3', 'playtime_seq_4', 'playtime_seq_5', 'playtime_seq_6', 'playtime_seq_7', 'playtime_seq_8', 'playtime_seq_9', 'playtime_seq_10', 'playtime_seq_11', 'playtime_seq_12', 'playtime_seq_13', 'playtime_seq_14', 'playtime_seq_15', 'pl

In [12]:

# Features rejected by the selection: split_score <= 0, i.e. the actual split
# importance is roughly no larger than 1 + the 75th percentile of the
# shuffled-label importances.
fats = scores_df.loc[scores_df['split_score'] <= 0, 'feature'].tolist()
print("筛选的特征：{}".format(fats))

筛选的特征：['duration_prefer_0', 'duration_prefer_13', 'duration_prefer_14', 'duration_prefer_15', 'interact_prefer_5']
