# import包

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm



# 读取数据

In [2]:
train = pd.read_csv('./BCW_train.csv')
test  = pd.read_csv('./BCW_test.csv')

In [3]:
train.shape, test.shape

((455, 33), (114, 32))

In [4]:
target = "diagnosis"
used = [x for x in train.columns if x != target]

# 冗余特征过滤

In [5]:
def redundant_feature_filter(df, threshold=0.9):
    corr = df.corr()
    columns = np.full((corr.shape[0],), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i+1, corr.shape[0]):
            if corr.iloc[i,j] >= threshold:
                if columns[j]:
                    columns[j] = False
    selected_columns = list(df.columns[columns])
    redundant_features = [x for x in df.columns if x not in selected_columns]
    return redundant_features

In [6]:
redundant_features = redundant_feature_filter(train[used])

In [7]:
redundant_features

['perimeter_mean',
 'area_mean',
 'concave points_mean',
 'perimeter_se',
 'area_se',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'concave points_worst']

# 无效、低效特征过滤

## 方差

In [13]:
def variance_filter(df, threshold=1e-10):
    low_var_features = list(df.columns[df.var() < threshold])
    return low_var_features

In [14]:
data = train[used].append(test)
low_var_features = variance_filter(data)

In [15]:
low_var_features

['f1']

## 线性模型特征重要性

In [16]:
def get_lr_importance(df, used, target):
    
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression()
    model.fit(df[used], df[target])
    importance = model.coef_[0]
    
    lr_importance = pd.DataFrame(df[used].columns)
    lr_importance.columns = ['feature']
    lr_importance['importance'] = abs(importance)
    lr_importance = lr_importance.sort_values(by = 'importance', ascending = False).reset_index(drop = True)
    
    return lr_importance

In [17]:
used = [x for x in train.columns if x != target]
lr_importance = get_lr_importance(train, used, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [18]:
lr_importance

Unnamed: 0,feature,importance
0,radius_worst,1.029468
1,radius_mean,0.978527
2,concavity_worst,0.655604
3,texture_worst,0.542205
4,compactness_worst,0.530433
5,texture_mean,0.467293
6,texture_se,0.401292
7,concavity_mean,0.241625
8,perimeter_mean,0.213727
9,perimeter_worst,0.19595


## 树模型特征重要性

In [19]:
def get_rf_importance(df, used, target):
    
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier()
    model.fit(df[used], df[target])
    importance = model.feature_importances_
    
    rf_importance = pd.DataFrame(df[used].columns)
    rf_importance.columns = ['feature']
    rf_importance['importance'] = abs(importance)
    rf_importance = rf_importance.sort_values(by = 'importance', ascending = False).reset_index(drop = True)
    
    return rf_importance

In [20]:
used = [x for x in train.columns if x != target]
rf_importance = get_rf_importance(train, used, target)

In [21]:
rf_importance

Unnamed: 0,feature,importance
0,concave points_worst,0.132459
1,perimeter_worst,0.127706
2,concave points_mean,0.123072
3,radius_worst,0.106986
4,area_worst,0.103829
5,area_se,0.045091
6,concavity_mean,0.042098
7,perimeter_mean,0.040632
8,radius_mean,0.037422
9,concavity_worst,0.036329


## permutation importance

In [23]:
def get_permutation_importance(df, used, target):
    
    from sklearn.linear_model import LogisticRegression
    from sklearn.inspection import permutation_importance

    used = [x for x in df.columns if x != target]
    
    clf = LogisticRegression().fit(df[used], df[target])
    result = permutation_importance(clf, df[used], df[target], n_repeats=10, random_state=0)
    permutation_importance = pd.DataFrame(df[used].columns)
    permutation_importance.columns = ['feature']
    permutation_importance['importance'] = result.importances_mean
    permutation_importance = permutation_importance.sort_values(by = 'importance', ascending = False).reset_index(drop = True)
    permutation_importance
    
    return permutation_importance

In [24]:
used = [x for x in train.columns if x != target]
permutation_importance = get_permutation_importance(train, used, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [25]:
permutation_importance

Unnamed: 0,feature,importance
0,area_worst,0.33956
1,perimeter_worst,0.308132
2,area_mean,0.20989
3,texture_worst,0.161538
4,area_se,0.090549
5,perimeter_mean,0.063736
6,radius_worst,0.052967
7,radius_mean,0.040659
8,texture_mean,0.03978
9,concave points_se,0.0


# 过拟合特征过滤

## null importance

In [28]:
# 原始特征重要性-随机打乱后特征的重要性。验证特征的稳定性

In [74]:
def get_null_importance(df, used, target):
    
    def get_feature_importances(df, used, target, shuffle, seed=None):

        from sklearn.ensemble import RandomForestClassifier

        y = df[target].copy()
        if shuffle:
            y = df[target].copy().sample(frac=1.0)

        model = RandomForestClassifier()
        model.fit(df[used], y)

        imp_df = pd.DataFrame()
        imp_df["feature"] = used
        imp_df["importance"] = model.feature_importances_

        return imp_df

    actual_imp_df = get_feature_importances(df, used, target, shuffle=False)
    
    shuffle_imp_df = pd.DataFrame()
    nb_runs = 50
    for i in tqdm(range(nb_runs), total=nb_runs):
        imp_df = get_feature_importances(df, used, target, shuffle=True)
        imp_df['run'] = i + 1 
        shuffle_imp_df = pd.concat([shuffle_imp_df, imp_df], axis=0)
    
    null_imp_df = pd.DataFrame()
    null_imp_df['feature'] = used
    null_imp_df['importance'] = 0
    for feature in used:
        null_imp_df.loc[null_imp_df['feature'] == feature, 'importance'] = \
            actual_imp_df.loc[actual_imp_df['feature'] == feature, 'importance'].values[0] -\
            shuffle_imp_df.loc[shuffle_imp_df['feature'] == feature, 'importance'].mean()
    
    null_imp_df = null_imp_df.sort_values(by = 'importance', ascending = False).reset_index(drop = True)
    
    return null_imp_df

In [76]:
null_imp_df = get_null_importance(train, used, target)

100%|██████████| 50/50 [00:12<00:00,  4.01it/s]


In [77]:
null_imp_df

Unnamed: 0,feature,importance
0,concave points_worst,0.169434
1,concave points_mean,0.098043
2,perimeter_worst,0.091603
3,radius_worst,0.076341
4,area_worst,0.070148
5,perimeter_mean,0.005599
6,area_mean,0.000811
7,concavity_mean,0.000714
8,f2,0.0
9,f1,0.0


## adversarial validation

In [95]:
def adversarial_validation(train, test, used, target, threshold=0.6):
    
    from sklearn.model_selection import KFold
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score
    
    train[target] = 0
    test[target] = 1
    train_test = pd.concat([train, test], axis=0)
    
    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=889)
    
    removed_features = []
    
    while True:
        print('#' * 50)
        AUCs = []
        feature_importances = pd.DataFrame()
        feature_importances['feature'] = train_test[used].columns
        
        for fold_n, (train_index, valid_index) in enumerate(folds.split(train_test[used])):
            
            model = RandomForestClassifier()
            model.fit(train_test[used].iloc[train_index], train_test[target].iloc[train_index])
            
            feature_importances['fold_{}'.format(fold_n + 1)] = model.feature_importances_
            
            val = model.predict(train_test[used].iloc[valid_index])

            auc_score = roc_auc_score(train_test[target].iloc[valid_index], val)
            AUCs.append(auc_score)
            
        mean_auc = np.mean(AUCs)
        print(f'Mean AUC: {mean_auc}')
        
        feature_importances['average'] = feature_importances[
            [x for x in feature_importances.columns if x != "feature"]].mean(axis=1)
        feature_importances = feature_importances.sort_values(by="average", ascending=False).reset_index(drop = True)
        
        if mean_auc > threshold:
            cur_removed_feature = feature_importances.loc[0, 'feature']
            print(f"remove feature {cur_removed_feature}")
            removed_features.append(cur_removed_feature)
            used = [x for x in used if x not in removed_features]
        else:
            return removed_features
            

In [96]:
removed_features = adversarial_validation(train, test, used, target, threshold=0.6)

##################################################
Mean AUC: 1.0
remove feature f2
##################################################
Mean AUC: 0.4977818627450981


In [97]:
removed_features

['f2']