In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore") 

In [3]:
# Read the training and test feature tables in full (nrows=None means no row cap).
train_data, test_data = (
    pd.read_csv(csv_path, nrows=None)
    for csv_path in ('train_all.csv', 'test_all.csv')
)

In [4]:
# Model inputs are all columns except 'user_id' and 'label'.
features_columns = [
    name for name in train_data.columns if name not in ('user_id', 'label')
]

# Extract plain NumPy arrays: the 'label' column is the target, the remaining
# feature columns form the train / test design matrices.
target = train_data['label'].to_numpy()
train = train_data[features_columns].to_numpy()
test = test_data[features_columns].to_numpy()

## 缺失值补全

In [5]:
from sklearn.impute import SimpleImputer

# Mean-impute missing values. The per-column means are learned from the
# training matrix only and then reused for the test matrix, so both splits are
# filled with the same statistics and nothing is estimated from the test data.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_imputer = imputer.fit_transform(train)
test_imputer = imputer.transform(test)
# NOTE(review): the feature-selection cells that follow operate on the raw
# `train` / `test` arrays, not on these imputed copies — confirm that is intended.

## 特征选取

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def feature_selection(train, train_sel, target):
    """Print 5-fold cross-validated scores before and after feature selection.

    A shallow random forest (100 trees, depth 2, fixed seed) is scored with
    ``cross_val_score`` on the full feature matrix and on the reduced one, and
    the mean and two standard deviations of each run are printed.

    Parameters
    ----------
    train : array-like
        Feature matrix with every candidate feature.
    train_sel : array-like
        Feature matrix restricted to the selected features (same rows as ``train``).
    target : array-like
        Labels aligned with the rows of ``train`` / ``train_sel``.

    Returns
    -------
    None
        Results are only printed.
    """
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=2, n_jobs=-1)
    # Keep the two score arrays under separate names: reusing one variable made
    # both report lines show the selected-feature result.
    scores_all = cross_val_score(clf, train, target, cv=5)
    scores_sel = cross_val_score(clf, train_sel, target, cv=5)
    print("No Select Accuracy: %0.2f (+/- %0.2f)" % (scores_all.mean(), scores_all.std() * 2))
    print("Features Select Accuracy: %0.2f (+/- %0.2f)" % (scores_sel.mean(), scores_sel.std() * 2))

### 删除方差较小的特征

In [8]:
from sklearn.feature_selection import VarianceThreshold

# Drop low-variance features. The threshold is p * (1 - p) with p = 0.8, i.e.
# 0.16; columns whose training variance does not exceed it are removed.
sel = VarianceThreshold(threshold=(0.8*(1-0.8)))
train_sel = sel.fit_transform(train)
# Reuse the mask learned on the training data. Re-fitting on the test matrix
# could keep a different set of columns and misalign train and test features.
test_sel = sel.transform(test)

print(train.shape, train_sel.shape)

(2000, 229) (2000, 25)


In [9]:
# Cross-validate on all features and on the variance-threshold subset (train_sel from the previous cell).
feature_selection(train, train_sel, target)

No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)


### 基于统计特征选择

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# Keep the 2 features with the highest estimated mutual information with the target.
# The selector is fitted once and applied to both matrices: mutual_info_classif
# is randomised when no random_state is given, so two independent fits could
# pick different columns for train and test.
kbest = SelectKBest(score_func=mutual_info_classif, k=2).fit(train, target)
train_sel = kbest.transform(train)
test_sel = kbest.transform(test)
print(train.shape, train_sel.shape)

(2000, 229) (2000, 2)


In [11]:
# Cross-validate on all features and on the SelectKBest subset (train_sel from the previous cell).
feature_selection(train, train_sel, target)

No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)


### 递归消除

In [12]:
from sklearn.feature_selection import RFECV
# Recursive feature elimination with 2-fold cross-validation, removing one
# feature per iteration (step=1) and ranking features with a small random forest.
clf = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=2, n_jobs=-1)
selector = RFECV(estimator=clf, step=1, cv=2)
selector.fit(train, target)

# Boolean mask of the retained features, followed by the per-feature ranking.
print(selector.support_)
print(selector.ranking_)

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa

### 基于LR拟合的参数进行变量选择

In [16]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer
# Normalise the inputs before fitting the logistic regression.
# NOTE(review): Normalizer rescales each sample (row), not each feature column —
# confirm this is wanted rather than a per-feature scaler.
normalizer = Normalizer().fit(train)
train_norm = normalizer.transform(train)
test_norm = normalizer.transform(test)

# L2-regularised logistic regression; its fitted coefficients drive the selection.
LR = LogisticRegression(penalty='l2', C=5)
LR.fit(train_norm, target)

# prefit=True makes SelectFromModel reuse the already-fitted estimator.
# The column mask is applied to the un-normalised matrices, as in the original.
model = SelectFromModel(LR, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print(train.shape, train_sel.shape)

(2000, 229) (2000, 19)


### 树模型特征选择

In [17]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Fit an extra-trees ensemble and keep the features that SelectFromModel
# retains based on the fitted importances.
# random_state is fixed (same seed as the random forests in this notebook) so
# the importances and the selected columns are reproducible across runs.
clf = ExtraTreesClassifier(n_estimators=50, random_state=2)
clf = clf.fit(train, target)

# prefit=True reuses the fitted ensemble; the same mask is applied to train and test.
model = SelectFromModel(clf, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print('训练数据未特征筛选维度', train.shape)
print('训练数据特征筛选维度后', train_sel.shape)

训练数据未特征筛选维度 (2000, 229)
训练数据特征筛选维度后 (2000, 67)


In [18]:
# Show the importance scores of the first ten feature columns of the fitted classifier.
clf.feature_importances_[:10]

array([0.08697029, 0.01649968, 0.01003274, 0.01530839, 0.01586816,
       0.0154483 , 0.01643332, 0.01468384, 0.01823924, 0.0072416 ])

In [19]:
# Pair each importance score with its column name, then list the 30 features
# with the highest importance.
df_features_import = pd.DataFrame({
    'features_import': clf.feature_importances_,
    'features_name': features_columns,
})
df_features_import.sort_values(by='features_import', ascending=False).head(30)

Unnamed: 0,features_import,features_name
0,0.08697,merchant_id
228,0.075934,xgb_clf
227,0.07076,lgb_clf
20,0.019033,brand_most_1_cnt
15,0.018472,cat_most_1
8,0.018239,time_stamp_nunique
18,0.018227,seller_most_1_cnt
12,0.016877,time_stamp_std
23,0.016666,user_cnt_1
1,0.0165,age_range


### lgb特征的重要性

In [None]:
import lightgbm
from sklearn.model_selection import train_test_split

# Hold out 40% of the training rows as a validation split for early stopping.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.4, random_state=0)

# `clf` is just an alias for the lightgbm module; the name is kept as-is.
clf = lightgbm

train_matrix = clf.Dataset(X_train, label=y_train)
# Tie the validation Dataset to the training Dataset, as LightGBM recommends
# for validation data.
test_matrix = clf.Dataset(X_test, label=y_test, reference=train_matrix)

# Only keys LightGBM understands are kept. The XGBoost-only entries
# ('tree_method', 'colsample_bylevel', 'silent') were dropped, and
# 'verbose': -1 stands in for the intent of 'silent'.
# NOTE(review): 'subsample' only takes effect when a bagging frequency
# ('subsample_freq' / 'bagging_freq') is also set — confirm whether row
# subsampling is wanted.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 2,
    'min_child_weight': 1.5,
    'num_leaves': 2**5,
    'lambda_l2': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'learning_rate': 0.03,
    'seed': 2017,
    'verbose': -1,
}
num_round = 10000
early_stopping_rounds = 100

# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of `train` is not accepted by LightGBM 4.x, while the callback form works
# on older and newer releases.
model = clf.train(
    params,
    train_matrix,
    num_boost_round=num_round,
    valid_sets=[test_matrix],
    callbacks=[clf.early_stopping(stopping_rounds=early_stopping_rounds)],
)

In [None]:
# Run the cross-validation comparison first; it only prints its results.
# NOTE(review): `train_sel` here is whatever an earlier cell produced, not a
# selection derived from the LightGBM model — confirm this is the intended input.
feature_selection(train, train_sel, target)

# Keep the importance slice as the cell's last expression so the notebook
# displays it. As a non-final statement its value was silently discarded.
model.feature_importance()[:10]