# import包

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, FastICA, TruncatedSVD, NMF
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans, SpectralClustering



# 构造数据

In [2]:
# Fixed seed so that Restart Kernel -> Run All regenerates the same synthetic frame.
SEED = 42
N_SAMPLES = 1000
rng = np.random.default_rng(SEED)

# Synthetic demo data: two integer columns with values in {1, 2, 3, 4}, two
# continuous columns in [0, 1] rounded to 2 decimals, and a numeric label on a
# 0-100 scale.
df = pd.DataFrame({
    "discrete_var1": rng.integers(1, 5, size=N_SAMPLES),
    "discrete_var2": rng.integers(1, 5, size=N_SAMPLES),
    "continuous_var1": rng.random(N_SAMPLES).round(2),
    "continuous_var2": rng.random(N_SAMPLES).round(2),
    # Round again after scaling: e.g. 0.29 * 100 is 28.999999999999996 in
    # floating point, which would otherwise leak into the label column.
    "label": (rng.random(N_SAMPLES).round(2) * 100).round(),
})

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set (random_state pinned so the split is
# repeatable), then give each part a fresh 0..n-1 index.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

# 目标编码

In [4]:
def fe_target_encoding(train, test, key, label, k = 5, fill_value = -1):
    """Out-of-fold mean target encoding of `key` against `label`.

    `train` is split into `k` consecutive folds (KFold without shuffling).
    For every fold, the per-`key` mean of `label` is computed on the other
    k-1 folds and used to encode the held-out rows, so a training row is never
    encoded with its own label. Test rows are encoded with the average of the
    fold-level means.

    Parameters
    ----------
    train : pd.DataFrame
        Frame containing the `key` column(s) and the `label` column.
    test : pd.DataFrame
        Frame containing the `key` column(s).
    key : str or list of str
        Column name(s) to group by. A single string is accepted as well.
    label : str
        Name of the target column in `train`.
    k : int, default 5
        Number of folds.
    fill_value : float, default -1
        Encoding used when a key combination was not seen in the fitting data.

    Returns
    -------
    (np.ndarray, np.ndarray)
        `oof_train` with one value per row of `train` and `oof_test` with one
        value per row of `test`, both in the row order of their input frame.

    Notes
    -----
    * Rows are selected by position (`iloc`), so `train` does not need a
      0..n-1 index.
    * A training row whose key is missing from its fitting folds gets
      `fill_value` (the same sentinel the test encoding uses) instead of NaN.
    * For a test row, only folds that actually saw the key contribute to the
      average; `fill_value` is used only if no fold saw it. When every key is
      present in every fold the result equals the plain mean over k folds.
    """
    from sklearn.model_selection import KFold

    keys = [key] if isinstance(key, str) else list(key)

    oof_train = np.full(train.shape[0], fill_value, dtype=float)
    test_sum = np.zeros(test.shape[0])   # sum of fold-level encodings per test row
    test_hits = np.zeros(test.shape[0])  # number of folds that knew the row's key

    for fit_idx, valid_idx in KFold(n_splits = k).split(train):
        # KFold yields positional indices, hence iloc rather than loc.
        fold_map = train.iloc[fit_idx].groupby(keys, as_index = False)[label].mean()

        # A left merge keeps the left frame's row order, so .values lines up
        # with valid_idx / the test rows.
        valid_enc = train[keys].iloc[valid_idx].merge(fold_map, on = keys, how = 'left')[label]
        oof_train[valid_idx] = valid_enc.fillna(fill_value).values

        test_enc = test[keys].merge(fold_map, on = keys, how = 'left')[label].values.astype(float)
        seen = ~np.isnan(test_enc)
        test_sum[seen] += test_enc[seen]
        test_hits[seen] += 1

    oof_test = np.full(test.shape[0], fill_value, dtype=float)
    np.divide(test_sum, test_hits, out = oof_test, where = test_hits > 0)
    return oof_train, oof_test

In [5]:
# Encode discrete_var1 against the label using 5-fold out-of-fold means.
key, label = ['discrete_var1'], 'label'
oof_train, oof_test = fe_target_encoding(train=train, test=test, key=key, label=label, k=5)

In [6]:
# Attach the encodings returned by fe_target_encoding as a new column on each frame.
train['target_encoding_discrete_var1'], test['target_encoding_discrete_var1'] = oof_train, oof_test

In [7]:
# Display the training frame, which now carries the target_encoding_discrete_var1 column.
train

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,label,target_encoding_discrete_var1
0,4,3,0.92,0.46,29.0,47.768293
1,4,2,0.04,0.59,78.0,47.768293
2,1,2,0.20,0.10,79.0,53.018519
3,4,1,0.97,0.74,67.0,47.768293
4,3,4,0.94,0.01,17.0,54.713333
...,...,...,...,...,...,...
795,4,4,0.48,0.95,96.0,49.451429
796,2,4,0.23,0.36,13.0,51.670968
797,4,3,0.65,0.65,96.0,49.451429
798,1,1,0.40,0.52,87.0,52.733333


In [8]:
# Display the test frame, which now carries the target_encoding_discrete_var1 column.
test

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,label,target_encoding_discrete_var1
0,3,4,0.87,0.11,81.0,54.431142
1,2,4,0.09,0.14,50.0,52.114083
2,1,1,0.50,0.62,84.0,51.305675
3,3,3,0.61,0.91,37.0,54.431142
4,4,2,0.74,0.73,86.0,48.739171
...,...,...,...,...,...,...
195,3,1,0.57,0.41,82.0,54.431142
196,4,3,0.11,0.14,63.0,48.739171
197,2,2,0.44,0.96,82.0,52.114083
198,4,1,0.57,0.34,5.0,48.739171


# GBDT特征

In [9]:
def fe_gbdt(train, test, used, category, label, params=None, num_boost_round=30):
    """Train a LightGBM model on `train` and return leaf-index features.

    The model is fitted on `train[used]` against `train[label]`; both frames
    are then passed through `predict(..., pred_leaf=True)` and the resulting
    leaf indices are returned as columns named ``gbdt_1 .. gbdt_n``.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames containing the `used` columns; `train` also contains `label`.
    used : list of str
        Feature columns fed to the model.
    category : list of str
        Subset of `used` passed to LightGBM as `categorical_feature`.
    label : str
        Target column in `train`.
    params : dict, optional
        LightGBM parameters merged over the built-in defaults, e.g.
        ``{"objective": "regression", "metric": "l2"}``.
    num_boost_round : int, default 30
        Number of boosting rounds.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Leaf-index frames for `train` and `test`. Each keeps the index of its
        input frame so that ``pd.concat([frame, leaves], axis=1)`` aligns
        row by row.

    Notes
    -----
    NOTE(review): the default objective is 'binary' with metric 'auc'.
    LightGBM's binary objective is meant for 0/1 labels - confirm against the
    LightGBM parameter docs and pass `params` for any other kind of target.
    """
    import lightgbm as lgb

    lgb_params = {'num_leaves': 41,
                  'min_child_weight': 0.03,
                  'feature_fraction': 0.3,
                  'bagging_fraction': 0.4,
                  'min_data_in_leaf': 96,
                  'objective': 'binary',
                  'max_depth': -1,
                  'learning_rate': 0.01,
                  "boosting_type": "gbdt",
                  "bagging_seed": 11,
                  "metric": 'auc',
                  "verbosity": -1,
                  'reg_alpha': 0.4,
                  'reg_lambda': 0.6,
                  'random_state': 47,
                  'num_threads': -1
                 }
    if params:
        lgb_params.update(params)

    trn_data = lgb.Dataset(train[used], label=train[label], categorical_feature = category)

    # Log the training metric every 10 rounds. Newer LightGBM releases expose
    # this as the log_evaluation callback and no longer accept `verbose_eval`;
    # older releases only know the keyword.
    train_kwargs = {'num_boost_round': num_boost_round, 'valid_sets': [trn_data]}
    if hasattr(lgb, 'log_evaluation'):
        train_kwargs['callbacks'] = [lgb.log_evaluation(period=10)]
    else:
        train_kwargs['verbose_eval'] = 10
    clf = lgb.train(lgb_params, trn_data, **train_kwargs)

    def _leaf_frame(frame):
        # One column per leaf-index column returned by the model.
        leaves = np.asarray(clf.predict(frame[used], pred_leaf=True))
        if leaves.ndim == 1:
            # A flat result is treated as a single column of leaf indices.
            leaves = leaves.reshape(-1, 1)
        # Name the columns from the actual width rather than num_boost_round,
        # so a model that ends up with a different number of leaf columns
        # does not raise a length-mismatch error.
        columns = ["gbdt_" + str(i) for i in range(1, leaves.shape[1] + 1)]
        return pd.DataFrame(leaves, index=frame.index, columns=columns)

    return _leaf_frame(train), _leaf_frame(test)

In [10]:
# Feature columns for the GBDT model: the two discrete columns are treated as
# categorical, the two continuous ones are used as plain numeric inputs.
category = ["discrete_var1", "discrete_var2"]
used = category + ["continuous_var1", "continuous_var2"]
label = 'label'

# NOTE(review): `label` holds values on a 0-100 scale, while fe_gbdt trains with
# objective 'binary' by default - confirm this pairing is intended.
train_lgb_feature, test_lgb_feature = fe_gbdt(train, test, used=used, category=category, label=label)

[10]	training's auc: 0.98183
[20]	training's auc: 0.983083
[30]	training's auc: 0.983083




In [11]:
# Display the leaf-index features computed for the training rows (columns gbdt_1 .. gbdt_30).
train_lgb_feature

Unnamed: 0,gbdt_1,gbdt_2,gbdt_3,gbdt_4,gbdt_5,gbdt_6,gbdt_7,gbdt_8,gbdt_9,gbdt_10,...,gbdt_21,gbdt_22,gbdt_23,gbdt_24,gbdt_25,gbdt_26,gbdt_27,gbdt_28,gbdt_29,gbdt_30
0,0,2,0,0,0,0,0,2,0,2,...,0,0,2,0,2,0,0,0,0,0
1,1,0,1,0,0,1,1,0,0,0,...,1,1,0,0,0,1,1,1,0,0
2,1,0,1,0,0,1,1,0,0,0,...,1,1,0,0,0,1,1,1,0,0
3,0,2,0,0,0,0,0,2,0,2,...,0,0,2,0,2,0,0,0,0,0
4,1,2,1,1,1,1,1,2,1,2,...,1,1,2,1,2,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,1,0,1,0,0,1,1,0,0,0,...,1,1,0,0,0,1,1,1,0,0
796,1,0,1,1,1,1,1,0,1,0,...,1,1,0,1,0,1,1,1,1,1
797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Display the leaf-index features computed for the test rows (columns gbdt_1 .. gbdt_30).
test_lgb_feature

Unnamed: 0,gbdt_1,gbdt_2,gbdt_3,gbdt_4,gbdt_5,gbdt_6,gbdt_7,gbdt_8,gbdt_9,gbdt_10,...,gbdt_21,gbdt_22,gbdt_23,gbdt_24,gbdt_25,gbdt_26,gbdt_27,gbdt_28,gbdt_29,gbdt_30
0,1,1,1,1,1,1,1,1,1,1,...,1,1,2,1,2,1,1,1,1,1
1,1,0,1,1,1,1,1,0,1,0,...,1,1,0,1,0,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,1
4,1,0,1,0,0,1,1,0,0,0,...,1,1,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,1,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,1
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,1,0,1,1,1,1,1,0,1,0,...,1,1,0,1,0,1,1,1,1,1
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Append the GBDT leaf-index features column-wise. Columns with the same names
# left over from an earlier run of this cell are dropped first, so re-running
# the cell replaces the features instead of appending duplicate gbdt_* columns.
train = pd.concat(
    [train.drop(columns=train_lgb_feature.columns, errors="ignore"), train_lgb_feature],
    axis = 1,
)
test = pd.concat(
    [test.drop(columns=test_lgb_feature.columns, errors="ignore"), test_lgb_feature],
    axis = 1,
)

In [14]:
# Display the training frame after the gbdt_* leaf-index columns were appended.
train

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,label,target_encoding_discrete_var1,gbdt_1,gbdt_2,gbdt_3,gbdt_4,...,gbdt_21,gbdt_22,gbdt_23,gbdt_24,gbdt_25,gbdt_26,gbdt_27,gbdt_28,gbdt_29,gbdt_30
0,4,3,0.92,0.46,29.0,47.768293,0,2,0,0,...,0,0,2,0,2,0,0,0,0,0
1,4,2,0.04,0.59,78.0,47.768293,1,0,1,0,...,1,1,0,0,0,1,1,1,0,0
2,1,2,0.20,0.10,79.0,53.018519,1,0,1,0,...,1,1,0,0,0,1,1,1,0,0
3,4,1,0.97,0.74,67.0,47.768293,0,2,0,0,...,0,0,2,0,2,0,0,0,0,0
4,3,4,0.94,0.01,17.0,54.713333,1,2,1,1,...,1,1,2,1,2,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,4,4,0.48,0.95,96.0,49.451429,1,0,1,0,...,1,1,0,0,0,1,1,1,0,0
796,2,4,0.23,0.36,13.0,51.670968,1,0,1,1,...,1,1,0,1,0,1,1,1,1,1
797,4,3,0.65,0.65,96.0,49.451429,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
798,1,1,0.40,0.52,87.0,52.733333,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Display the test frame after the gbdt_* leaf-index columns were appended.
test

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,label,target_encoding_discrete_var1,gbdt_1,gbdt_2,gbdt_3,gbdt_4,...,gbdt_21,gbdt_22,gbdt_23,gbdt_24,gbdt_25,gbdt_26,gbdt_27,gbdt_28,gbdt_29,gbdt_30
0,3,4,0.87,0.11,81.0,54.431142,1,1,1,1,...,1,1,2,1,2,1,1,1,1,1
1,2,4,0.09,0.14,50.0,52.114083,1,0,1,1,...,1,1,0,1,0,1,1,1,1,1
2,1,1,0.50,0.62,84.0,51.305675,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,0.61,0.91,37.0,54.431142,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
4,4,2,0.74,0.73,86.0,48.739171,1,0,1,0,...,1,1,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,3,1,0.57,0.41,82.0,54.431142,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
196,4,3,0.11,0.14,63.0,48.739171,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,2,2,0.44,0.96,82.0,52.114083,1,0,1,1,...,1,1,0,1,0,1,1,1,1,1
198,4,1,0.57,0.34,5.0,48.739171,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
