加载所有数据集，和LSTM提取特征数据集

In [1]:
import time
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
def reduce_mem_usage(data, verbose = True):
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))
    
    for col in data.columns:
        col_type = data[col].dtype
        
        if col_type != object:
            c_min = data[col].min()
            c_max = data[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    data[col] = data[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return data

In [3]:
all_data = pd.read_csv("C:/data/processing_data/all_data.csv")
all_data = reduce_mem_usage(all_data)


# lstm = pd.read_csv("C:/data/processing_data/LSTM_Feature.csv")
# lstm = reduce_mem_usage(lstm)

Memory usage of dataframe: 1081.56 MB
Memory usage after optimization: 340.48 MB
Decreased by 68.5%


In [4]:
all_data.head(10)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,LSTM_19,LSTM_20,LSTM_21,LSTM_23,LSTM_25,LSTM_26,LSTM_27,LSTM_30,LSTM_31,LSTM_32
0,100002,-0.324463,-0.717773,0.664551,-0.577637,0.14209,-0.478027,-0.166016,-0.505859,-0.149658,...,0.088806,0.164062,0.484619,0.262939,0.162842,0.139404,0.488281,0.624512,0.304199,0.323242
1,100003,-0.324463,-0.717773,-1.504883,-0.577637,0.426758,1.725586,0.592773,1.600586,-1.25293,...,0.0,0.532715,1.517578,0.62207,0.825195,0.0,1.728516,2.693359,1.545898,1.300781
2,100004,3.082031,1.392578,0.664551,-0.577637,-0.427246,-1.15332,-1.404297,-1.089844,-0.783203,...,0.0,0.356934,1.283203,0.556152,0.493408,0.729492,1.445312,2.021484,0.999512,0.88916
3,100006,-0.324463,-0.717773,0.664551,-0.577637,-0.142578,-0.711426,0.177979,-0.651855,-0.929199,...,0.0,0.271973,0.982422,0.650391,0.457764,0.0,1.086914,1.581055,0.726074,0.801758
4,100007,-0.324463,-0.717773,0.664551,-0.577637,-0.199463,-0.213745,-0.361572,-0.067383,0.562988,...,0.0,0.200684,1.047852,0.322754,0.527832,0.0,1.129883,1.547852,0.803223,0.759277
5,100008,-0.324463,-0.717773,0.664551,-0.577637,-0.294434,-0.269531,0.02829,-0.225708,1.079102,...,0.0,0.338135,1.052734,0.354248,0.462158,0.621094,1.134766,1.626953,0.891602,0.897949
6,100009,-0.324463,1.392578,0.664551,0.807129,0.009285,2.388672,0.979004,2.318359,1.079102,...,0.244629,0.418457,1.654297,0.724121,0.611816,0.491943,1.828125,1.81543,1.048828,1.099609
7,100010,-0.324463,1.392578,0.664551,-0.577637,0.806152,2.3125,1.032227,2.683594,-1.283203,...,0.0,0.393555,1.265625,0.714355,0.713867,0.540527,1.34668,2.126953,1.164062,1.132812
8,100011,-0.324463,-0.717773,0.664551,-0.577637,-0.237427,1.044922,0.463623,1.016602,-0.161865,...,0.0,0.3125,1.094727,0.205688,0.506836,0.751953,1.060547,1.671875,0.83252,0.958496
9,100012,3.082031,-0.717773,0.664551,-0.577637,-0.142578,-0.482178,-0.473145,-0.359619,-0.085693,...,0.0,0.305176,1.082031,0.746582,0.452881,0.0,1.239258,1.71875,0.853027,0.817383


# 拆分数据集

In [5]:
def split_train(data,test_ratio):
   #  data = data.iloc[0:20000,:]
    np.random.seed(0)  # 设置随机种子确保实验可重复
    shuffled_indices=np.random.permutation(len(data))
    test_set_size=int(len(data)*test_ratio)
    test_indices =shuffled_indices[:test_set_size]
    train_indices=shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]

In [6]:
all_data_train, all_data_valtest = split_train(all_data, 0.2)

In [7]:
all_data_train.shape

(246009, 461)

In [8]:
all_data_valtest.shape

(61502, 461)

In [9]:
all_data_val,all_data_test = split_train(all_data_valtest, 0.5)

In [10]:
all_data_val.shape

(30751, 461)

In [11]:
all_data_test.shape

(30751, 461)

# 特征选择方法

In [12]:
from sklearn.feature_selection import SelectKBest,chi2,RFE,SelectFromModel
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# 采用6种特征选择方法，输入特征选择结果
def select_feature(data, num):
    X = data.drop(['TARGET'], axis=1)
    y = data.TARGET
    feature_name = X.columns.tolist()
    
    start = time.time()
    # 过滤法，皮尔逊相关系数
    cor_list = []
    # 计算与目标的相关系数
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # 空值使用0代替
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # 选中的名字
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num:]].columns.tolist()
    # 选中为T
    cor_support = [True if i in cor_feature else False for i in feature_name]
    end1 =time.time()
    print("过滤法-皮尔逊相关系数特征选择所用时间：", end1 - start)
    print(str(len(cor_feature)), 'selected features')
    
    # 过滤法-卡方检验
    X_norm = MinMaxScaler().fit_transform(X)
    chi_selector = SelectKBest(chi2, k=num)
    chi_selector.fit(X_norm, y)
    chi_support = chi_selector.get_support()
    chi_feature = X.loc[:,chi_support].columns.tolist()
    end2 = time.time()
    print("过滤法-卡方检验特征选择所用时间：", end2 - end1)
    print(str(len(chi_feature)), 'selected features')
    
    # 包裹式-LR
    LR_selector = RFE(estimator=LogisticRegression(solver='liblinear',max_iter=500), n_features_to_select=num, step=50, verbose=5)
    LR_selector.fit(X, y)
    LR_support = LR_selector.get_support()
    LR_feature = X.loc[:,LR_support].columns.tolist()  # list
    end3 = time.time()
    print("包裹式-LR特征选择所用时间：", end3 - end2)
    print(str(len(LR_feature)), 'selected features')
    
    # 包裹式-RF
    RF_selector = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=num, step=50, verbose=5)
    RF_selector.fit(X, y)
    RF_support = RF_selector.get_support()
    RF_feature = X.loc[:,RF_support].columns.tolist()
    end4 = time.time()
    print("包裹式-LR特征选择所用时间：", end4 - end3)
    print(str(len(RF_feature)), 'selected features')
    
    # 嵌入式-LR
    embeded_lr_selector = SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=-np.inf, max_features=num)
    embeded_lr_selector.fit(X_norm, y)
    embeded_lr_support = embeded_lr_selector.get_support()
    embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
    end5 = time.time()
    print("嵌入式-LR特征选择所用时间：", end5 - end4)
    print(str(len(embeded_lr_feature)), 'selected features')
    
    # 嵌入式-RF
    embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold=-np.inf, max_features=200)
    embeded_rf_selector.fit(X, y)
    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
    end6 = time.time()
    print("嵌入式-LR特征选择所用时间：", end6 - end5)

    print(str(len(embeded_rf_feature)), 'selected features')
    
    
    # put all selection together
    feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE_LR':LR_support, 'RFE_RF':RF_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support})
    # 统计结果 
    feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
    # display the top 200
    feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
    feature_selection_df.index = range(1, len(feature_selection_df)+1)
    feature_selection_df = feature_selection_df.iloc[:num,]
    
    selected_df = feature_selection_df[feature_selection_df['Total'] > 3]
    combination_feature = selected_df.Feature.tolist()
    combination_feature_dataframe = combination_feature + [" "] * (200 - len(combination_feature)) # 不够两百，空值填充
    
    dataframe_select = pd.DataFrame({'Pearson':cor_feature, 'Chi-2':chi_feature, 'RFE_LR':LR_feature, 'RFE_RF':RF_feature, 'embedding_LR':embeded_lr_feature,
                                    'embedding_RF':embeded_rf_feature, 'combination': combination_feature_dataframe})
    
    # 返回投票结果，以及返回选择后的特征列表
    return feature_selection_df,dataframe_select
    # 组合特征选择
    

In [13]:
all_feture_selction_df,all_dataframe_select = select_feature(all_data_train, 200)

过滤法-皮尔逊相关系数特征选择所用时间： 1.83622145652771
200 selected features
过滤法-卡方检验特征选择所用时间： 1.8855607509613037
200 selected features
Fitting estimator with 460 features.
Fitting estimator with 410 features.
Fitting estimator with 360 features.
Fitting estimator with 310 features.
Fitting estimator with 260 features.
Fitting estimator with 210 features.
包裹式-LR特征选择所用时间： 1322.7132577896118
200 selected features
Fitting estimator with 460 features.
Fitting estimator with 410 features.
Fitting estimator with 360 features.
Fitting estimator with 310 features.
Fitting estimator with 260 features.
Fitting estimator with 210 features.
包裹式-LR特征选择所用时间： 2037.9275753498077
200 selected features
嵌入式-LR特征选择所用时间： 295.2207579612732
200 selected features
嵌入式-LR特征选择所用时间： 286.50982332229614
200 selected features


In [14]:

pd.set_option('display.max_rows', None) # 可以显示所有
all_feture_selction_df

Unnamed: 0,Feature,Pearson,Chi-2,RFE_LR,RFE_RF,Logistics,Random Forest,Total
1,PREV_DAYS_DECISION_MEAN,True,True,True,True,True,True,6
2,PREV_CNT_PAYMENT_MEAN,True,True,True,True,True,True,6
3,POS_COUNT,True,True,True,True,True,True,6
4,LSTM_32,True,True,True,True,True,True,6
5,LSTM_31,True,True,True,True,True,True,6
6,LSTM_27,True,True,True,True,True,True,6
7,LSTM_21,True,True,True,True,True,True,6
8,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,True,True,True,True,True,True,6
9,CLOSED_DAYS_CREDIT_MIN,True,True,True,True,True,True,6
10,CLOSED_DAYS_CREDIT_MAX,True,True,True,True,True,True,6


In [16]:
all_feture_selction_df.to_csv("C:/data/processing_data/dataset/all_feture_selction.csv")

In [15]:
all_dataframe_select

Unnamed: 0,Pearson,Chi-2,RFE_LR,RFE_RF,embedding_LR,embedding_RF,combination
0,APPROVED_AMT_DOWN_PAYMENT_MEAN,NAME_CONTRACT_TYPE,SK_ID_CURR,SK_ID_CURR,NAME_CONTRACT_TYPE,SK_ID_CURR,PREV_DAYS_DECISION_MEAN
1,ORGANIZATION_TYPE_Business Entity Type 3,FLAG_OWN_CAR,BURO_DAYS_CREDIT_MIN,AMT_INCOME_TOTAL,FLAG_OWN_CAR,AMT_INCOME_TOTAL,PREV_CNT_PAYMENT_MEAN
2,ACTIVE_DAYS_CREDIT_UPDATE_MEAN,AMT_CREDIT,BURO_DAYS_CREDIT_MAX,AMT_CREDIT,AMT_CREDIT,AMT_CREDIT,POS_COUNT
3,INSTAL_PAYMENT_DIFF_MAX,AMT_GOODS_PRICE,BURO_DAYS_CREDIT_MEAN,AMT_ANNUITY,AMT_ANNUITY,AMT_ANNUITY,LSTM_32
4,APPROVED_AMT_GOODS_PRICE_MEAN,REGION_POPULATION_RELATIVE,BURO_DAYS_CREDIT_VAR,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,AMT_GOODS_PRICE,LSTM_31
5,PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN,DAYS_BIRTH,BURO_DAYS_CREDIT_ENDDATE_MIN,REGION_POPULATION_RELATIVE,DAYS_BIRTH,REGION_POPULATION_RELATIVE,LSTM_27
6,CC_AMT_RECEIVABLE_PRINCIPAL_VAR,DAYS_ID_PUBLISH,BURO_DAYS_CREDIT_ENDDATE_MAX,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_BIRTH,LSTM_21
7,APPROVED_AMT_APPLICATION_MEAN,FLAG_EMP_PHONE,BURO_DAYS_CREDIT_ENDDATE_MEAN,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_EMPLOYED,INSTAL_DAYS_ENTRY_PAYMENT_MEAN
8,BURO_MONTHS_BALANCE_SIZE_MEAN,FLAG_WORK_PHONE,BURO_DAYS_CREDIT_UPDATE_MEAN,DAYS_REGISTRATION,DAYS_ID_PUBLISH,DAYS_REGISTRATION,CLOSED_DAYS_CREDIT_MIN
9,CC_AMT_RECIVABLE_VAR,FLAG_PHONE,BURO_AMT_CREDIT_MAX_OVERDUE_MEAN,DAYS_ID_PUBLISH,CNT_FAM_MEMBERS,DAYS_ID_PUBLISH,CLOSED_DAYS_CREDIT_MAX


In [22]:
all_dataframe_select.to_csv("C:/data/processing_data/dataset/all_dataframe_select.csv")

In [13]:
# all_data_train.to_csv("C:/data/processing_data/dataset/all_data_train.csv")
# all_data_val.to_csv("C:/data/processing_data/dataset/all_data_val.csv")
# all_data_test.to_csv("C:/data/processing_data/dataset/all_data_test.csv")

# 读取特征选择结果

In [14]:
all_dataframe_select = pd.read_csv("C:/data/processing_data/dataset/all_dataframe_select.csv")

In [15]:
# 去特征选择后的数据
combination_feature = [name for name in all_dataframe_select['combination'].tolist() if len(name) > 2] # >2 过滤掉空字符
combination_feature.append("TARGET")

In [16]:
select_combination_train = all_data_train.loc[:, combination_feature]
select_combination_train

Unnamed: 0,PREV_DAYS_DECISION_MEAN,PREV_CNT_PAYMENT_MEAN,POS_COUNT,LSTM_32,LSTM_31,LSTM_27,LSTM_21,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,CLOSED_DAYS_CREDIT_MIN,CLOSED_DAYS_CREDIT_MAX,...,APPROVED_AMT_APPLICATION_MEAN,APPROVED_AMT_APPLICATION_MAX,APARTMENTS_MODE,AMT_GOODS_PRICE,ACTIVE_DAYS_CREDIT_VAR,ACTIVE_DAYS_CREDIT_UPDATE_MEAN,ACTIVE_DAYS_CREDIT_ENDDATE_MIN,ACTIVE_AMT_CREDIT_SUM_SUM,ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN,TARGET
87690,-789.500,29.000000,17.0,0.996582,1.125977,1.572266,1.404297,-1880.000,-2276.0,-319.0,...,329150.250000,450000.0,0.126953,-0.225708,1.153778e+06,-167.500000,-2320.0,787788.0,0.000000,0
161396,-655.000,6.000000,7.0,0.780762,0.662109,0.963867,0.953125,-560.000,0.0,0.0,...,170955.000000,170955.0,-0.593262,1.587891,0.000000e+00,0.000000,0.0,0.0,0.000000,0
156427,-235.875,11.335938,12.0,0.786621,0.630859,0.898926,0.912109,-221.750,0.0,0.0,...,106645.500000,130050.0,2.609375,0.103027,0.000000e+00,0.000000,0.0,0.0,0.000000,0
240569,-1065.000,21.000000,25.0,0.812012,0.833008,1.385742,1.238281,-1365.000,-1684.0,-896.0,...,88106.625000,180000.0,0.026871,-1.174805,8.094434e+04,-24.671875,94.0,1237500.0,0.000000,0
89697,-190.000,8.000000,11.0,0.538574,0.262695,0.626465,0.589355,-199.125,0.0,0.0,...,66813.750000,70906.5,-0.593262,-0.201294,0.000000e+00,0.000000,0.0,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221162,-895.000,36.000000,32.0,1.241211,1.194336,1.482422,1.340820,-739.500,-1785.0,-692.0,...,998741.250000,1800000.0,0.204102,1.600586,2.959947e+05,-351.000000,559.0,5265000.0,5880.720215,0
110976,-1421.000,13.664062,62.0,0.754395,0.833008,1.148438,1.016602,-1350.000,-1093.0,-119.0,...,221710.203125,976500.0,-0.039703,0.370850,4.018944e+05,-19.328125,-188.0,1268550.0,0.000000,0
79405,-618.000,24.000000,20.0,0.927246,0.735352,0.960449,1.041992,-309.250,-1851.0,-188.0,...,531000.000000,531000.0,-0.593262,-0.664062,9.417800e+04,-543.500000,0.0,137254.5,1350.000000,0
290387,-1263.000,19.000000,75.0,0.899414,0.966797,1.309570,1.247070,-968.500,0.0,0.0,...,376023.375000,1192500.0,-0.593262,0.492676,0.000000e+00,-5.000000,6644.0,360000.0,7429.500000,0


In [17]:
select_combination_train.shape

(246009, 133)

In [18]:
# all_data_val = pd.read_csv("C:/data/processing_data/dataset/all_data_val.csv")
select_combination_val = all_data_val.loc[:, combination_feature]

In [19]:
select_combination_val.shape

(30751, 133)

In [20]:
# all_data_test = pd.read_csv("C:/data/processing_data/dataset/all_data_test.csv")
select_combination_test = all_data_test.loc[:, combination_feature]

In [21]:
select_combination_test.shape

(30751, 133)

In [22]:
select_combination_train_X = select_combination_train.drop(["TARGET"], axis = 1)
select_combination_train_y = select_combination_train["TARGET"]
select_combination_train_X.shape

(246009, 132)

In [23]:
import imblearn
from imblearn.combine import SMOTETomek
start = time.time()
smt.fit(select_combination_train_X, select_combination_train_y)
X_train_resample, y_train_resample = smt.fit_resample(select_combination_train_X, select_combination_train_y)
end = time.time()
print("综合采样时间：", end-start)

综合采样时间：8420.995375156402588


SMOTETomek：先使用过采样，扩大样本后再对处在胶着状态的点用 Tomek Link 法进行删除，有时候甚至连 Tomek Link 都不用，直接把离得近的对全部删除，因为在进行过采样后，0 和 1 的样本量已经达到了 1：1。

In [24]:
X_train_resample.shape

(452270, 132)

In [25]:
data = pd.concat([X_train_resample,y_train_resample], axis = 1)

In [26]:
# 保留训练集SMOTETomek后的结果
data.to_csv("C:/data/processing_data/dataset/all_train_resample.csv")

In [27]:
data.head(10)

Unnamed: 0,PREV_DAYS_DECISION_MEAN,PREV_CNT_PAYMENT_MEAN,POS_COUNT,LSTM_32,LSTM_31,LSTM_27,LSTM_21,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,CLOSED_DAYS_CREDIT_MIN,CLOSED_DAYS_CREDIT_MAX,...,APPROVED_AMT_APPLICATION_MEAN,APPROVED_AMT_APPLICATION_MAX,APARTMENTS_MODE,AMT_GOODS_PRICE,ACTIVE_DAYS_CREDIT_VAR,ACTIVE_DAYS_CREDIT_UPDATE_MEAN,ACTIVE_DAYS_CREDIT_ENDDATE_MIN,ACTIVE_AMT_CREDIT_SUM_SUM,ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN,TARGET
0,-789.5,29.0,17.0,0.996582,1.125977,1.572266,1.404297,-1880.0,-2276.0,-319.0,...,329150.25,450000.0,0.126953,-0.225708,1153778.0,-167.5,-2320.0,787788.0,0.0,0
1,-655.0,6.0,7.0,0.780762,0.662109,0.963867,0.953125,-560.0,0.0,0.0,...,170955.0,170955.0,-0.593262,1.587891,0.0,0.0,0.0,0.0,0.0,0
2,-235.875,11.335938,12.0,0.786621,0.630859,0.898926,0.912109,-221.75,0.0,0.0,...,106645.5,130050.0,2.609375,0.103027,0.0,0.0,0.0,0.0,0.0,0
3,-1065.0,21.0,25.0,0.812012,0.833008,1.385742,1.238281,-1365.0,-1684.0,-896.0,...,88106.625,180000.0,0.026871,-1.174805,80944.34,-24.671875,94.0,1237500.0,0.0,0
4,-190.0,8.0,11.0,0.538574,0.262695,0.626465,0.589355,-199.125,0.0,0.0,...,66813.75,70906.5,-0.593262,-0.201294,0.0,0.0,0.0,0.0,0.0,0
5,-208.0,6.0,7.0,1.203125,1.353516,1.355469,1.292969,-112.8125,0.0,0.0,...,129600.0,129600.0,-0.150391,3.414062,0.0,0.0,0.0,0.0,0.0,0
6,-2070.0,9.664062,30.0,1.09082,1.101562,1.563477,1.451172,-1551.0,-2630.0,-317.0,...,151127.09375,495000.0,0.292236,0.37085,2690.333,-20.671875,1533.0,4020300.0,93.0,0
7,-765.5,15.0,17.0,0.859863,0.748535,0.966797,1.017578,-602.0,-1524.0,-1014.0,...,94050.0,157500.0,-0.593262,-0.603027,4.5,-19.5,692.0,520785.0,0.0,0
8,-939.5,7.332031,26.0,0.768555,0.80957,1.185547,1.091797,-853.5,-961.0,-882.0,...,68280.0,90000.0,-0.593262,-1.174805,0.0,-30.0,362.0,165595.9,0.0,0
9,-1551.0,24.0,13.0,0.674316,0.700684,1.009766,1.008789,-1367.0,-2592.0,-567.0,...,171343.796875,171343.796875,-0.593262,-0.237793,396050.0,-22.0,1708.0,3429000.0,0.0,0


In [28]:
select_combination_val.shape

(30751, 133)

In [29]:
import gc
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score,accuracy_score,roc_curve, plot_roc_curve
from sklearn import metrics
def val_lightgbm(train, val):
    
    print("Starting LightGBM. Train shape: {}, val shape: {}".format(train.shape, val.shape))
    feats = [f for f in train.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    
    gbm = lgb.LGBMClassifier(
            boosting_type='gbdt',
            num_leaves=106,
            max_depth=-1,
            learning_rate=0.012,
            n_estimators= 10000,
            objective='binary',
            class_weight=None,
            min_split_gain=0.0,
            min_child_weight=0.001,
            min_child_samples=390,
            subsample=0.717756,
            subsample_freq=1,
            colsample_bytree=0.614937,
            reg_alpha=0.512999,
            reg_lambda=0.382687,
            subsample_for_bin=80000,
            random_state=42,
            n_jobs=-1
        )
    X_train = train[feats].to_numpy()
    y_train = train['TARGET'].values  
    X_val = val[feats].to_numpy()                   
    y_val = val['TARGET'].values 
        
    gbm.fit(X_train,y_train)
    gbm_y_pre_train = gbm.predict(X_train)
    gbm_y_pre= gbm.predict(X_val)
    gbm_y_proba_train = gbm.predict_proba(X_train)
    gbm_y_proba= gbm.predict_proba(X_val)
    fpr,tpr,thresholds = roc_curve(y_val,gbm_y_proba[:,1])
    ks = max(tpr-fpr)
    
    print("==============================train==================================")
    gbm_accuracy_score_train = accuracy_score(y_train,gbm_y_pre_train)
    
    gbm_auc_train = roc_auc_score(y_train,gbm_y_proba_train[:,1])
    
    print('gbm_accuracy_score: %f,\ngbm_auc: %f'
          %(gbm_accuracy_score_train,
            gbm_auc_train))
    
    print("---------------------------------val---------------------------------")
    gbm_accuracy_score= accuracy_score(y_val,gbm_y_pre)
  
    gbm_auc = roc_auc_score(y_val,gbm_y_proba[:,1])
    print('gbm_accuracy_score: %f,\ngbm_auc: %f,\ngbm_ks:%f'
          %(gbm_accuracy_score,
        
    
            gbm_auc,
            ks))
    print("======================================================================")
    importance_df = pd.DataFrame()              
    importance_df["feature"] = feats
    importance_df["importance"] = gbm.feature_importances_
    
    
    return gbm,importance_df

def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout
#     plt.savefig('lgbm_importances01.png')

In [30]:
start = time.time()
gbm, feature_imp = val_lightgbm(data, select_ combination_val)
end = time.time()
print("训练时间：", end -start)

Starting LightGBM. Train shape: (452270, 133), val shape: (30751, 133)
gbm_accuracy_score: 0.999655,
gbm_auc: 1.000000
---------------------------------val---------------------------------
gbm_accuracy_score: 0.931677,
gbm_auc: 0.827763,
gbm_ks:0.496807
训练时间： 914.4859137535095


In [31]:

def lightgbm_test(gbm, test):
    feats = [f for f in test.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']] # 固定列了

    gbm_y_pre = gbm.predict(test[feats].to_numpy())
    gbm_y_proba= gbm.predict_proba(test[feats])  # 验证训练模型没有这个'Booster' object has no attribute 'predict_proba'属性
    fpr,tpr,thresholds = roc_curve(test["TARGET"],gbm_y_proba[:,1])
    ks = max(tpr-fpr)


    gbm_accuracy_score=accuracy_score(test["TARGET"],gbm_y_pre)

    gbm_auc=roc_auc_score(test["TARGET"],gbm_y_proba[:,1])
    print('gbm_accuracy_score: %f,\ngbm_auc: %f,\ngbm_ks:%f'
          %(gbm_accuracy_score,
            gbm_auc,
            ks))



In [32]:
lightgbm_test(gbm, select_combination_test)

gbm_accuracy_score: 0.926311,
gbm_auc: 0.802134,
gbm_ks:0.450672


In [33]:
lightgbm_test(gbm, select_combination_test)

gbm_accuracy_score: 0.926311,
gbm_auc: 0.802134,
gbm_ks:0.450672


In [34]:
lightgbm_test(gbm, select_combination_test)

gbm_accuracy_score: 0.926311,
gbm_auc: 0.802134,
gbm_ks:0.450672


In [None]:
# import joblib
# joblib.dump(gbm, 'C:/data/processing_data/dataset/all_data_modle.pkl')
# joblib.load('modle.pkl') 加载模型

In [None]:
# !pip install joblib

# 去掉特征提取结果

In [1]:
import time
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

def reduce_mem_usage(data, verbose = True):
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))
    
    for col in data.columns:
        col_type = data[col].dtype
        
        if col_type != object:
            c_min = data[col].min()
            c_max = data[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    data[col] = data[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return data

In [2]:
all_data = pd.read_csv("C:/data/processing_data/all_data.csv")
all_data = reduce_mem_usage(all_data)

Memory usage of dataframe: 1081.56 MB
Memory usage after optimization: 340.48 MB
Decreased by 68.5%


In [3]:
all_data.shape

(307511, 461)

In [4]:
columns = all_data.columns.tolist()

In [5]:
all_data.head(10)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,LSTM_19,LSTM_20,LSTM_21,LSTM_23,LSTM_25,LSTM_26,LSTM_27,LSTM_30,LSTM_31,LSTM_32
0,100002,-0.324463,-0.717773,0.664551,-0.577637,0.14209,-0.478027,-0.166016,-0.505859,-0.149658,...,0.088806,0.164062,0.484619,0.262939,0.162842,0.139404,0.488281,0.624512,0.304199,0.323242
1,100003,-0.324463,-0.717773,-1.504883,-0.577637,0.426758,1.725586,0.592773,1.600586,-1.25293,...,0.0,0.532715,1.517578,0.62207,0.825195,0.0,1.728516,2.693359,1.545898,1.300781
2,100004,3.082031,1.392578,0.664551,-0.577637,-0.427246,-1.15332,-1.404297,-1.089844,-0.783203,...,0.0,0.356934,1.283203,0.556152,0.493408,0.729492,1.445312,2.021484,0.999512,0.88916
3,100006,-0.324463,-0.717773,0.664551,-0.577637,-0.142578,-0.711426,0.177979,-0.651855,-0.929199,...,0.0,0.271973,0.982422,0.650391,0.457764,0.0,1.086914,1.581055,0.726074,0.801758
4,100007,-0.324463,-0.717773,0.664551,-0.577637,-0.199463,-0.213745,-0.361572,-0.067383,0.562988,...,0.0,0.200684,1.047852,0.322754,0.527832,0.0,1.129883,1.547852,0.803223,0.759277
5,100008,-0.324463,-0.717773,0.664551,-0.577637,-0.294434,-0.269531,0.02829,-0.225708,1.079102,...,0.0,0.338135,1.052734,0.354248,0.462158,0.621094,1.134766,1.626953,0.891602,0.897949
6,100009,-0.324463,1.392578,0.664551,0.807129,0.009285,2.388672,0.979004,2.318359,1.079102,...,0.244629,0.418457,1.654297,0.724121,0.611816,0.491943,1.828125,1.81543,1.048828,1.099609
7,100010,-0.324463,1.392578,0.664551,-0.577637,0.806152,2.3125,1.032227,2.683594,-1.283203,...,0.0,0.393555,1.265625,0.714355,0.713867,0.540527,1.34668,2.126953,1.164062,1.132812
8,100011,-0.324463,-0.717773,0.664551,-0.577637,-0.237427,1.044922,0.463623,1.016602,-0.161865,...,0.0,0.3125,1.094727,0.205688,0.506836,0.751953,1.060547,1.671875,0.83252,0.958496
9,100012,3.082031,-0.717773,0.664551,-0.577637,-0.142578,-0.482178,-0.473145,-0.359619,-0.085693,...,0.0,0.305176,1.082031,0.746582,0.452881,0.0,1.239258,1.71875,0.853027,0.817383


In [6]:
columns 

['SK_ID_CURR',
 'NAME_CONTRACT_TYPE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_2',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'LANDAREA_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'LANDAREA_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'ELE

In [7]:
droplist = [i for i in columns if "LSTM_" in i]

In [8]:
droplist

['LSTM_3',
 'LSTM_4',
 'LSTM_8',
 'LSTM_10',
 'LSTM_12',
 'LSTM_14',
 'LSTM_19',
 'LSTM_20',
 'LSTM_21',
 'LSTM_23',
 'LSTM_25',
 'LSTM_26',
 'LSTM_27',
 'LSTM_30',
 'LSTM_31',
 'LSTM_32']

In [9]:
all_data.to_csv("C:/data/data/lstm_data.csv")

In [10]:
drop_lstm_data = all_data.drop(columns = droplist, axis = 1)

In [10]:
drop_lstm_data.shape

(307511, 445)

In [11]:
drop_lstm_data.to_csv("C:/data/data/data.csv")

In [11]:
def split_train(data,test_ratio):
   #  data = data.iloc[0:20000,:]
    np.random.seed(0)  # 设置随机种子确保实验可重复
    shuffled_indices=np.random.permutation(len(data)) # 随机片
    test_set_size=int(len(data)*test_ratio)
    test_indices =shuffled_indices[:test_set_size]
    train_indices=shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]

In [12]:
drop_lstm_data_train, drop_lstm_data_valtest = split_train(drop_lstm_data, 0.2)
drop_lstm_data_val,drop_lstm_data_test = split_train(drop_lstm_data_valtest, 0.5)
drop_lstm_data_train.to_csv("C:/data/processing_data/dataset/drop_lstm_data_train.csv")
drop_lstm_data_val.to_csv("C:/data/processing_data/dataset/drop_lstm_data_val.csv")
drop_lstm_data_test.to_csv("C:/data/processing_data/dataset/drop_lstm_data_test.csv")

In [13]:
from sklearn.feature_selection import SelectKBest,chi2,RFE,SelectFromModel
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# 采用6种特征选择方法，输入特征选择结果
def select_feature(data, num):
    X = data.drop(['TARGET'], axis=1)
    y = data.TARGET
    feature_name = X.columns.tolist()
    
    start = time.time()
    # 过滤法，皮尔逊相关系数
    cor_list = []
    # 计算与目标的相关系数
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # 空值使用0代替
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # 选中的名字
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num:]].columns.tolist()
    # 选中为T
    cor_support = [True if i in cor_feature else False for i in feature_name]
    end1 =time.time()
    print("过滤法-皮尔逊相关系数特征选择所用时间：", end1 - start)
    print(str(len(cor_feature)), 'selected features')
    
    # 过滤法-卡方检验
    X_norm = MinMaxScaler().fit_transform(X)
    chi_selector = SelectKBest(chi2, k=num)
    chi_selector.fit(X_norm, y)
    chi_support = chi_selector.get_support()
    chi_feature = X.loc[:,chi_support].columns.tolist()
    end2 = time.time()
    print("过滤法-卡方检验特征选择所用时间：", end2 - end1)
    print(str(len(chi_feature)), 'selected features')
    
    # 包裹式-LR
    LR_selector = RFE(estimator=LogisticRegression(solver='liblinear',max_iter=500), n_features_to_select=num, step=50, verbose=5)
    LR_selector.fit(X, y)
    LR_support = LR_selector.get_support()
    LR_feature = X.loc[:,LR_support].columns.tolist()  # list
    end3 = time.time()
    print("包裹式-LR特征选择所用时间：", end3 - end2)
    print(str(len(LR_feature)), 'selected features')
    
    # 包裹式-RF
    RF_selector = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=num, step=50, verbose=5)
    RF_selector.fit(X, y)
    RF_support = RF_selector.get_support()
    RF_feature = X.loc[:,RF_support].columns.tolist()
    end4 = time.time()
    print("包裹式-LR特征选择所用时间：", end4 - end3)
    print(str(len(RF_feature)), 'selected features')
    
    # 嵌入式-LR
    embeded_lr_selector = SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=-np.inf, max_features=num)
    embeded_lr_selector.fit(X_norm, y)
    embeded_lr_support = embeded_lr_selector.get_support()
    embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
    end5 = time.time()
    print("嵌入式-LR特征选择所用时间：", end5 - end4)
    print(str(len(embeded_lr_feature)), 'selected features')
    
    # 嵌入式-RF
    embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold=-np.inf, max_features=200)
    embeded_rf_selector.fit(X, y)
    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
    end6 = time.time()
    print("嵌入式-LR特征选择所用时间：", end6 - end5)

    print(str(len(embeded_rf_feature)), 'selected features')
    
    
    # put all selection together
    feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE_LR':LR_support, 'RFE_RF':RF_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support})
    # 统计结果 
    feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
    # display the top 200
    feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
    feature_selection_df.index = range(1, len(feature_selection_df)+1)
    feature_selection_df = feature_selection_df.iloc[:num,]
    
    selected_df = feature_selection_df[feature_selection_df['Total'] > 3]
    combination_feature = selected_df.Feature.tolist()
    combination_feature_dataframe = combination_feature + [" "] * (200 - len(combination_feature)) # 不够两百，空值填充
    
    dataframe_select = pd.DataFrame({'Pearson':cor_feature, 'Chi-2':chi_feature, 'RFE_LR':LR_feature, 'RFE_RF':RF_feature, 'embedding_LR':embeded_lr_feature,
                                    'embedding_RF':embeded_rf_feature, 'combination': combination_feature_dataframe})
    
    # 返回投票结果，以及返回选择后的特征列表
    return feature_selection_df,dataframe_select
    # 组合特征选择

In [14]:
drop_lstm_feture_selction_df,drop_lstm_dataframe_select = select_feature(drop_lstm_data_train, 200)

过滤法-皮尔逊相关系数特征选择所用时间： 1.7650914192199707
200 selected features
过滤法-卡方检验特征选择所用时间： 2.0914084911346436
200 selected features
Fitting estimator with 444 features.
Fitting estimator with 394 features.
Fitting estimator with 344 features.
Fitting estimator with 294 features.
Fitting estimator with 244 features.
包裹式-LR特征选择所用时间： 1360.8490874767303
200 selected features
Fitting estimator with 444 features.
Fitting estimator with 394 features.
Fitting estimator with 344 features.
Fitting estimator with 294 features.
Fitting estimator with 244 features.
包裹式-LR特征选择所用时间： 1647.4414615631104
200 selected features
嵌入式-LR特征选择所用时间： 323.3498065471649
200 selected features
嵌入式-LR特征选择所用时间： 282.8331153392792
200 selected features


In [15]:
pd.set_option('display.max_rows', None) # 可以显示所有
drop_lstm_feture_selction_df

Unnamed: 0,Feature,Pearson,Chi-2,RFE_LR,RFE_RF,Logistics,Random Forest,Total
1,PREV_DAYS_DECISION_MEAN,True,True,True,True,True,True,6
2,PREV_CODE_REJECT_REASON_XAP_MEAN,True,True,True,True,True,True,6
3,PREV_CNT_PAYMENT_SUM,True,True,True,True,True,True,6
4,PREV_CNT_PAYMENT_MEAN,True,True,True,True,True,True,6
5,POS_MONTHS_BALANCE_SIZE,True,True,True,True,True,True,6
6,POS_MONTHS_BALANCE_MEAN,True,True,True,True,True,True,6
7,POS_COUNT,True,True,True,True,True,True,6
8,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,True,True,True,True,True,True,6
9,CLOSED_DAYS_CREDIT_VAR,True,True,True,True,True,True,6
10,CLOSED_DAYS_CREDIT_MEAN,True,True,True,True,True,True,6


In [16]:
drop_lstm_feture_selction_df.to_csv("C:/data/processing_data/dataset/drop_lstm_feture_selction.csv")

In [17]:
drop_lstm_dataframe_select

Unnamed: 0,Pearson,Chi-2,RFE_LR,RFE_RF,embedding_LR,embedding_RF,combination
0,APPROVED_AMT_DOWN_PAYMENT_MEAN,NAME_CONTRACT_TYPE,SK_ID_CURR,SK_ID_CURR,FLAG_OWN_CAR,SK_ID_CURR,PREV_DAYS_DECISION_MEAN
1,ACTIVE_AMT_CREDIT_SUM_MEAN,FLAG_OWN_CAR,BURO_DAYS_CREDIT_MIN,AMT_INCOME_TOTAL,CNT_CHILDREN,AMT_INCOME_TOTAL,PREV_CODE_REJECT_REASON_XAP_MEAN
2,NAME_INCOME_TYPE_State servant,AMT_CREDIT,BURO_DAYS_CREDIT_MAX,AMT_CREDIT,AMT_CREDIT,AMT_CREDIT,PREV_CNT_PAYMENT_SUM
3,PREV_AMT_ANNUITY_MAX,AMT_GOODS_PRICE,BURO_DAYS_CREDIT_MEAN,AMT_ANNUITY,AMT_ANNUITY,AMT_ANNUITY,PREV_CNT_PAYMENT_MEAN
4,LANDAREA_MODE,REGION_POPULATION_RELATIVE,BURO_DAYS_CREDIT_VAR,AMT_GOODS_PRICE,AMT_GOODS_PRICE,AMT_GOODS_PRICE,POS_MONTHS_BALANCE_SIZE
5,PREV_NAME_PAYMENT_TYPE_Cash through the bank_MEAN,DAYS_BIRTH,BURO_DAYS_CREDIT_ENDDATE_MIN,REGION_POPULATION_RELATIVE,REGION_POPULATION_RELATIVE,REGION_POPULATION_RELATIVE,POS_MONTHS_BALANCE_MEAN
6,BURO_AMT_CREDIT_SUM_MAX,DAYS_ID_PUBLISH,BURO_DAYS_CREDIT_ENDDATE_MAX,DAYS_BIRTH,DAYS_BIRTH,DAYS_BIRTH,POS_COUNT
7,PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN,FLAG_EMP_PHONE,BURO_DAYS_CREDIT_ENDDATE_MEAN,DAYS_EMPLOYED,DAYS_EMPLOYED,DAYS_EMPLOYED,INSTAL_DAYS_ENTRY_PAYMENT_MEAN
8,ORGANIZATION_TYPE_Business Entity Type 3,FLAG_WORK_PHONE,BURO_DAYS_CREDIT_UPDATE_MEAN,DAYS_REGISTRATION,DAYS_REGISTRATION,DAYS_REGISTRATION,CLOSED_DAYS_CREDIT_VAR
9,LANDAREA_AVG,FLAG_PHONE,BURO_AMT_CREDIT_MAX_OVERDUE_MEAN,DAYS_ID_PUBLISH,DAYS_ID_PUBLISH,DAYS_ID_PUBLISH,CLOSED_DAYS_CREDIT_MEAN


In [18]:
drop_lstm_dataframe_select.to_csv("C:/data/processing_data/dataset/drop_lstm_dataframe_select.csv")

In [13]:
drop_lstm_data_train = pd.read_csv("C:/data/processing_data/dataset/drop_lstm_data_train.csv")
drop_lstm_data_val = pd.read_csv("C:/data/processing_data/dataset/drop_lstm_data_val.csv")
drop_lstm_data_test = pd.read_csv("C:/data/processing_data/dataset/drop_lstm_data_test.csv")

In [14]:
drop_lstm_dataframe_select = pd.read_csv("C:/data/processing_data/dataset/drop_lstm_dataframe_select.csv")

In [15]:
drop_lstm_combination_feature = [name for name in drop_lstm_dataframe_select['combination'].tolist() if len(name) > 2] # >2 过滤掉空字符
drop_lstm_combination_feature.append("TARGET")

In [16]:
drop_lstm_select_combination_train = drop_lstm_data_train.loc[:, drop_lstm_combination_feature]
drop_lstm_select_combination_train

Unnamed: 0,PREV_DAYS_DECISION_MEAN,PREV_CODE_REJECT_REASON_XAP_MEAN,PREV_CNT_PAYMENT_SUM,PREV_CNT_PAYMENT_MEAN,POS_MONTHS_BALANCE_SIZE,POS_MONTHS_BALANCE_MEAN,POS_COUNT,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,CLOSED_DAYS_CREDIT_VAR,CLOSED_DAYS_CREDIT_MEAN,...,APPROVED_AMT_APPLICATION_MIN,APPROVED_AMT_APPLICATION_MEAN,APPROVED_AMT_APPLICATION_MAX,APARTMENTS_MEDI,APARTMENTS_AVG,ACTIVE_DAYS_CREDIT_UPDATE_MEAN,ACTIVE_DAYS_CREDIT_ENDDATE_MIN,ACTIVE_AMT_CREDIT_SUM_SUM,ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN,TARGET
0,-720.5,1.0000,18.0,18.000,19.0,-38.00,19.0,-1123.0,0.000,0.0,...,80995.5,80995.500,80995.5,1.36000,1.35200,0.00,0.0,0.0,0.000,0
1,-945.0,0.8000,120.0,30.000,45.0,-46.50,45.0,-1481.0,236306.250,-1360.0,...,18724.5,97741.500,225000.0,-0.01892,-0.02235,-258.00,-842.0,370350.0,0.000,0
2,-1260.0,1.0000,116.0,14.500,108.0,-34.66,108.0,-1038.0,0.000,-1153.0,...,25380.0,119352.375,229500.0,0.68160,0.67530,-28.00,1007.0,1146685.2,2327.085,0
3,-499.0,0.8570,10.0,5.000,11.0,-7.00,11.0,-236.4,308582.280,-1474.0,...,95373.0,385186.500,675000.0,1.64100,1.63000,-24.00,493.0,396765.0,0.000,0
4,-1557.0,0.8570,70.0,11.664,63.0,-48.66,63.0,-1439.0,0.000,0.0,...,44955.0,155761.200,337500.0,-0.60100,-0.60250,0.00,0.0,0.0,0.000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246004,-1578.0,1.0000,22.0,7.332,22.0,-59.50,22.0,-1445.0,1448700.600,-1458.0,...,0.0,76708.500,180000.0,-0.60100,-0.60250,-67.00,897.0,3995595.0,0.000,0
246005,-1304.0,0.7856,352.0,25.140,163.0,-31.40,163.0,-964.0,0.000,-128.0,...,51668.1,189968.280,381226.6,-0.60100,-0.60250,-17.00,642.0,450000.0,0.000,0
246006,0.0,0.0000,0.0,0.000,0.0,0.00,0.0,0.0,107654.914,-1733.0,...,0.0,0.000,0.0,-0.57900,-0.58060,-28.00,1048.0,450000.0,0.000,0
246007,0.0,0.0000,0.0,0.000,0.0,0.00,0.0,0.0,427754.500,-1122.0,...,0.0,0.000,0.0,-0.60100,-0.60250,-38.66,880.0,270000.0,0.000,0


In [17]:
drop_lstm_select_combination_val = drop_lstm_data_val.loc[:, drop_lstm_combination_feature]

In [18]:
drop_lstm_select_combination_test = drop_lstm_data_test.loc[:, drop_lstm_combination_feature]

In [19]:
drop_lstm_select_combination_train_X = drop_lstm_select_combination_train.drop(["TARGET"], axis = 1)
drop_lstm_select_combination_train_y = drop_lstm_select_combination_train["TARGET"]
drop_lstm_select_combination_train_X.shape

(246009, 140)

In [21]:
import imblearn
from imblearn.combine import SMOTETomek
start = time.time()
smt = SMOTETomek()
smt.fit(drop_lstm_select_combination_train_X, drop_lstm_select_combination_train_y)
X_train_drop_lstm_resample, y_train__drop_lstm_resample = smt.fit_resample(drop_lstm_select_combination_train_X, drop_lstm_select_combination_train_y)
end = time.time()
print("综合采样时间：", end-start)

综合采样时间：8327.239233732223511


In [22]:
drop_lstm_resample_data = pd.concat([X_train_drop_lstm_resample, y_train__drop_lstm_resample], axis = 1)
drop_lstm_resample_data.to_csv("C:/data/processing_data/dataset/drop_lstm_train_resample.csv")

In [23]:
import gc
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score,accuracy_score,roc_curve, plot_roc_curve
from sklearn import metrics
def val_lightgbm(train, val):
    
    print("Starting LightGBM. Train shape: {}, val shape: {}".format(train.shape, val.shape))
    feats = [f for f in train.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    
    gbm = lgb.LGBMClassifier(
            boosting_type='gbdt',
            num_leaves=106,
            max_depth=-1,
            learning_rate=0.012,
            n_estimators= 10000,
            objective='binary',
            class_weight=None,
            min_split_gain=0.0,
            min_child_weight=0.001,
            min_child_samples=390,
            subsample=0.717756,
            subsample_freq=1,
            colsample_bytree=0.614937,
            reg_alpha=0.512999,
            reg_lambda=0.382687,
            subsample_for_bin=80000,
            random_state=42,
            n_jobs=-1
        )
    X_train = train[feats].to_numpy()
    y_train = train['TARGET'].values  
    X_val = val[feats].to_numpy()                   
    y_val = val['TARGET'].values 
        
    gbm.fit(X_train,y_train)
    gbm_y_pre_train = gbm.predict(X_train)
    gbm_y_pre= gbm.predict(X_val)
    gbm_y_proba_train = gbm.predict_proba(X_train)
    gbm_y_proba= gbm.predict_proba(X_val)
    fpr,tpr,thresholds = roc_curve(y_val,gbm_y_proba[:,1])
    ks = max(tpr-fpr)
    
    print("==============================train==================================")
    gbm_accuracy_score_train = accuracy_score(y_train,gbm_y_pre_train)
    
    gbm_auc_train = roc_auc_score(y_train,gbm_y_proba_train[:,1])
    
    print('gbm_accuracy_score: %f,\ngbm_auc: %f'
          %(gbm_accuracy_score_train,
            gbm_auc_train))
    
    print("---------------------------------val---------------------------------")
    gbm_accuracy_score= accuracy_score(y_val,gbm_y_pre)
  
    gbm_auc = roc_auc_score(y_val,gbm_y_proba[:,1])
    print('gbm_accuracy_score: %f,\ngbm_auc: %f,\ngbm_ks:%f'
          %(gbm_accuracy_score,
        
    
            gbm_auc,
            ks))
    print("======================================================================")
    importance_df = pd.DataFrame()              
    importance_df["feature"] = feats
    importance_df["importance"] = gbm.feature_importances_
    
    
    return gbm,importance_df

def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout

In [24]:
start = time.time()
gbm, feature_imp = val_lightgbm(drop_lstm_resample_data,drop_lstm_select_combination_val)
end = time.time()
print("训练时间：", end -start)

Starting LightGBM. Train shape: (452198, 141), val shape: (30751, 141)
gbm_accuracy_score: 0.998910,
gbm_auc: 1.000000
---------------------------------val---------------------------------
gbm_accuracy_score: 0.918929,
gbm_auc: 0.764486,
gbm_ks:0.396506
训练时间： 939.7994265556335


In [25]:
def lightgbm_test(gbm, test):
    feats = [f for f in test.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']] # 固定列了

    gbm_y_pre = gbm.predict(test[feats].to_numpy())
    gbm_y_proba= gbm.predict_proba(test[feats])  # 验证训练模型没有这个'Booster' object has no attribute 'predict_proba'属性
    fpr,tpr,thresholds = roc_curve(test["TARGET"],gbm_y_proba[:,1])
    ks = max(tpr-fpr)


    gbm_accuracy_score=accuracy_score(test["TARGET"],gbm_y_pre)

    gbm_auc=roc_auc_score(test["TARGET"],gbm_y_proba[:,1])
    print('gbm_accuracy_score: %f,\ngbm_auc: %f,\ngbm_ks:%f'
          %(gbm_accuracy_score,
            gbm_auc,
            ks))

In [26]:
lightgbm_test(gbm, drop_lstm_select_combination_test)

gbm_accuracy_score: 0.921076,
gbm_auc: 0.752975,
gbm_ks:0.374338


In [27]:
import joblib
joblib.dump(gbm, 'C:/data/processing_data/dataset/drop_lstm_data_modle.pkl')

['C:/data/processing_data/dataset/drop_lstm_data_modle.pkl']

# 去掉特征选择部分

In [10]:
import time
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

def reduce_mem_usage(data, verbose = True):
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))
    
    for col in data.columns:
        col_type = data[col].dtype
        
        if col_type != object:
            c_min = data[col].min()
            c_max = data[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    data[col] = data[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return data

In [11]:
all_data = pd.read_csv("C:/data/processing_data/all_data.csv")
all_data = reduce_mem_usage(all_data)

Memory usage of dataframe: 1081.56 MB
Memory usage after optimization: 340.48 MB
Decreased by 68.5%


In [12]:
all_data.shape

(307511, 461)

In [13]:
def split_train(data,test_ratio):
   #  data = data.iloc[0:20000,:]
    np.random.seed(0)  # 设置随机种子确保实验可重复
    shuffled_indices=np.random.permutation(len(data)) # 随机片
    test_set_size=int(len(data)*test_ratio)
    test_indices =shuffled_indices[:test_set_size]
    train_indices=shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]

In [14]:
no_select_all_data_train, no_select_all_data_valtest = split_train(all_data, 0.2)
no_select_all_data_val,no_select_all_data_test = split_train( no_select_all_data_valtest, 0.5)
no_select_all_data_train.to_csv("C:/data/processing_data/dataset/no_select_all_data_train.csv")
no_select_all_data_val.to_csv("C:/data/processing_data/dataset/no_select_all_data_val.csv")
no_select_all_data_test.to_csv("C:/data/processing_data/dataset/no_select_all_data_test.csv")

In [15]:
no_select_all_data_train_X = no_select_all_data_train.drop(["TARGET"], axis = 1)
no_select_all_data_train_y = no_select_all_data_train["TARGET"]
no_select_all_data_train_X.shape

(246009, 460)

In [16]:
import imblearn
from imblearn.combine import SMOTETomek
start = time.time()
smt = SMOTETomek()
smt.fit(no_select_all_data_train_X, no_select_all_data_train_y)
X_train_drop_lstm_resample, y_train__drop_lstm_resample = smt.fit_resample(no_select_all_data_train_X, no_select_all_data_train_y)
end = time.time()
print("综合采样时间：", end-start)

综合采样时间：14314.96342658996582


In [17]:
no_select_all_data_resample_data = pd.concat([X_train_drop_lstm_resample, y_train__drop_lstm_resample], axis = 1)
no_select_all_data_resample_data.to_csv("C:/data/processing_data/dataset/no_select_all_data_train_resample.csv")

In [18]:
import gc
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score,accuracy_score,roc_curve, plot_roc_curve
from sklearn import metrics
def val_lightgbm(train, val):
    
    print("Starting LightGBM. Train shape: {}, val shape: {}".format(train.shape, val.shape))
    feats = [f for f in train.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    
    gbm = lgb.LGBMClassifier(
            boosting_type='gbdt',
            num_leaves=106,
            max_depth=-1,
            learning_rate=0.012,
            n_estimators= 10000,
            objective='binary',
            class_weight=None,
            min_split_gain=0.0,
            min_child_weight=0.001,
            min_child_samples=390,
            subsample=0.717756,
            subsample_freq=1,
            colsample_bytree=0.614937,
            reg_alpha=0.512999,
            reg_lambda=0.382687,
            subsample_for_bin=80000,
            random_state=42,
            n_jobs=-1
        )
    X_train = train[feats].to_numpy()
    y_train = train['TARGET'].values  
    X_val = val[feats].to_numpy()                   
    y_val = val['TARGET'].values 
        
    gbm.fit(X_train,y_train)
    gbm_y_pre_train = gbm.predict(X_train)
    gbm_y_pre= gbm.predict(X_val)
    gbm_y_proba_train = gbm.predict_proba(X_train)
    gbm_y_proba= gbm.predict_proba(X_val)
    fpr,tpr,thresholds = roc_curve(y_val,gbm_y_proba[:,1])
    ks = max(tpr-fpr)
    
    print("==============================train==================================")
    gbm_accuracy_score_train = accuracy_score(y_train,gbm_y_pre_train)
    
    gbm_auc_train = roc_auc_score(y_train,gbm_y_proba_train[:,1])
    
    print('gbm_accuracy_score: %f,\ngbm_auc: %f'
          %(gbm_accuracy_score_train,
            gbm_auc_train))
    
    print("---------------------------------val---------------------------------")
    gbm_accuracy_score= accuracy_score(y_val,gbm_y_pre)
  
    gbm_auc = roc_auc_score(y_val,gbm_y_proba[:,1])
    print('gbm_accuracy_score: %f,\ngbm_auc: %f,\ngbm_ks:%f'
          %(gbm_accuracy_score,
        
    
            gbm_auc,
            ks))
    print("======================================================================")
    importance_df = pd.DataFrame()              
    importance_df["feature"] = feats
    importance_df["importance"] = gbm.feature_importances_
    
    
    return gbm,importance_df

def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout

In [20]:
start = time.time()
gbm, feature_imp = val_lightgbm(no_select_all_data_resample_data,no_select_all_data_val)
end = time.time()
print("训练时间：", end -start)

Starting LightGBM. Train shape: (452456, 461), val shape: (30751, 461)
gbm_accuracy_score: 0.999976,
gbm_auc: 1.000000
---------------------------------val---------------------------------
gbm_accuracy_score: 0.919125,
gbm_auc: 0.788814,
gbm_ks:0.428678
训练时间： 1993.1008563041687


In [21]:
def lightgbm_test(gbm, test):
    feats = [f for f in test.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']] # 固定列了

    gbm_y_pre = gbm.predict(test[feats].to_numpy())
    gbm_y_proba= gbm.predict_proba(test[feats])  # 验证训练模型没有这个'Booster' object has no attribute 'predict_proba'属性
    fpr,tpr,thresholds = roc_curve(test["TARGET"],gbm_y_proba[:,1])
    ks = max(tpr-fpr)


    gbm_accuracy_score=accuracy_score(test["TARGET"],gbm_y_pre)

    gbm_auc=roc_auc_score(test["TARGET"],gbm_y_proba[:,1])
    print('gbm_accuracy_score: %f,\ngbm_auc: %f,\ngbm_ks:%f'
          %(gbm_accuracy_score,
            gbm_auc,
            ks))

In [22]:
lightgbm_test(gbm, no_select_all_data_test)

gbm_accuracy_score: 0.918507,
gbm_auc: 0.785006,
gbm_ks:0.428309


In [23]:
import joblib
joblib.dump(gbm, 'C:/data/processing_data/dataset/no_select_all_data_modle.pkl')

['C:/data/processing_data/dataset/no_select_all_data_modle.pkl']

# 去掉两个部分

In [1]:
import time
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")
def reduce_mem_usage(data, verbose = True):
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))
    
    for col in data.columns:
        col_type = data[col].dtype
        
        if col_type != object:
            c_min = data[col].min()
            c_max = data[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    data[col] = data[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return data

def split_train(data,test_ratio):
   #  data = data.iloc[0:20000,:]
    np.random.seed(0)  # 设置随机种子确保实验可重复
    shuffled_indices=np.random.permutation(len(data)) # 随机片
    test_set_size=int(len(data)*test_ratio)
    test_indices =shuffled_indices[:test_set_size]
    train_indices=shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]


In [2]:
all_data = pd.read_csv("C:/data/processing_data/all_data.csv")
all_data = reduce_mem_usage(all_data)

Memory usage of dataframe: 1081.56 MB
Memory usage after optimization: 340.48 MB
Decreased by 68.5%


In [3]:
columns = all_data.columns.tolist()

In [4]:
droplist = [i for i in columns if "LSTM_" in i]

In [5]:
drop_lstm_data = all_data.drop(columns = droplist, axis = 1)

In [6]:
drop_lstm_data.shape

(307511, 445)

In [7]:
drop_lstm_data_train, drop_lstm_data_valtest = split_train(drop_lstm_data, 0.2)
drop_lstm_data_val,drop_lstm_data_test = split_train(drop_lstm_data_valtest, 0.5)

In [8]:
drop_lstm_train_X = drop_lstm_data_train.drop(["TARGET"], axis = 1)
drop_lstm_train_y = drop_lstm_data_train["TARGET"]
drop_lstm_train_X.shape

(246009, 444)

In [9]:
import imblearn
from imblearn.combine import SMOTETomek
start = time.time()
smt = SMOTE()
smt.fit(drop_lstm_train_X, drop_lstm_train_y)
X_train_drop_lstm_resample, y_train__drop_lstm_resample = smt.fit_resample(drop_lstm_train_X, drop_lstm_train_y)
end = time.time()
print("综合采样时间：", end-start)

综合采样时间：12215.759775400161743


In [10]:
# drop_lstm_resample_data = pd.concat([X_train_drop_lstm_resample, y_train__drop_lstm_resample], axis = 1)
# drop_lstm_resample_data.to_csv("C:/data/processing_data/dataset/drop_lstm_no_select_train_resample.csv")

In [11]:
import gc
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score,accuracy_score,roc_curve, plot_roc_curve
from sklearn import metrics
def val_lightgbm(train, val):
    
    print("Starting LightGBM. Train shape: {}, val shape: {}".format(train.shape, val.shape))
    feats = [f for f in train.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    
    gbm = lgb.LGBMClassifier(
            boosting_type='gbdt',
            num_leaves=106,
            max_depth=-1,
            learning_rate=0.012,
            n_estimators= 10000,
            objective='binary',
            class_weight=None,
            min_split_gain=0.0,
            min_child_weight=0.001,
            min_child_samples=390,
            subsample=0.717756,
            subsample_freq=1,
            colsample_bytree=0.614937,
            reg_alpha=0.512999,
            reg_lambda=0.382687,
            subsample_for_bin=80000,
            random_state=42,
            n_jobs=-1
        )
    X_train = train[feats].to_numpy()
    y_train = train['TARGET'].values  
    X_val = val[feats].to_numpy()                   
    y_val = val['TARGET'].values 
        
    gbm.fit(X_train,y_train)
    gbm_y_pre_train = gbm.predict(X_train)
    gbm_y_pre= gbm.predict(X_val)
    gbm_y_proba_train = gbm.predict_proba(X_train)
    gbm_y_proba= gbm.predict_proba(X_val)
    fpr,tpr,thresholds = roc_curve(y_val,gbm_y_proba[:,1])
    ks = max(tpr-fpr)
    
    print("==============================train==================================")
    gbm_accuracy_score_train = accuracy_score(y_train,gbm_y_pre_train)
    
    gbm_auc_train = roc_auc_score(y_train,gbm_y_proba_train[:,1])
    
    print('gbm_accuracy_score: %f,\ngbm_auc: %f'
          %(gbm_accuracy_score_train,
            gbm_auc_train))
    
    print("---------------------------------val---------------------------------")
    gbm_accuracy_score= accuracy_score(y_val,gbm_y_pre)
  
    gbm_auc = roc_auc_score(y_val,gbm_y_proba[:,1])
    print('gbm_accuracy_score: %f,\ngbm_auc: %f,\ngbm_ks:%f'
          %(gbm_accuracy_score,
        
    
            gbm_auc,
            ks))
    print("======================================================================")
    importance_df = pd.DataFrame()              
    importance_df["feature"] = feats
    importance_df["importance"] = gbm.feature_importances_
    
    
    return gbm,importance_df

def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout

In [12]:
start = time.time()
gbm, feature_imp = val_lightgbm(drop_lstm_resample_data,drop_lstm_data_val)
end = time.time()
print("训练时间：", end -start)


Starting LightGBM. Train shape: (452076, 445), val shape: (30751, 445)
gbm_accuracy_score:0.957069,
gbm_auc: 0.986956
---------------------------------val---------------------------------
gbm_accuracy_score: 0.921141,
gbm_auc: 0.752374,
gbm_ks:0.360665
训练时间： 2026.3907496929169


In [13]:
# drop_lstm_resample_data.to_csv("C:/data/processing_data/dataset/no_select_no_lstm_data_train.csv")

In [14]:
# drop_lstm_data_val.to_csv("C:/data/processing_data/dataset/no_select_no_lstm_data_val.csv")

In [15]:
# drop_lstm_data_test.to_csv("C:/data/processing_data/dataset/no_select_no_lstm_data_test.csv")

In [16]:
def lightgbm_test(gbm, test):
    feats = [f for f in test.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']] # 固定列了

    gbm_y_pre = gbm.predict(test[feats].to_numpy())
    gbm_y_proba= gbm.predict_proba(test[feats])  # 验证训练模型没有这个'Booster' object has no attribute 'predict_proba'属性
    fpr,tpr,thresholds = roc_curve(test["TARGET"],gbm_y_proba[:,1])
    ks = max(tpr-fpr)


    gbm_accuracy_score=accuracy_score(test["TARGET"],gbm_y_pre)

    gbm_auc=roc_auc_score(test["TARGET"],gbm_y_proba[:,1])
    print('gbm_accuracy_score: %f,\ngbm_auc: %f,\ngbm_ks:%f'
          %(gbm_accuracy_score,
            gbm_auc,
            ks))

In [17]:
start = time.time()
lightgbm_test(gbm, drop_lstm_data_test)
end = time.time()

gbm_accuracy_score: 0.919124,
gbm_auc: 0.741755,
gbm_ks:0.356715


In [None]:
# feature_imp.sort_values(by="importance" , inplace=True, ascending=False) 

# 一个用户需要花费多长时间

In [15]:
import joblib
import pandas as pd
gbm = joblib.load('C:/data/processing_data/dataset/drop_lstm_data_modle.pkl')

In [16]:

def lightgbm_test(gbm, test):
    feats = [f for f in test.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']] # 固定列了

    gbm_y_pre = gbm.predict(test[feats].to_numpy())
    gbm_y_proba= gbm.predict_proba(test[feats])  # 验证训练模型没有这个'Booster' object has no attribute 'predict_proba'属性
    fpr,tpr,thresholds = roc_curve(test["TARGET"],gbm_y_proba[:,1])
    ks = max(tpr-fpr)


    gbm_accuracy_score=accuracy_score(test["TARGET"],gbm_y_pre)

    gbm_auc=roc_auc_score(test["TARGET"],gbm_y_proba[:,1])
    print('gbm_accuracy_score: %f,\ngbm_auc: %f,\ngbm_ks:%f'
          %(gbm_accuracy_score,
            gbm_auc,
            ks))

In [21]:
drop_lstm_data_test = pd.read_csv("C:/data/processing_data/dataset/drop_lstm_data_test.csv")

In [22]:
drop_lstm_dataframe_select = pd.read_csv("C:/data/processing_data/dataset/drop_lstm_dataframe_select.csv")
drop_lstm_combination_feature = [name for name in drop_lstm_dataframe_select['combination'].tolist() if len(name) > 2] # >2 过滤掉空字符
drop_lstm_combination_feature.append("TARGET")
drop_lstm_select_combination_test = drop_lstm_data_test.loc[:, drop_lstm_combination_feature]

In [25]:
drop_lstm_select_combination_test .shape

(30751, 141)

In [28]:
import time
import gc
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score,accuracy_score,roc_curve, plot_roc_curve
from sklearn import metrics
start = time.time()
lightgbm_test(gbm, drop_lstm_select_combination_test)
end = time.time()

gbm_accuracy_score: 0.921076,
gbm_auc: 0.752975,
gbm_ks:0.374338


In [29]:
print(end - start)

17.87284207344055


In [30]:
17.87284207344055/30751

0.0005812117353400069

# 特征选择花费的时间

In [1]:
import pandas as pd

In [2]:
drop_lstm_data_train = pd.read_csv("C:/data/processing_data/dataset/drop_lstm_data_train.csv")
drop_lstm_data_val = pd.read_csv("C:/data/processing_data/dataset/drop_lstm_data_val.csv")
drop_lstm_data_test = pd.read_csv("C:/data/processing_data/dataset/drop_lstm_data_test.csv")

In [3]:
drop_lstm_select_combination_train_X = drop_lstm_data_train.drop(["TARGET"], axis = 1)
drop_lstm_select_combination_train_y = drop_lstm_data_train["TARGET"]
drop_lstm_select_combination_train_X.shape

(246009, 445)

In [5]:
import imblearn
import time
from imblearn.combine import SMOTETomek
start = time.time()
smt = SMOTETomek()
smt.fit(drop_lstm_select_combination_train_X, drop_lstm_select_combination_train_y)
X_train_drop_lstm_resample, y_train__drop_lstm_resample = smt.fit_resample(drop_lstm_select_combination_train_X, drop_lstm_select_combination_train_y)
end = time.time()
print("综合采样时间：", end-start)

综合采样时间： 3988.7160544395447
