In [250]:
import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [251]:
def cal_ks(label,score):
    fpr,tpr,thresholds= roc_curve(label,score)
    return max(tpr-fpr)
    
def cal_auc_ks(y_true, y_pred, name = None, save=False):
    sample = name + " Sample : %s" % len(y_true)
    auc = name + ' test_set auc : %0.3f' % roc_auc_score(y_true, y_pred)
    ks = name + ' test_set ks  : %0.3f' % cal_ks(y_true,y_pred) 
    print (sample)
    print (auc)
    print (ks)
    print ('----------------cal_auc_ks process successfully!----------------')
    if save:
        if name:
            pass
        else:
            name = ''
        with open(name + '_auc&ks.txt', 'a+') as f:
            f.write(sample + '\n' + auc + '\n' + ks + '\n' + '------------------------------------' + '\n' )
            print ('----------------cal_auc_ks save successfully!----------------')
    return roc_auc_score(y_true, y_pred), cal_ks(y_true,y_pred) 

In [252]:
def dimensions_of_data(file_name):
    max_dimension = -1
    min_dimension = 1e6
    with open(file_name, 'r') as f:    
        lines = f.readlines()
        for line in lines[1:2]:
            data, label = line.split('\t')
            
            for pair in data.split(' '):
                index = int(pair.split(':')[0])
                max_dimension = max(index, max_dimension)
                min_dimension = min(index, min_dimension)
                
    return max_dimension, min_dimension
    

In [253]:
train_dims, train_min_dims = dimensions_of_data('train.data')

print(dims, min_dims)

test_dims, test_min_dims = dimensions_of_data('test.data')
print(dims, min_dims)

dims = max(train_dims, test_dims)

2839 3
2839 3


In [254]:
def read_data(file_name, dims):
    
    with open(file_name, 'r') as f:    
        lines = f.readlines()        
        
        rows, cols = len(lines) - 1, dims + 2
        all_data = np.array([[np.nan for _ in range(cols)] for _ in range(rows)])

        for idx, line in enumerate(lines[1:]):
            data, label = line.split('\t')
            
            all_data[idx, -1] = float(label)
            
            for pair in data.split(' '):
                index = int(pair.split(':')[0])
                val = pair.split(':')[1]
                
                all_data[idx, index] = val
    
    return pd.DataFrame(all_data)
        
        

In [255]:
train = read_data('train.data', dims)
test = read_data('test.data', dims)

In [256]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2831,2832,2833,2834,2835,2836,2837,2838,2839,2840
0,,,,,,,,,1.0,,...,-1.0,1.0,,,,,,,,1.0
1,,,,3.14064,,,,3.34163,0.0,,...,,,1.0,4.45559,3.82093,1.0,1.0,0.0,-1.0,1.0
2,,,,,,,,,0.0,,...,-1.0,1.0,,,,,,,,1.0
3,,,,,,,,,,,...,-1.0,1.0,,,,,,,,1.0
4,,,,,,,,,0.0,,...,-1.0,1.0,1.24573,0.0,0.0,1.0,0.0,0.0,-1.0,0.0


In [257]:
# 分析数据是否平衡
train.iloc[:, -1].value_counts()

1.0    2541
0.0    2458
Name: 2840, dtype: int64

In [243]:
#分析数据缺失值
def count_missing_values(data):
    return data.isna().sum() / data.shape[0]
    
missing_rate = count_missing_values(train)
dropped_colums = list(missing_rate[missing_rate >= 0.5].index)

# def delete_missing_cols(data, dropped_colums):
#     data.drop(dropped_colums, axis=1, inplace=True)
    
# delete_missing_cols(train, dropped_colums)
# delete_missing_cols(test, dropped_colums)

In [261]:
missing_rate

0       0.991798
1       1.000000
2       1.000000
3       0.369674
4       0.984397
5       0.994599
6       0.976795
7       0.369674
8       0.085817
9       0.976795
10      0.988798
11      0.533507
12      0.873775
13      1.000000
14      1.000000
15      0.552310
16      1.000000
17      0.948590
18      0.988798
19      0.997600
20      0.535107
21      1.000000
22      0.984397
23      0.857772
24      0.575315
25      0.894579
26      0.988798
27      0.991798
28      0.999400
29      0.991798
          ...   
2811    0.566513
2812    0.566513
2813    0.566513
2814    0.566513
2815    0.566513
2816    0.566513
2817    0.566513
2818    0.566513
2819    0.566513
2820    0.566513
2821    0.566513
2822    0.566513
2823    0.566513
2824    0.566513
2825    0.566513
2826    0.566513
2827    0.566513
2828    0.566513
2829    0.566513
2830    0.566513
2831    0.566513
2832    0.566513
2833    0.620724
2834    0.620724
2835    0.620724
2836    0.620724
2837    0.620724
2838    0.6207

In [271]:
unique_values = [pd.unique(train.iloc[:, c]).shape[0] for c in range(train.shape[1])]

In [272]:
unique_values

[24,
 1,
 1,
 1729,
 34,
 13,
 47,
 1132,
 3,
 63,
 28,
 27,
 99,
 1,
 1,
 10,
 1,
 9,
 14,
 4,
 469,
 1,
 48,
 12,
 1493,
 179,
 42,
 7,
 2,
 20,
 27,
 1,
 21,
 8,
 2,
 8,
 187,
 63,
 1,
 4,
 1,
 4,
 2,
 1,
 1,
 9,
 2,
 5,
 4,
 144,
 255,
 1,
 1,
 160,
 1964,
 31,
 1,
 11,
 41,
 10,
 52,
 3,
 11,
 16,
 2,
 291,
 4,
 1,
 1,
 1,
 5,
 52,
 180,
 34,
 1005,
 2,
 4,
 242,
 1,
 1,
 3,
 5,
 1,
 93,
 1,
 18,
 436,
 5,
 1,
 12,
 7,
 3,
 16,
 12,
 573,
 4,
 5,
 270,
 5,
 5,
 1,
 321,
 1783,
 1,
 28,
 2,
 2,
 3,
 518,
 19,
 2,
 2,
 1534,
 48,
 2,
 5,
 7,
 2,
 255,
 88,
 92,
 4,
 5,
 6,
 1,
 18,
 1,
 1,
 52,
 1,
 6,
 3,
 1055,
 4,
 2,
 3,
 1,
 1,
 1,
 4,
 115,
 5,
 662,
 17,
 5,
 2,
 5,
 1,
 1,
 5,
 2,
 1,
 4,
 1,
 241,
 1,
 1,
 43,
 203,
 110,
 7,
 12,
 24,
 1,
 86,
 51,
 145,
 104,
 1,
 2,
 1,
 104,
 1,
 1,
 22,
 3,
 5,
 1,
 169,
 1,
 5,
 1,
 38,
 1,
 42,
 2,
 1,
 1,
 128,
 3,
 1,
 3,
 255,
 4,
 22,
 1,
 1,
 7,
 32,
 1,
 7,
 7,
 65,
 1,
 5,
 3,
 147,
 1,
 17,
 7,
 788,
 1,
 5,
 244,
 1158,
 499

In [245]:
train_train_X, train_validation_X, train_train_y, train_validation_y = train_test_split(train.iloc[:, :-1], train.iloc[:, -1],test_size=0.3, random_state=4000, stratify=train.iloc[:, -1])


train_ds = lgb.Dataset(data=train_train_X, label=train_train_y)
train_validation_ds = lgb.Dataset(data=train_validation_X, label=train_validation_y)

# cv_res = lgb.cv(param, train_ds, verbose_eval=True, shuffle=True)

In [248]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
#          "metric": 'binary_logloss',
         "metric": 'binary_error',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}


clf = lgb.train(param, train_ds, num_boost_round=1000, verbose_eval=True, valid_sets=train_validation_ds, early_stopping_rounds=500)
test_prob = clf.predict(test.iloc[:, :-1])

[1]	valid_0's binary_error: 0.492
Training until validation scores don't improve for 500 rounds.
[2]	valid_0's binary_error: 0.492
[3]	valid_0's binary_error: 0.464667
[4]	valid_0's binary_error: 0.43
[5]	valid_0's binary_error: 0.408
[6]	valid_0's binary_error: 0.395333
[7]	valid_0's binary_error: 0.383333
[8]	valid_0's binary_error: 0.373333
[9]	valid_0's binary_error: 0.364667
[10]	valid_0's binary_error: 0.36
[11]	valid_0's binary_error: 0.355333
[12]	valid_0's binary_error: 0.352
[13]	valid_0's binary_error: 0.353333
[14]	valid_0's binary_error: 0.353333
[15]	valid_0's binary_error: 0.356
[16]	valid_0's binary_error: 0.358
[17]	valid_0's binary_error: 0.352
[18]	valid_0's binary_error: 0.348667
[19]	valid_0's binary_error: 0.345333
[20]	valid_0's binary_error: 0.340667
[21]	valid_0's binary_error: 0.336667
[22]	valid_0's binary_error: 0.335333
[23]	valid_0's binary_error: 0.336667
[24]	valid_0's binary_error: 0.338667
[25]	valid_0's binary_error: 0.338
[26]	valid_0's binary_error:

[220]	valid_0's binary_error: 0.333333
[221]	valid_0's binary_error: 0.335333
[222]	valid_0's binary_error: 0.334
[223]	valid_0's binary_error: 0.334
[224]	valid_0's binary_error: 0.333333
[225]	valid_0's binary_error: 0.332667
[226]	valid_0's binary_error: 0.332667
[227]	valid_0's binary_error: 0.334
[228]	valid_0's binary_error: 0.333333
[229]	valid_0's binary_error: 0.334667
[230]	valid_0's binary_error: 0.331333
[231]	valid_0's binary_error: 0.331333
[232]	valid_0's binary_error: 0.33
[233]	valid_0's binary_error: 0.332
[234]	valid_0's binary_error: 0.333333
[235]	valid_0's binary_error: 0.332667
[236]	valid_0's binary_error: 0.332667
[237]	valid_0's binary_error: 0.333333
[238]	valid_0's binary_error: 0.331333
[239]	valid_0's binary_error: 0.332
[240]	valid_0's binary_error: 0.332
[241]	valid_0's binary_error: 0.332
[242]	valid_0's binary_error: 0.332
[243]	valid_0's binary_error: 0.332
[244]	valid_0's binary_error: 0.333333
[245]	valid_0's binary_error: 0.332667
[246]	valid_0's b

[439]	valid_0's binary_error: 0.338667
[440]	valid_0's binary_error: 0.338667
[441]	valid_0's binary_error: 0.338667
[442]	valid_0's binary_error: 0.338
[443]	valid_0's binary_error: 0.336
[444]	valid_0's binary_error: 0.337333
[445]	valid_0's binary_error: 0.338667
[446]	valid_0's binary_error: 0.337333
[447]	valid_0's binary_error: 0.339333
[448]	valid_0's binary_error: 0.339333
[449]	valid_0's binary_error: 0.339333
[450]	valid_0's binary_error: 0.338
[451]	valid_0's binary_error: 0.34
[452]	valid_0's binary_error: 0.34
[453]	valid_0's binary_error: 0.34
[454]	valid_0's binary_error: 0.338667
[455]	valid_0's binary_error: 0.338
[456]	valid_0's binary_error: 0.338
[457]	valid_0's binary_error: 0.337333
[458]	valid_0's binary_error: 0.337333
[459]	valid_0's binary_error: 0.336667
[460]	valid_0's binary_error: 0.337333
[461]	valid_0's binary_error: 0.338667
[462]	valid_0's binary_error: 0.339333
[463]	valid_0's binary_error: 0.339333
[464]	valid_0's binary_error: 0.34
[465]	valid_0's b

[656]	valid_0's binary_error: 0.344667
[657]	valid_0's binary_error: 0.344667
[658]	valid_0's binary_error: 0.344667
[659]	valid_0's binary_error: 0.343333
[660]	valid_0's binary_error: 0.343333
[661]	valid_0's binary_error: 0.343333
[662]	valid_0's binary_error: 0.343333
[663]	valid_0's binary_error: 0.343333
[664]	valid_0's binary_error: 0.343333
[665]	valid_0's binary_error: 0.343333
[666]	valid_0's binary_error: 0.343333
[667]	valid_0's binary_error: 0.343333
[668]	valid_0's binary_error: 0.343333
[669]	valid_0's binary_error: 0.342667
[670]	valid_0's binary_error: 0.345333
[671]	valid_0's binary_error: 0.346
[672]	valid_0's binary_error: 0.345333
[673]	valid_0's binary_error: 0.344667
[674]	valid_0's binary_error: 0.343333
[675]	valid_0's binary_error: 0.344
[676]	valid_0's binary_error: 0.344667
[677]	valid_0's binary_error: 0.345333
[678]	valid_0's binary_error: 0.344
[679]	valid_0's binary_error: 0.344
[680]	valid_0's binary_error: 0.342667
[681]	valid_0's binary_error: 0.344
[

In [280]:
#分析feature importance，做feature selection

def feature_importance(feature_importance, feature_name):
    
    importance = feature_importance
    names = feature_name

    sorted_idx = np.argsort(importance)[::-1]
    importance = importance[sorted_idx]
    names = names[sorted_idx]

    with open('feature_importance_lgb.txt', 'w') as file:
        for index, im in enumerate(importance):
            string = names[index] + ',' + str(im) + '\n'
            file.write(string)
            
feature_importance(np.array(clf.feature_importance()), np.array(clf.feature_name()))

In [249]:
def prop_to_label(data, threshold):
    return np.array([1 if val > threshold else 0 for val in data])


threshold = 0.5
test_label = prop_to_label(test_prob, threshold)

cal_auc_ks(test.iloc[:, -1].values.tolist(), test_label.tolist(), name='results')
print('confusion matrix', confusion_matrix(test.iloc[:, -1].values.tolist(), test_label.tolist()))
print('f1', f1_score(test.iloc[:, -1].values.tolist(), test_label.tolist()))

results Sample : 2999
results test_set auc : 0.631
results test_set ks  : 0.262
----------------cal_auc_ks process successfully!----------------
confusion matrix [[1438  332]
 [ 677  552]]
f1 0.522479886417416
