In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
import random
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_curve,auc
target_names = [ '0 Attrition','1 Attrition ']
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
def plot_roc_curve(fper, tper,roc_auc):
    plt.plot(fper,tper,lw=2,alpha=0.8,color='b',
         label='ROC Model(area=%0.3f)'% (roc_auc))
    plt.plot([0, 1], [0, 1], color='green', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    plt.legend()
    plt.show()

In [2]:
df1 = pd.read_csv("training.csv")
df1['label'] = df1.Attrition
df1 = df1.drop(['Attrition'],axis=1)

df2 = pd.read_csv("testing.csv")
df2['label'] = df2.Attrition
df2 = df2.drop(['Attrition'],axis=1)

df_cat = df1.select_dtypes(include = ['O'])
for i in df_cat.columns:
    df1[i] = pd.Categorical(df1[i])
    df1[i] = df1[i].cat.codes
    df2[i] = pd.Categorical(df2[i])
    df2[i] = df2[i].cat.codes
df_cat.columns

Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'label'],
      dtype='object')

In [None]:
df1.columns

In [3]:
dftrain,dftest = df1,df2
y = dftrain['label']
X = dftrain.drop(['label'],axis = 1)

In [13]:
models = []
K = 6
num_val_samples = int(len(X)/K)
for i in range(K):
    print('Processing fold #', i)
    x_test = X[i*num_val_samples : (i+1)*num_val_samples]
    y_test = y[i*num_val_samples : (i+1)*num_val_samples]
    x_train = np.concatenate( 
                         [X[: i*num_val_samples],
                         X[(i+1)*num_val_samples :]],
                         axis = 0)
    y_train = np.concatenate(
                         [y[: i*num_val_samples],
                         y[(i+1)*num_val_samples :]],
                         axis = 0)

    categorical_features = df_cat.columns[:-1].tolist()
    lgb_train = lgb.Dataset(x_train,label=y_train,
#                             categorical_feature = categorical_features
                           )
    lgb_valid = lgb.Dataset(dftest.drop(['label'],axis = 1),label=dftest['label'],
#                             categorical_feature = categorical_features,
                            reference=lgb_train)
#     print(x_train)
    def lgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
        return 'f1', f1_score(y_true, y_hat), True

    params = {  
        'boosting_type': 'gbdt',  
        'objective': 'binary',  
        'metric': ['f1'],  
        'tree_learner': 'data',
        'num_leaves': 64,  
        'max_depth': -1,  
        'min_data_in_leaf': 64,  
        'learning_rate': 0.09,  
        'feature_fraction': 0.9,  
        'bagging_fraction': 0.9,  
        'bagging_freq': 10,  
        'lambda_l1': 0,    
        'lambda_l2': 0,  # 越小l2正则程度越高  
        'min_gain_to_split': 0.1,  
        'verbose': 1,  
        'max_bin': 64,
#             'scale_pos_weight':5,
        'min_sum_hessian_in_leaf': 1,
        'task': 'train',

    }  
    params['is_unbalance']=True
    #================================================================================

    results = {}
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round= boost_round,
                    valid_sets=(lgb_valid, lgb_train),
                    valid_names=('validate','train'),
                    early_stopping_rounds = early_stop_rounds,
                    evals_result= results,feval=lgb_f1_score)

    models.append(gbm)

Processing fold # 0




[LightGBM] [Info] Number of positive: 160, number of negative: 821
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 405
[LightGBM] [Info] Number of data points in the train set: 981, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.163099 -> initscore=-1.635349
[LightGBM] [Info] Start training from score -1.635349
[1]	train's f1: 0	validate's f1: 0
Training until validation scores don't improve for 10 rounds
[2]	train's f1: 0	validate's f1: 0
[3]	train's f1: 0	validate's f1: 0
[4]	train's f1: 0	validate's f1: 0
[5]	train's f1: 0	validate's f1: 0
[6]	train's f1: 0	validate's f1: 0
[7]	train's f1: 0.154696	validate's f1: 0.148148
[8]	train's f1: 0.311321	validate's f1: 0.253968
[9]	train's f1: 0.4329	validate's f1: 0.307692
[10]	train's f1: 0.511811	validate's f1: 0.382353
[11]	train's f1: 0.554745	validate's f1: 0.422535
[12]	train's f1: 0.561644	validate's f1: 0.421053
[13]	train's f1: 0.593548	validate's f1: 0.404762
[



[LightGBM] [Info] Number of positive: 158, number of negative: 823
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 406
[LightGBM] [Info] Number of data points in the train set: 981, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.161060 -> initscore=-1.650361
[LightGBM] [Info] Start training from score -1.650361
[1]	train's f1: 0	validate's f1: 0
Training until validation scores don't improve for 10 rounds
[2]	train's f1: 0	validate's f1: 0
[3]	train's f1: 0	validate's f1: 0
[4]	train's f1: 0	validate's f1: 0
[5]	train's f1: 0	validate's f1: 0
[6]	train's f1: 0	validate's f1: 0
[7]	train's f1: 0.0952381	validate's f1: 0.0816327
[8]	train's f1: 0.235897	validate's f1: 0.241379
[9]	train's f1: 0.351852	validate's f1: 0.266667
[10]	train's f1: 0.460317	validate's f1: 0.361111
[11]	train's f1: 0.512821	validate's f1: 0.375
[12]	train's f1: 0.561873	validate's



[LightGBM] [Info] Number of positive: 159, number of negative: 822
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 408
[LightGBM] [Info] Number of data points in the train set: 981, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.162080 -> initscore=-1.642836
[LightGBM] [Info] Start training from score -1.642836
[1]	train's f1: 0	validate's f1: 0
Training until validation scores don't improve for 10 rounds
[2]	train's f1: 0	validate's f1: 0
[3]	train's f1: 0	validate's f1: 0
[4]	train's f1: 0	validate's f1: 0
[5]	train's f1: 0	validate's f1: 0
[6]	train's f1: 0	validate's f1: 0
[7]	train's f1: 0.156425	validate's f1: 0.218182
[8]	train's f1: 0.338164	validate's f1: 0.333333
[9]	train's f1: 0.426087	validate's f1: 0.3125
[10]	train's f1: 0.515873	validate's f1: 0.356164
[11]	train's f1: 0.544776	validate's f1: 0.390244
[12]	train's f1: 0.591549	validate's f1: 0.436782
[13]	train's f1: 0.587459	validate's f1: 0.484211
[



[LightGBM] [Info] Number of positive: 151, number of negative: 830
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 407
[LightGBM] [Info] Number of data points in the train set: 981, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.153925 -> initscore=-1.704146
[LightGBM] [Info] Start training from score -1.704146
[1]	train's f1: 0	validate's f1: 0
Training until validation scores don't improve for 10 rounds
[2]	train's f1: 0	validate's f1: 0
[3]	train's f1: 0	validate's f1: 0
[4]	train's f1: 0	validate's f1: 0
[5]	train's f1: 0	validate's f1: 0
[6]	train's f1: 0.0131579	validate's f1: 0.0816327
[7]	train's f1: 0.122699	validate's f1: 0.115385
[8]	train's f1: 0.27027	validate's f1: 0.237288
[9]	train's f1: 0.388626	validate's f1: 0.30303
[10]	train's f1: 0.455696	validate's f1: 0.356164
[11]	train's f1: 0.527473	validate's f1: 0.425
[12]	train's f1: 0.57931	validate's f1: 0.516854
[13]	train's f1: 0.593548	validate's f1



[LightGBM] [Info] Number of positive: 157, number of negative: 824
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 408
[LightGBM] [Info] Number of data points in the train set: 981, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.160041 -> initscore=-1.657925
[LightGBM] [Info] Start training from score -1.657925
[1]	train's f1: 0	validate's f1: 0
Training until validation scores don't improve for 10 rounds
[2]	train's f1: 0	validate's f1: 0
[3]	train's f1: 0	validate's f1: 0
[4]	train's f1: 0	validate's f1: 0
[5]	train's f1: 0	validate's f1: 0
[6]	train's f1: 0	validate's f1: 0
[7]	train's f1: 0.150289	validate's f1: 0.153846
[8]	train's f1: 0.29	validate's f1: 0.3125
[9]	train's f1: 0.384279	validate's f1: 0.333333
[10]	train's f1: 0.452381	validate's f1: 0.352941
[11]	train's f1: 0.505415	validate's f1: 0.37037
[12]	train's f1: 0.554054	validate's f1: 0.363636
[13]	train's f1: 0.574194	validate's f1: 0.382979
[14]	t



[LightGBM] [Info] Number of positive: 165, number of negative: 816
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 408
[LightGBM] [Info] Number of data points in the train set: 981, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.168196 -> initscore=-1.598469
[LightGBM] [Info] Start training from score -1.598469
[1]	train's f1: 0	validate's f1: 0
Training until validation scores don't improve for 10 rounds
[2]	train's f1: 0	validate's f1: 0
[3]	train's f1: 0	validate's f1: 0
[4]	train's f1: 0	validate's f1: 0
[5]	train's f1: 0	validate's f1: 0
[6]	train's f1: 0	validate's f1: 0
[7]	train's f1: 0.0804598	validate's f1: 0.156863
[8]	train's f1: 0.243655	validate's f1: 0.245614
[9]	train's f1: 0.383929	validate's f1: 0.307692
[10]	train's f1: 0.466926	validate's f1: 0.352941
[11]	train's f1: 0.52669	validate's f1: 0.473684
[12]	train's f1: 0.565657	validate's f1: 0.45
[13]	train's f1: 0.592593	validate's f1: 0.477273
[14

In [45]:
w = []
for model in models:
    y_pred_test = model.predict(dftest.drop('label',axis = 1), num_iteration=model.best_iteration)
    w.append(f1_score(dftest['label'],y_pred_test>0.5,average='binary'))
print(w)
y_pred_train = np.array(models[0].predict(dftrain.drop('label',axis = 1), num_iteration=models[0].best_iteration))*w[0]
y_pred_test = np.array(models[0].predict(dftest.drop('label',axis = 1), num_iteration=models[0].best_iteration))*w[0]

for e,model in enumerate(models[1:]):
    y_pred_train+=np.array(model.predict(dftrain.drop('label',axis = 1), num_iteration=model.best_iteration))*w[e]
    y_pred_test+=np.array(model.predict(dftest.drop('label',axis = 1), num_iteration=model.best_iteration))*w[e]
y_pred_train/=sum(w)
y_pred_test/=sum(w)
print('train accuracy: {:.5} '.format(accuracy_score(dftrain['label'],y_pred_train>0.5)))
print('valid accuracy: {:.5} \n'.format(accuracy_score(dftest['label'],y_pred_test>0.5)))

# lgb.plot_metric(best_r)
# lgb.plot_importance(best_model,importance_type = "gain")

[0.4421052631578947, 0.41379310344827586, 0.5161290322580645, 0.5168539325842697, 0.4705882352941177, 0.504201680672269]
train accuracy: 0.87086 
valid accuracy: 0.81911 



In [46]:
confusion_matrix(dftest['label'],y_pred_test>0.5)

array([[216,  30],
       [ 23,  24]])

In [47]:
print(classification_report(dftest['label'],y_pred_test>0.5))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89       246
           1       0.44      0.51      0.48        47

    accuracy                           0.82       293
   macro avg       0.67      0.69      0.68       293
weighted avg       0.83      0.82      0.82       293



In [48]:
f1_score(dftest['label'],y_pred_test>0.5,average='binary')

0.4752475247524752