In [29]:
import gc
import time
from datetime import date
import pandas as pd
import pandas_profiling as pdp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',100)

In [30]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold,GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

## 读取数据

In [31]:
df_all = pd.read_csv('./input/bank-additional-train.csv')
df_all['y'].replace(['no','yes'],[0,1],inplace=True)

In [32]:
df_all.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,32,management,married,professional.course,unknown,no,no,cellular,jul,mon,565,4,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,0
1,41,admin.,married,high.school,no,yes,yes,cellular,apr,mon,982,1,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
2,32,admin.,married,university.degree,no,yes,no,cellular,may,mon,188,2,999,0,nonexistent,-1.8,92.893,-46.2,1.299,5099.1,0
3,37,admin.,married,high.school,no,unknown,unknown,cellular,jul,thu,56,3,999,0,nonexistent,1.4,93.918,-42.7,4.968,5228.1,0
4,38,admin.,divorced,university.degree,no,no,no,cellular,jul,tue,419,8,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,0


In [33]:
# nan值统计
df_all.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

## 数据探索

In [34]:
# profile = pdp.ProfileReport(df_all, check_correlation=False)
# profile.to_file(outputfile="output_df_train.html")

In [35]:
# profile

## 数据预处理

In [36]:
# 数值型数据处理
# 'age', 'balance', 'duration', 'campaign', 'pdays', 'previous'，emp.var.rate，cons.price.idx，cons.conf.idx，euribor3m，nr.employed 
def standardize_nan(x):
    # 标准化
    x_mean = np.nanmean(x) # 求平均值，但是个数不包括nan
    x_std = np.nanstd(x)
    return (x - x_mean) / x_std    
# 対数変換
## 直方图绘制时分布不均匀的特征
df_all['log_age'] = np.log(df_all['age'])
df_all['log_std_age'] = standardize_nan(df_all['log_age'])
# df_all["log_duration"] = np.log(df_all['duration']+ 1) # duration 字段不能用
df_all["log_campaign"] = np.log(df_all['campaign'] + 1)
df_all["log_pdays"] = np.log(df_all['pdays']- df_all['pdays'].min() + 1)
df_all['log_previous'] = np.log(df_all['previous']+1) # 这里没有+1
df_all = df_all.drop(["age", "duration", "campaign", "pdays","previous"], axis=1) # duration 字段不能用

df_all['log_emp.var.rate'] = np.log(df_all['emp.var.rate']+1) # 这里没有+1
df_all['log_cons.price.idx'] = np.log(df_all['cons.price.idx']+1) # 这里没有+1
df_all['log_euribor3m'] = np.log(df_all['euribor3m']+1) # 这里没有+1
df_all['log_nr.employed '] = np.log(df_all['nr.employed']+1) # 这里没有+1
df_all = df_all.drop(["emp.var.rate","cons.price.idx", "euribor3m", "nr.employed"], axis=1)




In [37]:
# month 文字列与数値的変換
df_all['month'] = df_all['month'].map({'jan': 1,
                                'feb': 2,
                                'mar': 3,
                                'apr': 4,
                                'may': 5,
                                'jun': 6,
                                'jul': 7,
                                'aug': 8,
                                'sep': 9,
                                'oct': 10,
                                'nov': 11,
                                'dec': 12
                                 }).astype(int)
# 1月:0、2月:31、3月:(31+28)、4月:(31+28+31)、 ...
day_sum = pd.Series(np.cumsum([0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30]), index=np.arange(1, 13))
df_all['date'] = (df_all['month'].map(day_sum)).astype(int)    

In [38]:
# 类别型数据
cate_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'day_of_week', 'month', 'poutcome']
df_all = pd.get_dummies(df_all, columns=cate_cols)

In [39]:
df_all.head()

Unnamed: 0,cons.conf.idx,y,log_age,log_std_age,log_campaign,log_pdays,log_previous,log_emp.var.rate,log_cons.price.idx,log_euribor3m,log_nr.employed,date,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,marital_unknown,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,default_no,default_unknown,default_yes,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,contact_cellular,contact_telephone,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,poutcome_failure,poutcome_nonexistent,poutcome_success
0,-42.7,0,3.465736,-0.756985,1.609438,6.907755,0.0,0.875469,4.553013,1.785406,8.561994,181,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
1,-47.1,0,3.713572,0.220837,0.693147,6.907755,0.0,,4.544092,0.87755,8.537015,90,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
2,-46.2,0,3.465736,-0.756985,1.098612,6.907755,0.0,,4.542156,0.832474,8.537015,120,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
3,-42.7,0,3.610918,-0.184179,1.386294,6.907755,0.0,0.875469,4.553013,1.786412,8.561994,181,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
4,-42.7,0,3.637586,-0.078961,2.197225,6.907755,0.0,0.875469,4.553013,1.785238,8.561994,181,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0


In [40]:
df_all.shape,df_all.columns

((37069, 65),
 Index(['cons.conf.idx', 'y', 'log_age', 'log_std_age', 'log_campaign',
        'log_pdays', 'log_previous', 'log_emp.var.rate', 'log_cons.price.idx',
        'log_euribor3m', 'log_nr.employed ', 'date', 'job_admin.',
        'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
        'job_management', 'job_retired', 'job_self-employed', 'job_services',
        'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
        'marital_divorced', 'marital_married', 'marital_single',
        'marital_unknown', 'education_basic.4y', 'education_basic.6y',
        'education_basic.9y', 'education_high.school', 'education_illiterate',
        'education_professional.course', 'education_university.degree',
        'education_unknown', 'default_no', 'default_unknown', 'default_yes',
        'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
        'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
        'day_of_week_fri', 'day_of_week_mon'

In [41]:
cols = [col for col in df_all.columns if col not in ['y']]
X_train,X_test,y_train,y_test=train_test_split(df_all[cols],df_all['y'],test_size=0.3,random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((25948, 64), (11121, 64), (25948,), (11121,))

## 建立模型

In [48]:
gbm = LGBMClassifier(n_estimators=4000,
                    learning_rate=0.03,
                    num_leaves=30,
                    colsample_bytree=.8,
                    subsample=.9,
                    max_depth=7,
                    reg_alpha=.1,
                    reg_lambda=.1,
                    min_split_gain=.01,
                    min_child_weight=2,
                    verbose=1)
# gbm_cv = GridSearchCV(gbm, params2, scoring= 'roc_auc',verbose=False)
gbm.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
        learning_rate=0.03, max_depth=7, min_child_samples=20,
        min_child_weight=2, min_split_gain=0.01, n_estimators=4000,
        n_jobs=-1, num_leaves=30, objective=None, random_state=None,
        reg_alpha=0.1, reg_lambda=0.1, silent=True, subsample=0.9,
        subsample_for_bin=200000, subsample_freq=1, verbose=1)

In [43]:
# gbm_cv.best_params_

In [49]:
y_pred = gbm.predict_proba(X_test)[:,1]
roc_auc_score(y_test,y_pred)

0.7680465064737227

In [50]:
def plot_fea_importance(classifier,X_train):
    plt.figure(figsize=(10,12))
    name = "xgb"
    indices = np.argsort(classifier.feature_importances_)[::-1][:40]
    g = sns.barplot(y=X_train.columns[indices][:40],
                    x=classifier.feature_importances_[indices][:40],orient='h')
    g.set_xlabel("Relative importance", fontsize=12)
    g.set_ylabel("Features", fontsize=12)
    g.tick_params(labelsize=9)
    g.set_title(name + " feature importance")
    plt.show()
plot_fea_importance(xgb,X_train)