# 基于模型泛化效果、可解释性考虑，初定选定以下三种模型框架进行尝试

## 1.具有强解释性的决策树模型

## 2.以bagging思想进行集成集成的随机森林模型

## 3.以boosting思想进行集成的xgb模型

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree  
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import roc_auc_score
import joblib

In [2]:
def auc_score_fig(clf,x,y_test):  #计算auc，三参数分别为分类器，特征与标签
    y_probs = clf.predict_proba(x)
    y_probs = y_probs[:,1]
    auc = roc_auc_score(y_test, y_probs)
    return auc

In [3]:
#读取数据
df_train = pd.read_csv(r"C:\Users\17738\贷款违约率预测\数据集\train_clean.csv")
df_test = pd.read_csv(r"C:\Users\17738\贷款违约率预测\数据集\test_clean.csv")

在训练数据中单独划分一个测试集用来评估模型的泛化效果

In [4]:
train_data, test_data = train_test_split(df_train, test_size=0.2, random_state=1)   

In [5]:
train_data.groupby("isDefault").count()

Unnamed: 0_level_0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,...,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
isDefault,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,512351,512351,512351,512351,512351,512351,512351,512351,512351,512351,...,512351,512351,512351,512351,512351,512351,512351,512351,512351,512351
1,127649,127649,127649,127649,127649,127649,127649,127649,127649,127649,...,127649,127649,127649,127649,127649,127649,127649,127649,127649,127649


In [6]:
test_data.groupby("isDefault").count()

Unnamed: 0_level_0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,...,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
isDefault,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,128039,128039,128039,128039,128039,128039,128039,128039,128039,128039,...,128039,128039,128039,128039,128039,128039,128039,128039,128039,128039
1,31961,31961,31961,31961,31961,31961,31961,31961,31961,31961,...,31961,31961,31961,31961,31961,31961,31961,31961,31961,31961


In [7]:
#将特征与标签从数据集中剥离出来
train_features = train_data.drop(columns = ["isDefault"]).copy()
train_target = train_data[["isDefault"]].copy()

test_features = test_data.drop(columns = ["isDefault"]).copy()
test_target = test_data[["isDefault"]].copy()


# 决策树


确定决策树的核心参数以及参数范围

In [46]:
#算力有限，此处跨度设定很粗
max_depth = [i for i in range(5,11,2)] 
min_samples_split = [i for i in range(2,100,20)] 
min_samples_leaf = [i for i in range(1,50,10)] 
max_features = [i/10 for i in range(1,10,2)] 
criterion = ["gini","entropy"]
max_leaf_nodes = [100,500,50]
class_weight = [{0:1,1:1},{0:2,1:1},{0:3,1:1},{0:4,1:1}]

将参数设定为网格搜索可使用的字典形式，因排列组合过多，可分批次进行搜索探索

In [47]:
params_state1 = {"max_depth":max_depth,"min_samples_split":min_samples_split,"min_samples_leaf":min_samples_leaf}
params_state2 = {"max_features":max_features,"criterion":criterion,"max_leaf_nodes":max_leaf_nodes}
params_state3 = {"class_weight":class_weight}

In [None]:
best_params = {}
clf_dt = tree.DecisionTreeClassifier()
for each in [params_state1,params_state2,params_state3]:
    clf_dt_mid = GridSearchCV(estimator=clf_dt, param_grid=each, scoring='roc_auc', cv=5,verbose=2)
    clf_dt_mid.fit(train_features,train_target)
    best_params.update(clf_dt_mid.best_params_)
    clf_dt = tree.DecisionTreeClassifier(**best_params)
#模型训练
clf_dt.fit(train_features,train_target)

In [52]:
#打印模型最优的参数与分数
# print(clf_dt.best_params_)
# print(clf_dt.best_score_)
clf_dt.get_params()

{'ccp_alpha': 0.0,
 'class_weight': {0: 1, 1: 1},
 'criterion': 'entropy',
 'max_depth': 9,
 'max_features': 0.9,
 'max_leaf_nodes': 100,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 62,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [53]:
#输出模型最终在测试集上的表现
clf_dt_auc = auc_score_fig(clf_dt,test_features,test_target)
print("决策树模型最终在测试集上的auc为：",clf_dt_auc)

决策树模型最终在测试集上的auc为： 0.7092574311774612


In [54]:
#模型保存
joblib.dump(clf_dt, r'C:\Users\17738\贷款违约率预测\模型存储\决策树_风控模型.pkl')

['C:\\Users\\17738\\贷款违约率预测\\模型存储\\决策树_风控模型.pkl']

# 随机森林

In [27]:
#算力有限，此处跨度设定很粗
n_estimators =[i for i in range(100,500,200)]
max_depth = [i for i in range(10,100,40)] 
min_samples_split = [i for i in range(2,20,8)] 
min_samples_leaf = [i for i in range(1,10,3)] 
max_features = [i/10 for i in range(5,10,2)] 

In [28]:
params_state1 = {"n_estimators":n_estimators,"max_depth":max_depth}
params_state2 = {"min_samples_split":min_samples_split,"min_samples_leaf":min_samples_leaf}
params_state3 = {"max_features":max_features}

In [None]:
best_params = {}
clf_rf = RandomForestClassifier()
for each in [params_state1,params_state2,params_state3]:
    clf_rf_mid = GridSearchCV(estimator=clf_rf, param_grid=each, scoring='roc_auc', cv=5,verbose=2)
    clf_rf_mid.fit(train_features,train_target)
    best_params.update(clf_rf_mid.best_params_)
    clf_rf = RandomForestClassifier(**best_params)

In [33]:
#模型训练
clf_rf = RandomForestClassifier(max_depth = 10, n_estimators = 300)  #算力有限，此处仅调整部分参数
clf_rf.fit(train_features,train_target)

  clf_rf.fit(train_features,train_target)


RandomForestClassifier(max_depth=10, n_estimators=300)

In [34]:
clf_rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [35]:
#输出模型最终在测试集上的表现
clf_rf_auc = auc_score_fig(clf_rf,test_features,test_target)
print("随机森林模型最终在测试集上的auc为：",clf_rf_auc)

决策树模型最终在测试集上的auc为： 0.7183534799913893


In [36]:
#模型保存
joblib.dump(clf_rf, r'C:\Users\17738\贷款违约率预测\模型存储\随机森林_风控模型.pkl')

['C:\\Users\\17738\\贷款违约率预测\\模型存储\\随机森林_风控模型.pkl']

# XGBOOST

In [37]:
#算力有限，此处跨度设定很粗
n_estimators_list=[i for i in range(1,121,40)]   
learning_rate_list=[i/100 for i in range(1,42,5)]  
max_depth_list=[i for i in range(3,11,4)]
max_delta_step_list=[i for i in range(1,11,4)]  
min_child_weight_list=[i for i in range(1,13,4)]
gamma_list=[i/10 for i in range(1,11,4)]
subsample_list=[i/10 for i in range(5,11,4)]
colsample_bytree_list=[i/10 for i in range(5,11,4)]
scale_pos_weight_list=[i/10 for i in range(5,42,15)]   
reg_alpha_list=[0, 0.001, 0.005, 0.01,0.05, 0.1, 1, 2, 3]
reg_lambda_list=[0, 0.001, 0.005, 0.01,0.05, 0.1, 1, 2, 3]

In [38]:
params_state1 = {'n_estimators': n_estimators_list,'learning_rate':learning_rate_list,'objective':['binary:logistic']}
params_state2 = {'scale_pos_weight': scale_pos_weight_list,'max_delta_step':max_delta_step_list}
params_state3 = {'max_depth':max_depth_list,'min_child_weight':min_child_weight_list,'gamma':gamma_list}
params_state4 = {'subsample':subsample_list,'colsample_bytree':colsample_bytree_list}
params_state5 = {'reg_alpha':reg_alpha_list,'reg_lambda':reg_lambda_list}

In [None]:
best_params = {}
clf_xgb = XGBClassifier()
for each in [params_state1,params_state2,params_state3]:
    clf_xgb_mid = GridSearchCV(estimator=clf_xgb, param_grid=each, scoring='roc_auc', cv=5,verbose=2)
    clf_xgb_mid.fit(train_features,train_target)
    best_params.update(clf_xgb_mid.best_params_)
    clf_xgb = XGBClassifier(**best_params)
#模型训练
clf_xgb.fit(train_features,train_target)

In [42]:
#模型训练
clf_xgb = XGBClassifier()  #算力有限，此处仅展示调参代码，仍使用默认参数作为最终结果
clf_xgb.fit(train_features,train_target)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [43]:
clf_xgb.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [44]:
#输出模型最终在测试集上的表现
clf_xgb_auc = auc_score_fig(clf_xgb,test_features,test_target)
print("XGB模型最终在测试集上的auc为：",clf_xgb_auc)

XGB模型最终在测试集上的auc为： 0.7338299478711378


In [45]:
#模型保存
joblib.dump(clf_xgb, r'C:\Users\17738\贷款违约率预测\模型存储\随XGB_风控模型.pkl')

['C:\\Users\\17738\\贷款违约率预测\\模型存储\\随XGB_风控模型.pkl']

# 项目代码仅为展现本人在数据挖掘方面的技能。因时间与算力有限，当前内容仅完成模型搭建的核心环节，后期会逐步完善以下内容：

# 1.特征衍生：

    时序特征衍生：依据生活规律对时序特征进行拓展衍生
    
    特征交叉衍生：将相关特征利用常用统计函数/线性组合等方式进行交叉组合衍生；
    
    经验衍生：依据业务经验，重新构造一些可解释行的特征；
    
    
# 2.调参优化：

    尝试使用随机调参与贝叶斯调参进行参数组合的探索；
    
# 3.模型融合：

    尝试使用平均法与投票法进行模型融合；
    
# 4.AB测试模块编写：

    编写小样本检验逻辑，测试不同模型的实际执行效果；