In [1]:
import os
# Base directory for every data/result file used below: the current working directory.
path = os.path.abspath(os.curdir)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 1  清洗数据

In [3]:
# Load the feature-extracted train and test tables (tab-separated) from <path>/features.
train, test = (
    pd.read_csv(path + csv_name, sep = '\t')
    for csv_name in ('/features/train.csv', '/features/test.csv')
)

In [4]:
# Missing-value handling: replace every NaN in both tables, in place, with the sentinel -999.
for frame in (train, test):
    frame.fillna(-999, inplace = True)

In [5]:
# All column names of `train` except the first one, printed one per line.
features = train.columns.tolist()
del features[0]
for column_name in features:
    print(column_name)

FLAG
V1
V2
V3
V4
V5
V6
V7
V8
V9
V10
V11
V12
V13
V14
V15
V16
V17
V18
V19
V20
V21
V22
V23
V24
V25
V26
V27
V28
V29
V30
USER_TIM_COUNT
USER_UNI_TIM_COUNT
TIME_GAP_MEAN
TIME_GAP_MAX
TIME_GAP_MIN
TIME_GAP_STD
TIME_GAP_SKEW
TIME_GAP_KURT
ACT_DATE_MEAN
DATE_ACT_MEAN
DATE_REPEAT_COUNT
ACT_COUNT_CON_DAY_MEAN
ACT_COUNT_CON_DAY_MAX
ACT_COUNT_CON_DAY_MIN
ACT_COUNT_CON_DAY_SUM
ACT_COUNT_CON_DAY_STD
ACT_COUNT_MAX
EVT_LBL_1_COUNT
EVT_LBL_2_COUNT
EVT_LBL_3_COUNT
DAY
IS_HIGT_ACT
ACT_COUNT_PER_DAY
ACT_COUNT_PER_DAY_MEAN
ACT_COUNT_PER_DAY_MAX
ACT_COUNT_PER_DAY_MIN
ACT_COUNT_PER_DAY_MODE
ACT_COUNT_PER_DAY_STD
ACT_COUNT_PER_DAY_SKEW
ACT_COUNT_PER_DAY_KURT
ACT_COUNT_PER_DAY_MEDIAN
NEXT_TIME_MAX
NEXT_TIME_STD
NEXT_TIME_MEAN
NEXT_TIME_MIN
WEEK
ACT_COUNT_PER_WEEK
ACT_COUNT_PER_WEEK_MEAN
ACT_COUNT_PER_WEEK_MAX
ACT_COUNT_PER_WEEK_MIN
ACT_COUNT_PER_WEEK_MODE
ACT_COUNT_PER_WEEK_STD
ACT_COUNT_PER_WEEK_SKEW
ACT_COUNT_PER_WEEK_KURT
ACT_COUNT_PER_WEEK_MEDIAN
USER_TIM_COUNT_BEFORE
USER_UNI_TIM_COUNT_BEFORE
TIME_GAP_MEAN

# 2  探索特征数据

In [None]:
# 导入统计分析模块
from scipy import stats

# Pairwise Pearson correlation between every pair of columns listed in `features`.
# Every pair contributes exactly one row [FEATURE_X, FEATURE_Y, r, p], so `statistic`
# can be loaded into a four-column DataFrame. (Collecting all pairs of one outer
# iteration into a single row produces rows of varying length that do not fit
# four columns.)
statistic = []
for i in range(len(features) - 1):
    for j in range(i + 1, len(features)):
        # Correlation coefficient and p-value for this pair of columns
        r, p = stats.pearsonr(train[features[i]], train[features[j]])
        statistic.append([features[i], features[j], r, p])

In [None]:
# Tabulate the pairwise statistics, then release the raw list.
feature_stats = pd.DataFrame(
    data = statistic,
    columns = ['FEATURE_X', 'FEATURE_Y', 'PERSON_R', 'P'],
)
del statistic

In [None]:
# Order the pair table so that rows sharing a FEATURE_X sit together and, inside each
# group, the smallest P comes first with ties broken by the largest PERSON_R.
# A plain multi-key sort is used because DataFrame.groupby() returns a GroupBy object,
# which offers neither sort_values() nor info().
feature_stats = feature_stats.sort_values(['FEATURE_X', 'P', 'PERSON_R'],
                                          ascending = [True, True, False])
feature_stats.info()

In [None]:
# Write the statistics table to a tab-separated file (index = None), then release it.
stats_csv = path + '/features/feature_stats.csv'
feature_stats.to_csv(stats_csv, sep = '\t', index = None)
del feature_stats

In [None]:
# Heatmap of the correlation coefficients among the columns most correlated with FLAG
# (feature_nums rows are taken by nlargest on the FLAG column of the correlation matrix).
corr = train[features].corr()
feature_nums =  101
cols = corr.nlargest(feature_nums, 'FLAG')['FLAG'].index
# Columns are the variables, hence rowvar = False
cm = np.corrcoef(train[cols].values, rowvar = False)

f, ax = plt.subplots(figsize = (100, 100))
sns.set(font_scale = 1.25)
heatmap_options = dict(cbar = True,
                       annot = True,
                       square = True,
                       fmt = '.2f',
                       annot_kws = {'size' : 8},
                       yticklabels = cols.values,
                       xticklabels = cols.values)
hm = sns.heatmap(cm, ax = ax, **heatmap_options)

# Tick label font size
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.show()

In [None]:
# Joint KDE plot of FLAG against every other column in `features`, one figure per column.
# The column is taken from the loop itself rather than from an index variable left over
# from another cell, and the loop covers the whole list instead of a fixed 1..95 range.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign renderable
sns.set_palette('muted')

for feature in features:
    if feature == 'FLAG':
        # FLAG is the x axis of every plot; plotting it against itself is meaningless
        continue
    # jointplot creates its own figure, so no figure is opened beforehand
    sns.jointplot(x = 'FLAG',
                  y = feature,
                  data = train[['FLAG', feature]],
                  kind = 'kde',
                  color = 'b',
                  space = 0.5,
                  ratio = 5)
    plt.suptitle('FLAG - ' + feature, fontsize = 18)
    plt.show()

In [None]:
# Preliminary split into model inputs, target and test identifiers.
# train_x / test_x are kept as DataFrames (no .values) so that column names survive and
# the DataFrame API used on them later in this notebook (.info(), .columns) is available.
# train_y stays a NumPy array; later cells call .ravel() on it.
train_x = train.drop(['USRID', 'FLAG', 'DAY'], axis = 1)
train_y = train['FLAG'].values
del train

test_x = test.drop(['USRID', 'DAY'], axis = 1)
UID = test[['USRID']]   # identifier column, kept for the result files
del test

In [None]:
# Summarise the training inputs. Wrapping in a DataFrame makes this work whether
# train_x is a DataFrame or a bare NumPy array (arrays have no .info()).
pd.DataFrame(train_x).info()

In [None]:
# Summarise the target. train_y is built with .values, i.e. a NumPy array without
# .info(), so it is wrapped in a one-column DataFrame first.
pd.DataFrame({'FLAG': train_y}).info()

In [None]:
# Summarise the test inputs. Wrapping in a DataFrame makes this work whether
# test_x is a DataFrame or a bare NumPy array (arrays have no .info()).
pd.DataFrame(test_x).info()

In [None]:
# Summarise the USRID identifier frame that is later joined to the predictions.
UID.info()

# 3  XGBoost建模预测

In [None]:
import operator
import xgboost as xgb

3.1 保留所有特征

3.1.1 将训练集划分为X_dtrain, X_deval, y_dtrain, y_deval，训练xgboost模型

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, learning_curve, train_test_split

In [None]:
# Names of the model input columns.
# train_x may be a bare NumPy array (no .columns); in that case fall back to
# positional names f0, f1, ... so this cell does not raise.
if hasattr(train_x, 'columns'):
    features = list(train_x.columns)
else:
    features = ['f{}'.format(position) for position in range(train_x.shape[1])]

In [None]:
# Hold-out split (70 % train / 30 % eval) with a fixed seed, plus the class-balance weight.
randomState = 10
np.random.seed(randomState)

split_parts = train_test_split(train_x,
                               train_y,
                               test_size = 0.3,
                               random_state = randomState)
X_dtrain, X_deval, y_dtrain, y_deval = split_parts

# Ratio of negative (FLAG == 0) to positive (FLAG == 1) samples over the whole training set;
# passed to XGBoost as scale_pos_weight in the cells below.
negative_count = (train_y == 0).sum()
positive_count = (train_y == 1).sum()
weights = negative_count / (1.0 * positive_count)
print('平衡数据调整权重：', weights)

In [None]:
# Grid search over the number of trees and the tree depth (hold-out training part).
# GridSearchCV is taken from sklearn.model_selection: the sklearn.grid_search module and
# the grid_scores_ attribute were removed in scikit-learn 0.20, so the per-candidate
# results are read from cv_results_.
from sklearn.model_selection import GridSearchCV

# Parameters to search
tree_param_grid = {'n_estimators': list(range(300, 1100, 100)),      # number of trees
                   'max_depth': list(range(3, 10))                   # tree depth; larger overfits more easily
                  }
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(X_dtrain, y_dtrain.ravel())

# Print the score of every candidate
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Grid search over min_child_weight (hold-out training part).
# Per the original note: min_child_weight = 1 is taken to mean a leaf needs at least
# 100 samples (NOTE(review): that assumes a per-sample hessian near 0.01 — confirm);
# the smaller the value, the easier it is to overfit.
tree_param_grid = {'min_child_weight': list(range(1, 10))}
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(X_dtrain, y_dtrain.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Grid search over row and column subsampling ratios (hold-out training part).
tree_param_grid = {'subsample': list(np.linspace(0.5, 1, 6)),        # share of samples drawn per tree, usually 0.5~1
                   'colsample_bytree': list(np.linspace(0.5, 1, 6))  # share of features drawn per tree, usually 0.5~1
                   }
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(X_dtrain, y_dtrain.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Grid search over the L1 / L2 regularisation strengths (hold-out training part).
tree_param_grid = {'reg_alpha': list(range(1, 10)),                  # L1 regularisation term on the weights
                   'reg_lambda': list(range(1, 10))                  # L2 regularisation term on the weights
                  }
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(X_dtrain, y_dtrain.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Grid search over the random seed (hold-out training part).
tree_param_grid = {'seed': list(range(100, 1001, 100))               # random seed
                  }
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(X_dtrain, y_dtrain.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Grid search over the random seed (hold-out training part).
# NOTE(review): this cell searches the same 'seed' grid on the same data as the cell
# right before it — confirm whether a different parameter was intended here.
tree_param_grid = {'seed': list(range(100, 1001, 100))               # random seed
                  }
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(X_dtrain, y_dtrain.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Fit an XGBoost classifier with default parameters on the training part and report
# its accuracy on the evaluation part.
model1 = xgb.XGBClassifier()
model1.fit(X_dtrain, y_dtrain)
y_pred = model1.predict(X_deval)
print('Accuracy : ', accuracy_score(y_deval, y_pred))
del y_pred

In [None]:
# Feature-importance chart for model1.
plt.rcParams['font.sans-serif'] = ['SimHei']   # font used so the Chinese labels below render
plt.rcParams['axes.unicode_minus'] = False     # keep the minus sign renderable

# Plot the importance scores of the top 100 features
colours = plt.cm.Set1(np.linspace(0, 1, 100))
fig = plt.figure(figsize = (16, 24))
ax = fig.add_subplot(111)
ax = xgb.plot_importance(model1,
                         # The keyword is max_num_features; any unrecognised keyword is
                         # forwarded to the bar plot and rejected there.
                         max_num_features = 100,
                         height = 0.8,
                         color = colours,
                         grid = False,
                         show_values = True,
                         importance_type = 'cover',
                         ax = ax)
# Thicken the frame of the chart
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(2)

ax.set_xlabel('重要性评分', size = 16)
ax.set_ylabel('特征', size = 16)
# Resize the feature-name tick labels without re-assigning their text
ax.tick_params(axis = 'y', labelsize = 12)
ax.set_title('模型学习后特征重要性排序', size = 20)

In [None]:
# Visualise a decision tree of model1 with Graphviz.
# The local Graphviz install directory is added to PATH only when it exists and is not
# already listed, so re-running the cell does not keep growing PATH.
graphviz_bin = 'D:/Graphviz2.38/bin'
search_path = os.environ.get('PATH', '')
if os.path.isdir(graphviz_bin) and graphviz_bin not in search_path.split(os.pathsep):
    os.environ['PATH'] = search_path + os.pathsep + graphviz_bin
xgb.to_graphviz(model1)

In [None]:
# Learning curve scored by accuracy with 5-fold cross-validation on the training part.
# The estimator is referenced through the `xgb` module alias, the only name under which
# xgboost is imported in this notebook.
trainSizes, trainScores, crossValScores = learning_curve(
    xgb.XGBClassifier(),
    X_dtrain,
    y_dtrain,
    cv = 5,
    scoring = 'accuracy'
)

In [None]:
# Plot the learning curve: mean score per training size with a band of one standard
# deviation across the cross-validation folds.
trainScoresMean = np.mean(trainScores, axis = 1)
trainScoresStd = np.std(trainScores, axis = 1)
crossValScoresMean = np.mean(crossValScores, axis = 1)
crossValScoresStd = np.std(crossValScores, axis = 1)

fig = plt.figure(figsize = (12, 8))
plt.fill_between(trainSizes,
                 trainScoresMean - trainScoresStd,
                 trainScoresMean + trainScoresStd,
                 alpha = 0.1,
                 color = 'b')
plt.fill_between(trainSizes,
                 crossValScoresMean - crossValScoresStd,
                 crossValScoresMean + crossValScoresStd,
                 alpha = 0.1,
                 color = 'r')
# Reuse the means computed above instead of recomputing them
plt.plot(trainSizes,
         trainScoresMean,
         'o-',
         label = 'TRAIN',
         color = 'b')
plt.plot(trainSizes,
         crossValScoresMean,
         'o-',
         label = 'CROSS-VAL',
         color = 'r')

# Thicken the frame of the chart
ax = plt.gca()
for axis in ['top', 'bottom', 'left', 'right']:
    ax.spines[axis].set_linewidth(2)

handles, labels = ax.get_legend_handles_labels()
plt.legend(handles,
           ['TRAIN', 'CROSS-VAL'],
           bbox_to_anchor = (0.8, 0.15),
           loc = 2,
           borderaxespad = 0,
           fontsize = 16)
plt.xlabel('TRAIN', size = 16)
# The scores come from learning_curve(..., scoring = 'accuracy'), so the axis is accuracy
plt.ylabel('Accuracy', size = 16)
plt.title('Learning Curves (all features and part samples)', size = 20)

In [None]:
# Release the hold-out split arrays; they are not used after this point.
del X_dtrain, X_deval, y_dtrain, y_deval

In [None]:
# Predict the test set with model1 and save USRID next to the prediction (column RST)
# as a tab-separated file.
res_pred = model1.predict(test_x)
a = pd.DataFrame(data = res_pred, columns = ['RST'])
res = pd.concat(objs = [UID, a.RST], axis = 'columns')
split_result_csv = path + '/result/all_feat_split_result.csv'
res.to_csv(split_result_csv, sep = '\t', index = None)
del res_pred

3.1.2 用所有训练集训练xgboost模型

In [None]:
# Grid search over the number of trees and the tree depth (whole training set).
tree_param_grid = {'n_estimators': list(range(300, 1100, 100)),      # number of trees
                   'max_depth': list(range(3, 10))                   # tree depth; larger overfits more easily
                  }
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(train_x, train_y.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Grid search over min_child_weight (whole training set).
# Per the original note: min_child_weight = 1 is taken to mean a leaf needs at least
# 100 samples (NOTE(review): that assumes a per-sample hessian near 0.01 — confirm);
# the smaller the value, the easier it is to overfit.
tree_param_grid = {'min_child_weight': list(range(1, 10))}
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(train_x, train_y.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Grid search over row and column subsampling ratios (whole training set).
tree_param_grid = {'subsample': list(np.linspace(0.5, 1, 6)),        # share of samples drawn per tree, usually 0.5~1
                   'colsample_bytree': list(np.linspace(0.5, 1, 6))  # share of features drawn per tree, usually 0.5~1
                   }
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(train_x, train_y.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Grid search over the L1 / L2 regularisation strengths (whole training set).
tree_param_grid = {'reg_alpha': list(range(1, 10)),                  # L1 regularisation term on the weights
                   'reg_lambda': list(range(1, 10))                  # L2 regularisation term on the weights
                  }
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(train_x, train_y.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Grid search over the random seed (whole training set).
tree_param_grid = {'seed': list(range(100, 1001, 100))               # random seed
                  }
gsearch = GridSearchCV(
    estimator = xgb.XGBClassifier(silent = 1,
                                  scale_pos_weight = weights,
                                  n_jobs = -1,
                                  eval_metric = 'auc'
                                 ),
    param_grid = tree_param_grid,
    # Rank candidates by AUC. eval_metric on the estimator does not affect GridSearchCV,
    # which would otherwise use the estimator's default score.
    scoring = 'roc_auc',
    cv = 5,
    n_jobs = -1)

gsearch.fit(train_x, train_y.ravel())

# Print the score of every candidate (cv_results_ replaces the removed grid_scores_)
print('grid scores :\n')
cv_results = gsearch.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
# Print the best parameters
print('\nbest params : ', gsearch.best_params_)
# Print the best score
print('\nbest score : ', gsearch.best_score_)

In [None]:
# Fit a default-parameter XGBoost classifier on the whole training set.
model2 = xgb.XGBClassifier()
model2.fit(train_x, train_y)
# Predict the test set and save USRID next to the prediction (column RST), tab-separated.
res_pred = model2.predict(test_x)
a = pd.DataFrame(data = res_pred, columns = ['RST'])
res = pd.concat(objs = [UID, a.RST], axis = 'columns')
all_result_csv = path + '/result/all_feat_result.csv'
res.to_csv(all_result_csv, sep = '\t', index = None)
del res_pred

In [None]:
# Visualise a decision tree of model2, the classifier trained on the whole training set
# in this section.
# The local Graphviz install directory is added to PATH only when it exists and is not
# already listed, so re-running the cell does not keep growing PATH.
graphviz_bin = 'D:/Graphviz2.38/bin'
search_path = os.environ.get('PATH', '')
if os.path.isdir(graphviz_bin) and graphviz_bin not in search_path.split(os.pathsep):
    os.environ['PATH'] = search_path + os.pathsep + graphviz_bin
xgb.to_graphviz(model2)

In [None]:
# Learning curve scored by accuracy with 5-fold cross-validation on the whole training set.
# The estimator is referenced through the `xgb` module alias, the only name under which
# xgboost is imported in this notebook.
trainSizes, trainScores, crossValScores = learning_curve(
    xgb.XGBClassifier(),
    train_x,
    train_y,
    cv = 5,
    scoring = 'accuracy'
)

In [None]:
# Plot the learning curve: mean score per training size with a band of one standard
# deviation across the cross-validation folds.
trainScoresMean = np.mean(trainScores, axis = 1)
trainScoresStd = np.std(trainScores, axis = 1)
crossValScoresMean = np.mean(crossValScores, axis = 1)
crossValScoresStd = np.std(crossValScores, axis = 1)

fig = plt.figure(figsize = (12, 8))
plt.fill_between(trainSizes,
                 trainScoresMean - trainScoresStd,
                 trainScoresMean + trainScoresStd,
                 alpha = 0.1,
                 color = 'b')
plt.fill_between(trainSizes,
                 crossValScoresMean - crossValScoresStd,
                 crossValScoresMean + crossValScoresStd,
                 alpha = 0.1,
                 color = 'r')
# Reuse the means computed above instead of recomputing them
plt.plot(trainSizes,
         trainScoresMean,
         'o-',
         label = 'TRAIN',
         color = 'b')
plt.plot(trainSizes,
         crossValScoresMean,
         'o-',
         label = 'CROSS-VAL',
         color = 'r')

# Thicken the frame of the chart
ax = plt.gca()
for axis in ['top', 'bottom', 'left', 'right']:
    ax.spines[axis].set_linewidth(2)

handles, labels = ax.get_legend_handles_labels()
plt.legend(handles,
           ['TRAIN', 'CROSS-VAL'],
           bbox_to_anchor = (0.8, 0.15),
           loc = 2,
           borderaxespad = 0,
           fontsize = 16)
plt.xlabel('TRAIN', size = 16)
# The scores come from learning_curve(..., scoring = 'accuracy'), so the axis is accuracy
plt.ylabel('Accuracy', size = 16)
plt.title('Learning Curves (all features and all samples)', size = 20)

3.2 根据获得的统计数据feature_stats再提取特征

3.2.1 将训练集划分为X_dtrain, X_deval, y_dtrain, y_deval，训练xgboost模型

3.2.2 用所有训练集训练xgboost模型

3.3 利用开源已调参的模型预测结果

In [None]:
# Wrap the data in xgboost's DMatrix container: training inputs with their labels,
# test inputs without.
xgb_train, xgb_test = (
    xgb.DMatrix(train_x, label = train_y),
    xgb.DMatrix(test_x),
)

In [None]:
# Booster parameters
params = {
    'booster': 'gbtree',
    'objective': 'rank:pairwise',   # pairwise ranking objective, used here for the binary FLAG problem
    # 'gamma':0.1,                  # controls post-pruning; larger is more conservative, typically 0.1 or 0.2
    'max_depth': 5,                 # depth of each tree; larger overfits more easily
    # 'lambda':2,                   # L2 regularisation on the weights; larger makes overfitting less likely
    'subsample': 0.7,               # random subsampling of the training samples
    'colsample_bytree': 0.7,        # column subsampling when building a tree
    # min_child_weight defaults to 1: the minimum sum of h (second-order gradient) required
    # in each leaf. For an imbalanced 0-1 classification, assuming h is around 0.01,
    # a value of 1 means a leaf must hold at least 100 samples. The parameter strongly
    # affects the result; the smaller it is, the easier it is to overfit.
    'min_child_weight': 3,
    'silent': 0,                    # 1 suppresses run-time output; 0 is preferred
    'eta': 0.03,                    # acts like a learning rate
    'eval_metric': 'auc',           # evaluation metric
}

num_rounds = 500  # number of boosting iterations
watchlist = [(xgb_train, 'train')]
plst = [(key, value) for key, value in params.items()]

In [None]:
# Train the booster with the parameters above, reporting on the watchlist, then predict
# the test set.
model1 = xgb.train(params = plst,
                   dtrain = xgb_train,
                   num_boost_round = num_rounds,
                   evals = watchlist)
xgb_pred = model1.predict(xgb_test)

In [None]:
# Save USRID next to the booster's prediction (column RST) as a tab-separated file.
# The predictions are held in xgb_pred, the result of model1.predict(xgb_test).
a = pd.DataFrame(xgb_pred, columns = ['RST'])
res = pd.concat([UID, a['RST']], axis = 1)
res.to_csv(path + '/result/exist_result.csv', index = None, sep = '\t')