In [2]:
import pandas as pd 
import numpy as np
from tqdm import tqdm
import os
import sys

In [2]:
import matplotlib.pyplot as plt
# Set the default matplotlib figure size to 10 x 6 for every figure in this notebook.
plt.rc('figure', figsize=(10, 6))

In [13]:
# Switch the working directory to the data folder so that the relative
# './*.csv' paths used by the later cells resolve.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves one level further and will likely fail.
os.chdir('../00_data/')

In [14]:
# List the files in the current (data) directory via IPython's `ls` automagic.
ls

atec_anti_fraud_test_b.csv       df_50w.csv        submit_0212C.csv
atec_anti_fraud_test_b_demo.csv  df_test_50w.csv   submit_0212D.csv
atec_anti_fraud_train.csv        df_test.zip       train_head_10w.csv
atec_anti_fraud_train_demo.csv   submit_0211A.csv  train_modified.csv
df_10w.csv                       submit_0212A.csv
df_1m.csv                        submit_0212B.csv


# Load data and split train/val

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Suppress scikit-learn's DataConversionWarning messages in the cell outputs.
from sklearn.exceptions import DataConversionWarning
import warnings
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [11]:
# df = pd.read_csv('./df_10w.csv')
# X = df.iloc[:,3:]  # start from f1
# y = df.iloc[:,1]     # label

In [12]:
# Split into training and test sets
# test_ratio = 0.2   # adjust according to the total number of samples
# X_train, X_test, y_train, y_test = \
#     train_test_split(X, y, test_size=test_ratio, 
#                      stratify=y, random_state=0)

# PCA

- 利用PCA对数据降维，get_pca_order输出需要保留的主成分阶数
- 试用效果不好，最终未采用

In [3]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA

In [4]:
# def get_pca_order(X):
#     # Compute the PCA order corresponding to a cumulative explained variance of 0.90 / 0.95
#     sc = StandardScaler()
#     pca = PCA()
#     X_std = sc.fit_transform(X)
#     X_pca = pca.fit_transform(X_std)
#     pca_sum = np.cumsum(pca.explained_variance_ratio_)

#     print('前{}阶主成分,累计解释方差={}'.format(sum(pca_sum < 0.9),0.9))
#     print('前{}阶主成分,累计解释方差={}'.format(sum(pca_sum < 0.95),0.95))

In [5]:
# get_pca_order(X_train)

# GBDT

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

In [9]:
# Basic implementation (kept for reference, not executed)
# pipe_gb = make_pipeline(StandardScaler(),
#                         GradientBoostingClassifier(random_state=10))
# pipe_gb.fit(X_train, y_train)

# y_train_pred = pipe_gb.predict(X_train)
# y_test_pred = pipe_gb.predict(X_test)

# Model evaluation util

In [18]:
from sklearn.metrics import roc_curve, auc

In [21]:
def show_metric(X_data, y_gt, pipe):
    """Print classification metrics of a fitted model on one dataset.

    Args:
        X_data: Feature matrix accepted by ``pipe.predict`` and ``pipe.score``.
        y_gt: Ground-truth labels. A pandas Series (its ``.values`` are used)
            or any other array-like.
        pipe: Fitted estimator or pipeline exposing ``predict`` and ``score``.

    Returns:
        None. The accuracy, per-class recall, F1 score and confusion matrix
        are printed to stdout.
    """
    # Accept both pandas objects and plain array-likes for the labels.
    y_true = getattr(y_gt, 'values', y_gt)

    # Predict with the model that was passed in, so the printed metrics always
    # describe the same estimator that ``pipe.score`` evaluates below.
    y_pred = pipe.predict(X_data)

    acc = pipe.score(X_data, y_gt)
    recall = metrics.recall_score(y_true, y_pred, average=None)  # one value per class
    cnf_mat = metrics.confusion_matrix(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)

    print('metrcis：')
    print('Accuracy: %.3f' % acc)
    print('Recall of classes: ', recall)
    print('f1 score \t: ', f1)
    print('confusion matrix:')
    print(cnf_mat)

In [19]:
def get_tpr(X_data, y_gt, pipe):
    """Compute the competition score of ``pipe`` on one dataset.

    The score is a weighted sum of true-positive rates read off the ROC curve
    at three fixed false-positive rates:

        score = 0.4 * TPR(FPR=0.001) + 0.3 * TPR(FPR=0.005) + 0.3 * TPR(FPR=0.01)

    Each TPR is taken at the ROC point whose FPR is nearest to the target
    value (no interpolation).

    Args:
        X_data: Feature matrix accepted by ``pipe.predict_proba``.
        y_gt: Ground-truth labels, with 1 as the positive class.
        pipe: Fitted estimator or pipeline exposing ``predict_proba``.

    Returns:
        A ``(tpr_dic, final_score)`` tuple, where ``tpr_dic`` maps
        ``'r1'``/``'r2'``/``'r3'`` to the TPR at each FPR level.
    """
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = pipe.predict_proba(X_data)[:, 1]
    fpr, tpr, _ = roc_curve(y_gt, positive_scores, pos_label=1)

    fpr_targets = {'r1': 0.001, 'r2': 0.005, 'r3': 0.01}
    tpr_dic = {}
    for level_name, target in fpr_targets.items():
        nearest = np.argmin(np.abs(fpr - target))
        tpr_dic[level_name] = tpr[nearest]

    final_score = 0.4*tpr_dic['r1'] + 0.3*tpr_dic['r2'] + 0.3*tpr_dic['r3']

    print('tpr at each level: ')
    print(tpr_dic)
    print('final score \t: {} '.format(final_score))
    return tpr_dic, final_score

# Grid Search

- 用10w训练数据进行Grid Search
- Search参数：max_depth、n_estimators

In [None]:
from sklearn.model_selection import GridSearchCV

In [15]:
# Read the full training set, keep a reproducible 100k-row subsample (drawn
# without replacement) for the grid search, then drop the full frame to
# release its memory.
df_all = pd.read_csv('./df_1m.csv')
df = df_all.sample(n=100000, replace=False, random_state=0)
del df_all

In [None]:
# Split the subsample into training and held-out sets (70% / 30%).
# Column 1 is used as the label and columns from index 3 onwards as features.
# Stratify on the label column itself: no standalone `y` variable is defined
# in this notebook, so it cannot be referenced here.
X_train, X_test, y_train, y_test = \
    train_test_split(df.iloc[:, 3:],
                     df.iloc[:, 1],
                     test_size=0.3,
                     stratify=df.iloc[:, 1], random_state=0)

In [None]:
# The grid search below does not use a pipeline, so the features are
# standardized up front. The scaler is fitted on the training split only and
# its statistics are reused for the held-out split, so both are scaled
# consistently and nothing from the held-out data leaks into the scaling.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# Hyper-parameter grid: 3 values of n_estimators x 4 values of max_depth,
# i.e. 12 combinations in total.
param_grid = {
    'n_estimators': range(100, 201, 50),  # 100, 150, 200
    'max_depth': range(3, 7, 1),          # 3, 4, 5, 6
}
gs = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=10),
    param_grid=param_grid,
    scoring='f1',  # pick the best combination by F1 score
    cv=3,          # 3-fold cross-validation
    n_jobs=-1,     # run the fits in parallel
)

In [None]:
# Run the grid search on the standardized training data, then report the best
# cross-validated score and the parameter combination that achieved it.
gs = gs.fit(X_train_std, y_train)
print(gs.best_score_)
print(gs.best_params_)  # print the best parameters

# 正式训练

- 用更大的训练集训练模型
- 输出模型的性能指标

In [None]:
df = pd.read_csv('./df_1m.csv') # full training set

In [None]:
# Split the full training data into training and held-out sets (70% / 30%).
# Column 1 is used as the label and columns from index 3 onwards as features.
# Stratify on the label column itself: no standalone `y` variable is defined
# in this notebook, so it cannot be referenced here.
X_train, X_test, y_train, y_test = \
    train_test_split(df.iloc[:, 3:],
                     df.iloc[:, 1],
                     test_size=0.3,
                     stratify=df.iloc[:, 1], random_state=0)
del df  # only the four splits are needed from here on

In [None]:
# Final model: standardization followed by gradient boosting.
# Replace max_depth / n_estimators with the best values found by the grid search.
pipe_gb = make_pipeline(StandardScaler(),
                        GradientBoostingClassifier(max_depth=4,
                                                   n_estimators=200,
                                                   tol=0.0001,
                                                   verbose=10,  # print detailed training progress
                                                   random_state=10))

In [None]:
pipe_gb.fit(X_train, y_train)  # train the model

## 输出分类指标

In [None]:
y_train_pred = pipe_gb.predict(X_train)  # predicted labels on the training split
# Print accuracy, recall, F1 and the confusion matrix for the training split.
show_metric(X_train, y_train, pipe_gb)

In [None]:
# Predicted labels on the held-out split, followed by the same metric report
# that is printed for the training split, so the two can be compared.
y_test_pred = pipe_gb.predict(X_test)
show_metric(X_test, y_test, pipe_gb)

## 输出比赛score

- 对比训练集和验证集的score可以判断过拟合的程度

In [None]:
# Competition score on the training split; `p` holds the (tpr_dic, final_score) tuple.
p = get_tpr(X_train, y_train, pipe_gb)

In [None]:
# Competition score on the held-out split; this rebinds `p`, replacing the training result.
p = get_tpr(X_test, y_test, pipe_gb)

# Infer test dataset

- 用GBDT模型对test数据预测
- 输出可提交格式的csv文件

In [28]:
# Load the test set that will be scored for submission.
# NOTE(review): the directory listing earlier in this notebook shows
# df_test.zip and df_test_50w.csv but no df_test.csv — confirm the archive
# has been extracted before running this cell.
df_test = pd.read_csv('./df_test.csv')

In [29]:
def infer_testset_save(df, pipe=None):
    """Score a test DataFrame and build the submission table.

    Despite the name, nothing is written to disk here; the caller saves the
    returned frame.

    Args:
        df: Test data with an ``'id'`` column. Columns from index 2 onwards
            are passed to the model as features.
            NOTE(review): presumably the first two columns are id/date-like
            non-feature columns — confirm against the test file.
        pipe: Fitted model exposing ``predict_proba``. Defaults to the
            notebook-level ``pipe_gb``.

    Returns:
        A DataFrame with one row per row of ``df`` and two columns: ``id``
        (copied from ``df``) and ``score`` (column 1 of ``predict_proba``).

    Raises:
        AssertionError: If the number of predictions or of output rows does
            not match the number of rows in ``df``.
    """
    if pipe is None:
        pipe = pipe_gb  # model fitted in the training section of the notebook

    n_rows = df.shape[0]
    features = df.iloc[:, 2:]
    test_score = pipe.predict_proba(features)
    assert test_score.shape[0] == n_rows

    # Take the ids from the frame that was actually scored (the `df` argument),
    # so ids and scores always refer to the same rows.
    df_submit = pd.DataFrame({'id': df.loc[:, 'id'],
                              'score': test_score[:, 1]})

    assert df_submit.shape[0] == n_rows
    return df_submit

In [30]:
# Score the loaded test set and build the id/score submission table.
df_submit = infer_testset_save(df_test)

In [31]:
# Write the submission file without the index column.
# NOTE(review): a file named submit_0212A.csv already appears in the directory
# listing earlier in this notebook; this call overwrites it.
df_submit.to_csv('./submit_0212A.csv', index=False)