In [None]:
import gc
import os
import random
from collections import defaultdict

import numpy as np
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn import metrics, model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import QuantileTransformer
from tqdm import tqdm
from xgboost import XGBClassifier

# Project-local helpers (presumably where load_pickle / dump_pickle come from -- TODO confirm)
from utils import *

## 参数设定 & 加载必要数据

In [None]:
# Number of cross-validation folds and the seed shared by both RNGs.
N_splits = 5
seed = 42
random.seed(seed)
np.random.seed(seed)

# Keep the samples as a plain Python list while shuffling.
# random.shuffle swaps elements with `x[i], x[j] = x[j], x[i]`; on a
# multi-dimensional NumPy array the two sides are row *views*, so the swap
# silently duplicates some rows and drops others. A list does not have that
# problem, and it also avoids building a ragged object array.
samples = list(load_pickle("data/v3/processed/samples_feature.pkl"))
random.shuffle(samples)

# Each sample unpacks into a (feature_vector, label) pair.
features, labels = zip(*samples)
features = np.array(features, dtype='float')
labels = np.array(labels, dtype='int')

# Later cells index and delete along axis=1, so a 2-D matrix is required.
assert features.ndim == 2 and len(features) == len(labels), \
    "expected a 2-D feature matrix with one label per row"

features.shape

In [None]:
# Columns dropped because of low variance or their effect on the final score
# (per the original note: 78 - 6 = 72 columns remain).
remove_features = [0, 2, 20, 24, 44, 76]
features = np.delete(features, remove_features, axis=1)

# Scaling is done with a QuantileTransformer that maps every column onto a
# normal distribution; the fitted object is kept in `qt` for later reuse.
qt = QuantileTransformer(
    n_quantiles=80,
    output_distribution='normal',
)
transformed_features = qt.fit_transform(features)

In [None]:
# Feature crossing: 16 pairwise products, with the pairs picked according to
# correlation and feature importance. Indices refer to columns of
# `transformed_features`.
_cross_pairs = [
    (36, 10),
    (1, 5),
    (4, 6),
    (35, 32),
    (37, 39),
    (3, 8),
    (50, 52),
    (40, 38),
    (40, 41),
    (50, 53),
    (41, 34),
    (7, 8),
    (34, 37),
    (22, 23),
    (3, 7),
    (18, 10),
]

# One row per crossed feature, i.e. shape (16, n_samples).
cross_features = np.array([
    transformed_features[:, left] * transformed_features[:, right]
    for left, right in _cross_pairs
])

# Append the crossed columns to the transformed ones: 72 + 16 = 88 columns.
features = np.hstack([transformed_features, cross_features.T])

In [None]:
# Display the shape of the combined matrix; 88 columns are expected
# (72 quantile-transformed + 16 crossed) when the cells run in order.
features.shape

In [None]:
# Persist the fitted QuantileTransformer so the same scaling can be re-applied
# outside this notebook (presumably at inference time -- TODO confirm).
qt_filename = "data/v3/processed/qt.pkl"
dump_pickle(qt_filename, qt)

## 模型定义

In [None]:
def create_models(random_state=42, n_estimators=1000):
    """Build fresh, unfitted instances of the four classifiers used in this notebook.

    Args:
        random_state: Seed handed to every model. Defaults to 42, the value
            that used to be hard-coded in each constructor.
        n_estimators: Number of trees / boosting rounds for every model
            (passed to CatBoost as ``iterations``). Defaults to 1000.

    Returns:
        tuple: ``(rf, xgb, lgbm, cat)`` -- RandomForest, XGBoost, LightGBM and
        CatBoost classifiers, in that order.
    """
    # Random forest
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion='entropy',
        max_features=25,
        bootstrap=True,
        random_state=random_state,
        warm_start=False,
        class_weight=None,
        n_jobs=-1,
    )
    # XGBoost
    xgb = XGBClassifier(
        n_estimators=n_estimators,
        booster='gbtree',
        max_depth=10,
        reg_lambda=1,
        subsample=0.5,
        gamma=0.5,
        colsample_bytree=0.75,
        min_child_weight=2,
        learning_rate=0.25,
        n_jobs=-1,
        random_state=random_state,
    )
    # LightGBM
    lgbm = LGBMClassifier(
        max_depth=5,
        learning_rate=0.1,
        n_estimators=n_estimators,
        objective='binary',
        subsample=0.8,
        n_jobs=-1,
        num_leaves=30,
        colsample_bytree=0.75,
        random_state=random_state,
    )
    # CatBoost (prints a progress line every 100 iterations)
    cat = CatBoostClassifier(
        iterations=n_estimators,
        learning_rate=0.1,
        max_depth=7,
        verbose=100,
        task_type='CPU',
        eval_metric='AUC',
        random_state=random_state,
        thread_count=-1,
    )

    return rf, xgb, lgbm, cat

## 特征归一化(全局 mean/std)并保存归一化参数

In [None]:
# Simple normalization. Both statistics are single scalars computed over every
# entry of the matrix (no axis argument), not per-column values.
# NOTE(review): they are computed on the full data set before the CV split --
# confirm this is intended.
mean = np.mean(features)
std = np.std(features)
features = np.divide(np.subtract(features, mean), std)

In [None]:
# Save the normalization statistics (mean and std) next to the other
# processed artifacts.
mean_path = "data/v3/processed/mean.pkl"
std_path = "data/v3/processed/std.pkl"
for _stat_path, _stat_value in ((mean_path, mean), (std_path, std)):
    dump_pickle(_stat_path, _stat_value)

In [None]:
# Display the shape of the normalized feature matrix (the normalization is
# elementwise, so it should match the shape shown before it).
features.shape

## 划分样本  种子固定 5折交叉验证

In [None]:
%%time
model_save_path = "data/v3/processed/models/"

#%%time
xSample = features
ySample = labels

rf_preds_all = []
xgb_preds_all = []
lgbm_preds_all = []
cat_preds_all = []

metrics_dict = {"rf": {}, "xgb": {}, "lgbm": {}, "cat": {}}
for model in metrics_dict.keys():
    metrics_dict[model] = {"AUC":0.0, "ACC":0.0, "Recall":0.0, "F1-score":0.0, "Precesion":0.0}

kf = KFold(n_splits = N_splits)
num = 0 
for train_index, valid_index in kf.split(xSample):
    
    train_X, train_y = xSample[train_index], ySample[train_index]
    valid_X, valid_y = xSample[valid_index], ySample[valid_index]
    
    
    rf,xgb,lgbm,cat = create_models()
    
    rf.fit(train_X, train_y)
    xgb.fit(train_X, train_y)
    lgbm.fit(train_X, train_y)
    cat.fit(train_X, train_y)
    
    rf_preds = rf.predict_proba(valid_X)[:,1]
    xgb_preds = xgb.predict_proba(valid_X)[:,1]
    lgbm_preds = lgbm.predict_proba(valid_X)[:,1]
    cat_preds = cat.predict_proba(valid_X)[:,1]
    
    rf_preds_all.append(rf_preds)
    xgb_preds_all.append(xgb_preds)
    lgbm_preds_all.append(lgbm_preds)
    cat_preds_all.append(cat_preds)
    
    
    rf_pred_labels = (rf_preds >= 0.5) * 1
    xgb_pred_labels = (xgb_preds >= 0.5) * 1
    lgbm_pred_labels = (lgbm_preds >= 0.5) * 1
    cat_pred_labels = (cat_preds >= 0.5) * 1
    
    for model in metrics_dict.keys():
        metrics_dict[model]["AUC"] += metrics.roc_auc_score(valid_y, eval(f'{model}_pred_labels')) / N_splits
        metrics_dict[model]["ACC"] += metrics.accuracy_score(valid_y, eval(f'{model}_pred_labels')) / N_splits
        metrics_dict[model]["Recall"] += metrics.recall_score(valid_y, eval(f'{model}_pred_labels')) / N_splits
        metrics_dict[model]["F1-score"] += metrics.f1_score(valid_y, eval(f'{model}_pred_labels')) / N_splits
        metrics_dict[model]["Precesion"] += metrics.precision_score(valid_y, eval(f'{model}_pred_labels')) / N_splits
    
    
    dump_pickle(model_save_path + f"model_{num}_rf.dat", rf)
    dump_pickle(model_save_path + f"model_{num}_xgb.dat", xgb)
    dump_pickle(model_save_path + f"model_{num}_lgbm.dat", lgbm)
    dump_pickle(model_save_path + f"model_{num}_cat.dat", cat)
    
    del rf,xgb,lgbm,cat
    _ = gc.collect()
    
    num = num + 1