In [None]:
import pandas as pd

# Load the dataset from ../data/clear_data.csv (the path is resolved relative to
# the current working directory). Later cells use the whole frame as features.
data = pd.read_csv('../data/clear_data.csv')

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import time
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score


def print_precison_recall_f1(y_true, y_pred):
    """Compute five classification scores for a set of predictions.

    Despite the name, nothing is printed: the scores are returned to the caller.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels. The same values are also handed to
            ``roc_auc_score``, so the AUC is computed from whatever is passed
            here (the callers in this notebook pass rounded model outputs).

    Returns:
        list: ``[accuracy, recall, f1, roc_auc, precision]``. Recall, F1 and
        precision use ``average='macro'``.
    """
    macro = {'average': 'macro'}
    scores = [
        accuracy_score(y_true, y_pred),            # accuracy
        recall_score(y_true, y_pred, **macro),     # macro recall
        f1_score(y_true, y_pred, **macro),         # macro F1
        roc_auc_score(y_true, y_pred),             # ROC AUC
        precision_score(y_true, y_pred, **macro),  # macro precision
    ]
    return scores


In [None]:
# Scratch check: macro-averaged precision for a single sample whose prediction
# matches its label.
precision_score([1], [1], average='macro')


In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import time
# Every column of the loaded frame is used as a feature.
X = data
# Labels are assigned purely by row position: 70 zeros followed by 30 ones.
# NOTE(review): this assumes `data` has exactly 100 rows in that order — confirm
# against the CSV.
y = 70*[0]+30*[1]
if len(X) != len(y):
    raise ValueError(
        'data has {} rows but {} hard-coded labels were built'.format(len(X), len(y)))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

# perf_counter is monotonic and high-resolution, which suits elapsed-time measurement.
time_start = time.perf_counter()
lgbparams = {'num_leaves': 60,  # strong effect on the result; larger fits more, too large overfits
            'min_data_in_leaf': 30,
            'objective': 'binary',  # objective function
            'max_depth': -1,
            'learning_rate': 0.03,
            "min_sum_hessian_in_leaf": 6,
            "boosting": "gbdt",
            "feature_fraction": 0.9,  # fraction of features sampled per tree
            "bagging_freq": 1,
            "bagging_fraction": 0.8,
            "bagging_seed": 11,
            "lambda_l1": 0.1,  # L1 regularisation
            # 'lambda_l2': 0.001,     # L2 regularisation
            "verbosity": -1,
            "nthread": -1,  # number of threads; -1 lets LightGBM choose
            # A list (not a set) keeps the metric order stable from run to run.
            'metric': ['binary_logloss', 'auc'],  # evaluation metrics
            "random_state": 2019,  # fixed seed for repeatable runs
            # 'device': 'gpu'  # can speed things up with a GPU build of LightGBM
            }
train_set = lgb.Dataset(X_train, y_train)
val_set = lgb.Dataset(X_test, y_test, reference=train_set)

# Early stopping is requested through a callback: the `early_stopping_rounds`
# and `verbose_eval` keyword arguments were removed from lgb.train in
# LightGBM 4.0. verbose=False keeps the callback silent.
# NOTE(review): the held-out split drives early stopping AND is scored below,
# so the reported metrics are optimistic.
lgbmodel = lgb.train(lgbparams, train_set, num_boost_round=3000,
                     valid_sets=[train_set, val_set],
                     callbacks=[lgb.early_stopping(stopping_rounds=500, verbose=False)])

pred = lgbmodel.predict(X_test, predict_disable_shape_check=True)
time_end = time.perf_counter()
print('time = {}s'.format(time_end-time_start))
# Predicted probabilities are rounded to hard 0/1 labels before scoring.
print_precison_recall_f1(y_test, np.around(pred))

In [None]:
# Train XGBoost on the split produced by the LightGBM cell (X_train / X_test /
# y_train / y_test must already exist) and score it on the held-out rows.
time_start = time.perf_counter()  # monotonic clock for elapsed-time measurement
xgbparams = {'booster': 'gbtree',
             'objective': 'binary:logistic',
             'eval_metric': 'auc',
             'max_depth': 4,
             'lambda': 10,
             'subsample': 0.75,
             'colsample_bytree': 0.75,
             'min_child_weight': 2,
             'eta': 0.025,
             'seed': 0,
             'nthread': 8,
             # `silent` is no longer an XGBoost parameter and only triggers an
             # "unused parameter" warning; `verbosity: 0` is the replacement.
             'verbosity': 0}

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)
watchlist = [(dtrain, 'train')]
bst = xgb.train(xgbparams, dtrain, num_boost_round=5, evals=watchlist)
pred = bst.predict(dtest)
time_end = time.perf_counter()
print('time = {}s'.format(time_end-time_start))
# Predicted probabilities are rounded to hard 0/1 labels before scoring.
print_precison_recall_f1(y_test, np.around(pred))


In [None]:
# Fit a CatBoost regressor (MAE loss) on the 0/1 labels and score it like the
# other models. Relies on X_train / X_test / y_train / y_test from the
# LightGBM cell.
time_start = time.perf_counter()  # monotonic clock for elapsed-time measurement
catmodel = CatBoostRegressor(
    iterations=3000, learning_rate=0.03,
    depth=7,
    l2_leaf_reg=4,
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=2021)
# fit() is assigned to catmodel2 as in the original cell so both names stay available.
catmodel2 = catmodel.fit(X_train, y_train, verbose=False)
pred = catmodel2.predict(X_test)

time_end = time.perf_counter()
print('time = {}s'.format(time_end-time_start))
# A regressor's output is not bounded to [0, 1]; without clipping, rounding
# could produce labels such as -1 or 2 and break the binary metrics.
pred_labels = np.around(np.clip(pred, 0, 1))
print_precison_recall_f1(y_test, pred_labels)


# 输出100次实验的评价指标

In [None]:
# Repeat the XGBoost fit 100 times on the full dataset and collect
# [acc, recall, f1, auc, precision, elapsed_ms] for each run.

# The hyper-parameters and the data are identical for every run, so they are
# built once outside the loop.
xgbparams = {'booster': 'gbtree',
             'objective': 'binary:logistic',
             'eval_metric': 'auc',
             'max_depth': 4,
             'lambda': 10,
             'subsample': 0.75,
             'colsample_bytree': 0.75,
             'min_child_weight': 2,
             'eta': 0.025,
             # NOTE(review): with a fixed seed every run trains the same model,
             # so only the timing column varies between runs — confirm intended.
             'seed': 0,
             'nthread': 8,
             # `silent` is no longer an XGBoost parameter; `verbosity` replaces it.
             'verbosity': 0}
X = data
# Positional labels: 70 zeros followed by 30 ones (assumes a 100-row frame).
Y = 70*[0]+30*[1]

df_list = []
for run_idx in range(100):
    # The timed region covers DMatrix construction, training and prediction,
    # as in the original cell. perf_counter gives reliable millisecond-scale
    # readings.
    time_start = time.perf_counter()
    dtrain = xgb.DMatrix(X, label=Y)
    watchlist = [(dtrain, 'train')]
    # verbose_eval=False: the eval metric is still computed each round, but the
    # 5 log lines per run (500 in total) are not printed.
    bst = xgb.train(xgbparams, dtrain, num_boost_round=5, evals=watchlist,
                    verbose_eval=False)
    # NOTE(review): predictions are made on the training data itself, so these
    # are in-sample scores.
    pred = bst.predict(dtrain)
    time_end = time.perf_counter()
    elapsed = time_end - time_start
    print('time = {}s'.format(elapsed))
    result = print_precison_recall_f1(Y, np.around(pred))
    result.append(elapsed*1000)  # seconds -> milliseconds
    df_list.append(result)
    

In [None]:
# One row per run. The column order follows the list built in the loop:
# [acc, recall, f1, auc, p] from print_precison_recall_f1 plus the elapsed
# milliseconds appended afterwards.
# NOTE(review): the values are stored exactly as the metric functions return
# them (no x100 scaling is applied here) even though some labels contain '%'.
df_result = pd.DataFrame(
    data=df_list,
    columns=['准确率%', '召回率%', 'F1值', 'AUC', '精准率%', '耗时/ms'],
)

In [None]:
# Reorder the columns for the report: the timing column moves to third place.
df_data = df_result.loc[:, ['准确率%', '召回率%', '耗时/ms', 'F1值', 'AUC', '精准率%']]
df_data


In [None]:
from pathlib import Path

# to_excel does not create missing directories, so make sure ./submit exists
# before writing; otherwise a fresh checkout fails at this cell.
output_path = Path('./submit/附表4.xlsx')
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False (the documented boolean form of the original index=0) omits the
# row index from the sheet.
df_data.to_excel(output_path, index=False)
