In [502]:
from random import shuffle

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

In [503]:
# Load the iris dataset here: no earlier cell defines `iris`, so the
# original cell raised NameError on a fresh kernel.
iris = load_iris()
X, y = iris.data, iris.target
# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## 解决多分类问题

In [504]:
# Multiclass setup: wrap XGBoost in a one-vs-rest meta-estimator and train it.
clf = OneVsRestClassifier(estimator=XGBClassifier())
# Kept as the last expression so the fitted estimator is displayed.
clf.fit(X=X_train, y=y_train)

OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
       colsample_bynode=None, colsample_bytree=None, gamma=None,
       gpu_id=None, importance_type='gain', interaction_constraints=None,
       learning_rate=None, max_delta_step=None, max_depth=None,
       min_child_w..._pos_weight=None, subsample=None,
       tree_method=None, validate_parameters=None, verbosity=None),
          n_jobs=1)

In [505]:
# Score the one-vs-rest model on the held-out split with micro-averaged F1.
y_pred = clf.predict(X_test)
micro_f1 = f1_score(y_test, y_pred, average='micro')
print(micro_f1)

0.9333333333333333


## 解决二分类问题

In [506]:
def create_data():
    """Build a two-feature, two-class dataset from the iris data.

    Takes the first 100 rows of the iris table (the original code expects
    labels 0 and 1 there), keeps the first two feature columns plus the
    label, and shuffles the rows.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is the
        ``(100, 2)`` slice of the first two columns and ``labels`` is the
        matching label column, with values 0 and 1.
    """
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    data = np.array(df.iloc[:100, [0, 1, -1]])
    # random.shuffle swaps rows through ndarray views, which duplicates and
    # drops rows of a 2-D array; np.random.shuffle permutes rows correctly.
    np.random.shuffle(data)
    # Labels are left as 0/1. The previous 0 -> 1 remap turned every label
    # into 1, leaving a single class and making any score trivially perfect.
    return data[:, :2], data[:, -1]

In [507]:
# Build the binary dataset and keep 20% of it aside for testing.
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

In [508]:
# Baseline: XGBoost with library defaults, trained on the binary split.
clf = XGBClassifier()
# Kept as the last expression so the fitted estimator is displayed.
clf.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [509]:
# 参数设置：
# （1）learning_rate：通常设置为0.01-0.1，减少同时增加树的数量
#     n_estimators：模型中基模型的数量
# （2）max_depth：树的最大深度，可以尝试range(3, 10, 2)
#     min_child_weight：子结点所需的最小样本权重（二阶导数）之和，小于该值则不再分裂，可以尝试range(1, 6, 2)
# （3）gamma：最小划分损失，划分之后损失函数降低的值，大于gamma继续分裂
# （4）subsample：训练样本采样比例，可以尝试0.6-0.9
#     colsample_bytree：对特征的采样比例，可以尝试0.6-0.9
# 过拟合解决方法：
# （1）降低模型复杂度：树的深度，结点分裂后权重和，最小划分损失
# （2）增加模型的随机性：降低subsample和colsample_bytree，即每棵树只随机使用部分样本和特征
# （3）直接减小学习率，同时增加树的个数

In [510]:
# Tune the learning rate and the number of boosting rounds with 5-fold CV.
# GridSearchCV is imported from sklearn.model_selection in the import cell.
param_grid = {
    # Rounded linspace gives exactly 0.01, 0.02, ..., 0.09. np.arange with a
    # float step has a rounding-dependent length and inexact grid values.
    'learning_rate': np.round(np.linspace(0.01, 0.09, 9), 2),
    # NOTE(review): 3-9 trees is very few for learning rates this small; this
    # range looks like the one suggested for max_depth -- confirm intent.
    'n_estimators': range(3, 10, 2)
}
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Full parameter dict of the best estimator; later cells read the tuned keys.
best_param = grid_search.best_estimator_.get_params()
print(grid_search.best_params_)

{'learning_rate': 0.01, 'n_estimators': 3}


In [511]:
# Refit a fresh classifier using only the two hyper-parameters that were tuned.
tuned_keys = ('learning_rate', 'n_estimators')
clf = XGBClassifier(**{key: best_param[key] for key in tuned_keys})
# Kept as the last expression so the fitted estimator is displayed.
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.01, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=3, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [512]:
# Evaluate the tuned model on the held-out split using f1_score's defaults.
y_pred = clf.predict(X_test)
binary_f1 = f1_score(y_test, y_pred)
print(binary_f1)

1.0
