In [299]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [300]:
def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    print(df['label'].value_counts())
    data = np.array(df.iloc[:100, [0, 1, -1]])
    for i in range(len(data)):
        if data[i, -1] == 0:
            data[i, -1] = -1
    return data[:, :2], data[:, -1]

In [301]:
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

2    50
1    50
0    50
Name: label, dtype: int64


In [302]:
clf = GradientBoostingClassifier(n_estimators=10)
clf.fit(X_train, y_train)
# 参数含义：
# criterion：样本切分策略，friedman_mse是最小平方误差的近似
# subsample：选取多少数据进行boosting
# loss：对数似然损失deviance和指数损失函数exponential（相当于Adaboost）
# n_estimators：基分类器数量
y_pred = clf.predict(X_test)

In [303]:
f1_score(y_test, y_pred)

1.0

## 参数调优

In [304]:
# 参数调整：
# 1、n_estimators and learning rate
# 2、max_depth and min_samples_split

In [305]:
param_grid = {
    'n_estimators':range(100, 801, 200),
    'min_samples_split':range(800, 1900, 200),
    'min_samples_leaf':range(60, 101, 10)
}
grid_search = GridSearchCV(clf, param_grid, verbose=1)
grid_search.fit(X_train, y_train)
best_param = grid_search.best_estimator_.get_params()
print(grid_search.best_params_)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
{'min_samples_leaf': 60, 'min_samples_split': 800, 'n_estimators': 100}


[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:   19.9s finished


In [306]:
clf = GradientBoostingClassifier(n_estimators=100)