https://github.com/scikit-learn/scikit-learn/blob/55bf5d9/sklearn/ensemble/weight_boosting.py#L295

In [1]:
import numpy as np
from adacost import AdaCostClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import *


# weights=[0.9, 0.1] gives the proportions of samples labelled 0 and 1 respectively,
# so label 0 is the majority class and label 1 the minority class.
X_, y_ = make_classification(n_samples=5000, n_classes=2, n_features=10,
                             n_informative=8, weights=[0.9, 0.1], random_state=2018)
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=2018)

print('Train:Test = {:}:{:}'.format(len(X_train), len(X_test)))

# Count each class with a vectorized reduction instead of the builtin sum(),
# which would iterate over the NumPy boolean array element by element.
n_neg = np.sum(y_train == 0)
n_pos = np.sum(y_train == 1)

# Number of negative (label 0) samples per positive (label 1) sample.
# The label names the counts in the order they are printed: label 0 first, label 1 second.
ratio = n_neg / n_pos
print('Train_Neg:Train_Pos = {:}:{:} = {:.2f}'.format(n_neg, n_pos, ratio))

Train:Test = 4000:1000
Train_Pos:Train_Neg = 3589:411 = 8.73


In [2]:
# Baseline: scikit-learn's AdaBoost with default hyper-parameters and a fixed seed.
clf_ada = AdaBoostClassifier(random_state=2018).fit(X_train, y_train)

# Scores for the last class column feed the ROC AUC; hard labels feed the other metrics.
y_pred_prob = clf_ada.predict_proba(X_test)[:, -1]
y_pred = clf_ada.predict(X_test)

acc = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1_score1 = f1_score(y_test, y_pred)
auc1 = roc_auc_score(y_test, y_pred_prob)
con_matrix = confusion_matrix(y_test, y_pred)

print('AUC:{:.6f}, F1_score:{:.6f}, Recall:{:.6f}, Acc:{:.6f}'.format(
    auc1, f1_score1, recall, acc))
# Trailing bare expression so the notebook displays the confusion matrix.
con_matrix

AUC:0.755450, F1_score:0.308725, Recall:0.203540, Acc:0.897000


array([[874,  13],
       [ 90,  23]], dtype=int64)

In [3]:
# AdaCost with hand-picked costs (FNcost=1.8, FPcost=1).
# NOTE(review): FNcost / FPcost presumably weight false negatives / false positives;
# confirm against the adacost module.
# random_state is pinned to the same seed as the other experiments in this notebook
# so that rerunning the cell reproduces the same metrics.
clf_ada = AdaCostClassifier(FNcost=1.8, FPcost=1, random_state=2018)
clf_ada.fit(X_train, y_train)
y_pred = clf_ada.predict(X_test)
# Scores for the last class column, used for the ROC AUC.
y_pred_prob = clf_ada.predict_proba(X_test)[:, -1]


auc1 = roc_auc_score(y_test, y_pred_prob)
f1_score1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
con_matrix = confusion_matrix(y_test, y_pred)
print('AUC:{:.6f}, F1_score:{:.6f}, Recall:{:.6f}, Acc:{:.6f}'.format(auc1, f1_score1, recall, acc))
# Trailing bare expression so the notebook displays the confusion matrix.
con_matrix

AUC:0.757849, F1_score:0.468468, Recall:0.460177, Acc:0.882000


array([[830,  57],
       [ 61,  52]], dtype=int64)

In [4]:
# AdaCost with FNcost set to the label-0 : label-1 ratio of the training set.
clf_ada = AdaCostClassifier(random_state=2018, FNcost=ratio)
clf_ada.fit(X_train, y_train)

# Scores for the last class column feed the ROC AUC; hard labels feed the other metrics.
y_pred_prob = clf_ada.predict_proba(X_test)[:, -1]
y_pred = clf_ada.predict(X_test)

con_matrix = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1_score1 = f1_score(y_test, y_pred)
auc1 = roc_auc_score(y_test, y_pred_prob)

print('AUC:{:.6f}, F1_score:{:.6f}, Recall:{:.6f}, Acc:{:.6f}'.format(
    auc1, f1_score1, recall, acc))
# Trailing bare expression so the notebook displays the confusion matrix.
con_matrix

AUC:0.721399, F1_score:0.207743, Recall:0.973451, Acc:0.161000


array([[ 51, 836],
       [  3, 110]], dtype=int64)

In [5]:
import time
from sklearn.model_selection import GridSearchCV
def Grid(model, param, X_train=X_train, y_train=y_train, scoring='roc_auc', cv=3):
    """Run an exhaustive grid search over ``param`` and time it.

    Parameters
    ----------
    model : estimator
        Estimator instance handed to ``GridSearchCV``.
    param : dict
        Parameter grid, passed as ``param_grid``.
    X_train, y_train : array-like
        Data to fit on. The defaults are the notebook-level training split
        as it existed when this function was defined.
    scoring : str
        Scoring passed to ``GridSearchCV`` (default ``'roc_auc'``).
    cv : int
        Cross-validation setting passed to ``GridSearchCV`` (default 3).

    Returns
    -------
    tuple
        ``(best_estimator_, best_score_, best_params_, searchtime)`` where
        ``searchtime`` is the elapsed time of the search in seconds.
    """
    import inspect

    search_kwargs = dict(param_grid=param, cv=cv, scoring=scoring, n_jobs=4)
    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing
    # it raises TypeError. Only forward iid=False to versions that still accept it;
    # newer versions always average fold scores without weighting by fold size.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False

    # perf_counter is a monotonic clock, so the measured duration cannot be
    # distorted by system clock adjustments.
    start = time.perf_counter()
    grid = GridSearchCV(model, **search_kwargs)
    grid.fit(X_train, y_train)
    searchtime = time.perf_counter() - start
    return grid.best_estimator_, grid.best_score_, grid.best_params_, searchtime

In [6]:
# Search a 3x3 grid of AdaCost cost settings, selecting on recall.
# NOTE(review): in the recorded run this picks the largest FNcost and yields
# Acc 0.161 on the test split; selecting on recall alone may not be the intended criterion.
AdaCost_grid = AdaCostClassifier(random_state=2018)
param = {
    'FNcost': np.linspace(1.1, ratio, 3),
    'FPcost': np.linspace(0.5, 1, 3),
}

grid_result = Grid(AdaCost_grid, param, scoring='recall')
# Grid returns (best estimator, best CV score, best params, elapsed seconds).
best_model, best_score, best_params, search_seconds = grid_result
print(best_score, best_params, search_seconds)

# Evaluate the refitted best estimator on the held-out test split.
y_pred_prob = best_model.predict_proba(X_test)[:, -1]
y_pred = best_model.predict(X_test)
con_matrix = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1_score1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_prob)
print('AUC:{:.6f}, F1_score:{:.6f}, Recall:{:.6f}, Acc:{:.6f}'.format(
    auc, f1_score1, recall, acc))

0.9902676399026764 {'FNcost': 8.732360097323602, 'FPcost': 1.0} 52.454869508743286
AUC:0.721399, F1_score:0.207743, Recall:0.973451, Acc:0.161000
