# 自训练半监督

https://blog.csdn.net/vivian_ll/article/details/103494042?utm_medium=distribute.pc_relevant.none-task-blog-title-2&spm=1001.2101.3001.4242

下面以XGBoost作为基分类器为例，自己写一个自训练半监督算法：

数据全部是有标注的，从中随机挑选一定比例（ratio）的数据丢弃标注，作为无标注数据。

In [None]:
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score
from sklearn.externals import joblib

import warnings
# Silence every warning raised by the libraries used in this script.
warnings.filterwarnings("ignore")


def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` is a trap with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy, so the flag could never be
    turned off. This converter accepts the usual spellings instead.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string stays False, as it was under the old ``type=bool``.
    if token in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % value)


# Command-line interface of the script; defaults reproduce the original experiment.
parser = argparse.ArgumentParser(description='BMD prediction')
parser.add_argument('--classifier-name', type=str, default='xgboost', metavar='S',
                    help='model name')
parser.add_argument('--dataset', type=str, default='../data/newdata/largedata.csv', metavar='DATASET',
                    help='dataset path')
parser.add_argument('--feature-columns', nargs='+', default=['sex','year','height','weight'],
                    help='columns of features')
parser.add_argument('--label-column', type=str, default='label',
                    help='column of label')
parser.add_argument('--cat-numbers', type=int, default=2,
                    help='Number of categories')
parser.add_argument('--model-name', type=str, default='xgboost', choices=['xgboost','svm'],
                    help='name of classifier')
parser.add_argument('--save-path', type=str, default='../model/classifiers/xgboost_label_gridSearch.joblib.dat',
                    help='path of saved model')
parser.add_argument('--grid-search', type=_str2bool, default=True,
                    help='adjust parameters with grid search')

def read_data(args):
    """Load the dataset CSV and keep only the rows that have a label.

    Args:
        args: parsed options; reads ``args.dataset`` (path of a GBK-encoded
            CSV file) and ``args.label_column`` (name of the label column).

    Returns:
        pandas.DataFrame with every row whose label is missing removed. The
        original row index is kept (it is not renumbered).
    """
    data = pd.read_csv(args.dataset, encoding='gbk')

    # One vectorised pass instead of dropping row by row: each DataFrame.drop
    # copies the frame, and np.isnan raises TypeError on non-numeric labels.
    return data.dropna(subset=[args.label_column])

def split(args,data):
    """Split the data 80/20 into train and test sets and standardise the features.

    Args:
        args: parsed options; reads ``args.feature_columns`` and
            ``args.label_column``.
        data: DataFrame containing those columns.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The feature parts are the
        scaled outputs of ``StandardScaler``; the label parts are returned as
        produced by ``train_test_split``.
    """
    feature_names = args.feature_columns
    label_name = args.label_column

    features = data[feature_names]
    label = data[label_name]
    print(feature_names)
    print(label_name)

    parts = train_test_split(features, label, test_size=0.2, random_state=77)
    X_train, X_test, y_train, y_test = parts

    # The scaler is fitted on the training split only and then applied to both splits.
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test

def _evaluate(clf, X_fit, y_fit, X_test, y_test):
    """Score a fitted classifier on the data it was fitted on and on the test set.

    Returns:
        Tuple ``(fit_accuracy, test_accuracy, test_auc, test_f1)``.
    """
    test_pred = clf.predict(X_test)  # predicted once, shared by accuracy and F1
    score = accuracy_score(y_fit, clf.predict(X_fit))
    test_score = accuracy_score(y_test, test_pred)
    # AUC is a ranking metric, so it is fed the positive-class probability;
    # hard 0/1 predictions would reduce the ROC curve to one operating point.
    auc_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    f1 = f1_score(y_test, test_pred)
    return score, test_score, auc_score, f1


def _pseudo_label(clf, X):
    """Return (predicted labels, confidence of the predicted class) for X.

    An empty pool yields two empty arrays without calling the classifier.
    """
    if len(X) == 0:
        return np.empty(0, dtype=int), np.empty(0)
    # The confidence is the larger of the per-class probabilities.
    return clf.predict(X), clf.predict_proba(X).max(axis=1)


def selfTraining(args, X_train, X_test, y_train, y_test, *,
                 ratio=0.1, max_iter=500, prob_threshold=0.8, random_state=10):
    """Run self-training with an XGBoost base classifier and print the metrics.

    All training data is labelled; a random fraction of it has its labels
    hidden to play the role of unlabelled data. A fully supervised baseline
    is fitted first. Then the model is fitted on the labelled part and, in
    each round, unlabelled samples predicted with confidence above
    ``prob_threshold`` are moved into the labelled set with their predicted
    labels and the model is refitted.

    Args:
        args: parsed options (not read by this function).
        X_train, X_test: 2-D feature arrays.
        y_train, y_test: 1-D binary label arrays.
        ratio: probability with which each training label is hidden.
        max_iter: maximum number of self-training rounds.
        prob_threshold: minimum confidence for accepting a pseudo-label.
        random_state: seed of the generator that picks the hidden labels.

    Returns:
        None. Results are reported through ``print``.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    rng = np.random.RandomState(random_state)
    # A boolean mask marks the hidden labels, so no sentinel value (such as
    # -1) has to be written into the label array where it could collide with
    # a real class.
    hidden = rng.rand(len(y_train)) < ratio  # rng.rand() draws uniform samples in [0, 1)

    unlabeledX = X_train[hidden, :]
    labeledX = X_train[~hidden, :]
    labeledY = y_train[~hidden]

    model = XGBClassifier(
        learning_rate=0.02,
        n_estimators=90,
        max_depth=4,
        min_child_weight=1,
        gamma=0,
        subsample=1,
        colsample_bytree=1,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27
    )

    # Fully supervised baseline: every training label is used.
    clf = model.fit(X_train, y_train)
    score, test_score, auc_score, f1 = _evaluate(clf, X_train, y_train, X_test, y_test)
    print("Training data size=", len(y_train), "Labeld accuracy= %.4f" % score, " , Unlabeled ratio=", 0,
          "ACC: %.4f" % test_score, 'AUC: %.4f' % auc_score, 'F1-score: %.4f' % f1)

    # Semi-supervised: start from the labelled subset only.
    clf = model.fit(labeledX, labeledY)
    unlabeledY, unlabeledProb = _pseudo_label(clf, unlabeledX)
    print(unlabeledY)
    print(unlabeledProb)
    unlabeled_ratio = 1 - (len(labeledY) / len(y_train))
    score, test_score, auc_score, f1 = _evaluate(clf, labeledX, labeledY, X_test, y_test)
    print("iteration=",0, "Training data size=", len(labeledY), "Labeld accuracy=:  %.4f" % score, " , Unlabeled ratio=:  %.4f" % unlabeled_ratio, "ACC: %.4f" % test_score, 'AUC: %.4f' % auc_score, 'F1-score: %.4f' % f1)

    i = 0
    repeat = 1  # number of consecutive rounds with an unchanged training accuracy

    while i < max_iter and score > 0.01 and repeat <= 10:
        confident = unlabeledProb > prob_threshold
        if not confident.any():
            # Nothing can be promoted (or the pool is empty): refitting on the
            # same data would only reproduce the previous round.
            break

        lastscore = score

        # Move the confidently predicted samples, with their predicted
        # labels, from the unlabelled pool into the labelled set.
        labeledX = np.vstack((labeledX, unlabeledX[confident, :]))
        labeledY = np.hstack((labeledY, unlabeledY[confident]))
        unlabeledX = unlabeledX[~confident, :]

        # Computed after the move so that it matches the training size
        # printed on the same line.
        unlabeled_ratio = 1 - (len(labeledY) / len(y_train))

        clf = model.fit(labeledX, labeledY)
        score, test_score, auc_score, f1 = _evaluate(clf, labeledX, labeledY, X_test, y_test)
        print("iteration=",i+1, "Training data size=", len(labeledY), "Labeld accuracy= %.4f" % score, " , Unlabeled ratio= %.4f" % unlabeled_ratio, "ACC: %.4f" % test_score, 'AUC: %.4f' % auc_score, 'F1-score: %.4f' % f1)

        unlabeledY, unlabeledProb = _pseudo_label(clf, unlabeledX)

        i += 1
        if lastscore == score:
            repeat += 1
        else:
            repeat = 1

def main(args):
    """Run the whole pipeline: load the data, split it, then self-train.

    Args:
        args: parsed command-line options, passed on to every stage.
    """
    frame = read_data(args)
    pieces = split(args, frame)

    # selfTraining is handed plain NumPy arrays, in the order
    # X_train, X_test, y_train, y_test.
    arrays = [np.array(piece) for piece in pieces]
    selfTraining(args, *arrays)


if __name__ == '__main__':
    # Script entry point: parse the command-line options and run the pipeline.

    main(parser.parse_args())