In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.datasets import fetch_20newsgroups
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
categories = [
             'rec.autos',
             'talk.politics.guns',
             'talk.politics.mideast',
             'rec.sport.baseball',
             'comp.sys.mac.hardware',
             'soc.religion.christian']

newsgroup_train = fetch_20newsgroups(subset = 'all',categories = categories)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
rng = np.random.RandomState(0)
indices = np.arange(len(newsgroup_train.target))
rng.shuffle(indices)

In [4]:
vectorizer = TfidfVectorizer(   stop_words = 'english',
                                max_df = 0.65,
                                ngram_range=(1,2),
                                max_features=15000)

fea_train = vectorizer.fit_transform(newsgroup_train.data)
y_train = newsgroup_train.target

In [5]:
test_num = 2000

X = fea_train[indices[:test_num ]]
y = y_train[indices[:test_num ]]
images = np.array(newsgroup_train.data)[indices[:test_num]]

n_total_samples = len(y)
n_labeled_points = 300
max_iterations = 20

unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]

In [7]:
unlabeled_indices

array([ 300,  301,  302, ..., 1997, 1998, 1999])

In [10]:
for i in range(max_iterations):
    if len(unlabeled_indices) == 0:
        print("没有待打标的候选标签项")
        break
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    lp_model = LabelSpreading(
                        gamma=0.25, 
                        kernel='knn',
                        alpha = 0.5,
                        n_neighbors =15,
                        max_iter=50,
                        n_jobs = -1
                        )
    lp_model.fit(X.toarray(), y_train)

    predicted_labels = lp_model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]

    cm = confusion_matrix(true_labels, predicted_labels,
                          labels=lp_model.classes_)

    print("【迭代轮次】 %i %s" % (i, 70 * "_"))
    print("LabelSpreading model: %d 个已标记 & %d 个未标记 (%d 个总数)"
          % (n_labeled_points, n_total_samples - n_labeled_points,
             n_total_samples))

    print(classification_report(
        true_labels, 
            predicted_labels,
            target_names = [
                     'rec.autos',
                     'talk.politics.guns',
                     'talk.politics.mideast',
                     'rec.sport.baseball',
                     'comp.sys.mac.hardware',
                     'soc.religion.christian']
            ))

    print("【混淆矩阵】")
    print(cm)

    # compute the entropies of transduced label distributions
    pred_entropies = stats.distributions.entropy(
        lp_model.label_distributions_.T)

    # select up to 10 digit examples that the classifier is most uncertain about
    uncertainty_index = np.argsort(pred_entropies)[::-1]
    uncertainty_index = uncertainty_index[
        np.in1d(uncertainty_index, unlabeled_indices)][:10]

    # keep track of indices that we get labels for
    delete_indices = np.array([], dtype=int)


    print('【最不确定样本呈现】\n',images)
    for index, image_index in enumerate(uncertainty_index):
        image = images[image_index]


        if i < max_iterations:

            print('……………'*5)
            print("预测标签: {}\n真实标签: {}".format(
                newsgroup_train.target_names[lp_model.transduction_[image_index]], newsgroup_train.target_names[y[image_index]]))
            print('******************'*5)

        # labeling 10 points, remote from labeled set
        delete_index, = np.where(unlabeled_indices == image_index)
        delete_indices = np.concatenate((delete_indices, delete_index))

    unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
    n_labeled_points += len(uncertainty_index)
    print('=========第 {} 轮结束~============'.format(i))

【迭代轮次】 0 ______________________________________________________________________
LabelSpreading model: 300 个已标记 & 1700 个未标记 (2000 个总数)
                        precision    recall  f1-score   support

             rec.autos       0.72      0.74      0.73       282
    talk.politics.guns       0.75      0.77      0.76       281
 talk.politics.mideast       0.78      0.73      0.75       281
    rec.sport.baseball       0.87      0.85      0.86       319
 comp.sys.mac.hardware       0.84      0.81      0.82       268
soc.religion.christian       0.83      0.91      0.87       269

              accuracy                           0.80      1700
             macro avg       0.80      0.80      0.80      1700
          weighted avg       0.80      0.80      0.80      1700

【混淆矩阵】
[[209  18  20  13   5  17]
 [ 30 216  13  10   7   5]
 [ 17  29 204   8  13  10]
 [ 11  14   7 270   9   8]
 [ 14   9  12   8 216   9]
 [  8   2   7   0   8 244]]
【最不确定样本呈现】
 ["From: Peter Hansen <pgmoffc@BNR.ca>\nSu